diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
index 58c5e705b2..945c9b0318 100644
--- a/.github/scripts/fbgemm_gpu_build.bash
+++ b/.github/scripts/fbgemm_gpu_build.bash
@@ -77,8 +77,12 @@ __configure_fbgemm_gpu_build_rocm () {
       echo "[BUILD] Architectures list from rocminfo: ${arch_list}"
 
       if [ "$arch_list" == "" ]; then
-        # By default, build for MI250 only to save time
-        local arch_list=gfx90a
+        echo "[BUILD] rocminfo did not return anything valid!"
+
+        # By default, we build just for MI100 and MI250 to save time.  This list
+        # needs to be updated if the CI ROCm machines have different hardware.
+        # Architecture mapping can be found at: https://wiki.gentoo.org/wiki/ROCm
+        local arch_list="gfx908,gfx90a"
       fi
     else
       echo "[BUILD] rocminfo not found in PATH!"
@@ -92,7 +96,12 @@ __configure_fbgemm_gpu_build_rocm () {
   echo "[BUILD] Setting ROCm build args ..."
   build_args=(
     --package_variant=rocm
-    -DTORCH_USE_HIP_DSA=1
+    # HIP_ROOT_DIR now required for HIP to be correctly detected by CMake
+    -DHIP_ROOT_DIR=/opt/rocm
+    # Enable device-side assertions in HIP
+    # https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line
+    -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA"
+    -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
   )
 }
 
@@ -140,6 +149,7 @@ __configure_fbgemm_gpu_build_cuda () {
   build_args=(
     --package_variant=cuda
     --nvml_lib_path="${nvml_lib_path}"
+    # Pass to PyTorch CMake
     -DTORCH_CUDA_ARCH_LIST="'${arch_list}'"
   )
 }
diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash
index 36bb802eed..0325a257dd 100644
--- a/.github/scripts/fbgemm_gpu_test.bash
+++ b/.github/scripts/fbgemm_gpu_test.bash
@@ -50,11 +50,11 @@ run_python_test () {
 run_fbgemm_gpu_tests () {
   local env_name="$1"
   local fbgemm_variant="$2"
-  if [ "$env_name" == "" ]; then
+  if [ "$fbgemm_variant" == "" ]; then
     echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]"
     echo "Example(s):"
-    echo "    ${FUNCNAME[0]} build_env        # Run all tests applicable to CUDA"
     echo "    ${FUNCNAME[0]} build_env cpu    # Run all tests applicable to CPU"
+    echo "    ${FUNCNAME[0]} build_env cuda   # Run all tests applicable to CUDA"
     echo "    ${FUNCNAME[0]} build_env rocm   # Run all tests applicable to ROCm"
     return 1
   else
@@ -71,9 +71,11 @@ run_fbgemm_gpu_tests () {
 
   # Enable ROCM testing if specified
   if [ "$fbgemm_variant" == "rocm" ]; then
-    echo "[TEST] Set environment variable FBGEMM_TEST_WITH_ROCM to enable ROCm tests ..."
+    echo "[TEST] Set environment variables for ROCm testing ..."
     # shellcheck disable=SC2086
     print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
+    # shellcheck disable=SC2086
+    print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
   fi
 
   # These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU
@@ -138,7 +140,7 @@ test_setup_conda_environment () {
   if [ "$pytorch_variant_type" == "" ]; then
     echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTHON_VERSION PYTORCH_INSTALLER PYTORCH_VERSION PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]"
     echo "Example(s):"
-    echo "    ${FUNCNAME[0]} build_env 3.10 pip test cuda 12.1.0       # Setup environment with pytorch-test for Python 3.10 + CUDA 12.1.0"
+    echo "    ${FUNCNAME[0]} build_env 3.12 pip test cuda 12.1.0       # Setup environment with pytorch-test for Python 3.12 + CUDA 12.1.0"
     return 1
   else
     echo "################################################################################"
@@ -210,8 +212,8 @@ test_fbgemm_gpu_build_and_install () {
   cd -
   install_fbgemm_gpu_wheel    "${env_name}" fbgemm_gpu/dist/*.whl             || return 1
 
-  cd fbgemm_gpu/test                        || return 1
-  run_fbgemm_gpu_tests        "${env_name}" || return 1
+  cd fbgemm_gpu/test                                                          || return 1
+  run_fbgemm_gpu_tests        "${env_name}" "${pytorch_variant_type}"         || return 1
   # shellcheck disable=SC2164
   cd -
 }
diff --git a/.github/scripts/nova_postscript.bash b/.github/scripts/nova_postscript.bash
index 4602d6bd21..19d2d4ec8d 100644
--- a/.github/scripts/nova_postscript.bash
+++ b/.github/scripts/nova_postscript.bash
@@ -20,6 +20,10 @@ echo "[NOVA] Current working directory: $(pwd)"
 # shellcheck source=.github/scripts/setup_env.bash
 . "${PRELUDE}";
 
+# Collect PyTorch environment information
+collect_pytorch_env_info "${BUILD_ENV_NAME}"
+
+# Install the wheel
 install_fbgemm_gpu_wheel "${BUILD_ENV_NAME}" fbgemm_gpu/dist/*.whl
 
 # Test with PyTest
@@ -31,3 +35,6 @@ fi
 $CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());"
 cd "${FBGEMM_REPO}/fbgemm_gpu/test" || { echo "[NOVA] Failed to cd to fbgemm_gpu/test from $(pwd)"; };
 run_fbgemm_gpu_tests "${BUILD_ENV_NAME}" "${CPU_GPU}"
+
+# Work around the "EACCES: permission denied" error at the checkout step
+chown -R 1000:1000 /__w/FBGEMM/FBGEMM/ || echo "Unable to chown 1000:1000 from $USER, uid: $(id -u)"
diff --git a/.github/scripts/nova_prescript.bash b/.github/scripts/nova_prescript.bash
index f52e3b163a..9fda9d847e 100644
--- a/.github/scripts/nova_prescript.bash
+++ b/.github/scripts/nova_prescript.bash
@@ -33,6 +33,9 @@ install_cxx_compiler "${BUILD_ENV_NAME}"
 # Install Build Tools
 install_build_tools "${BUILD_ENV_NAME}"
 
+# Collect PyTorch environment information
+collect_pytorch_env_info "${BUILD_ENV_NAME}"
+
 if [[ $CU_VERSION = cu* ]]; then
   # Extract the CUDA version number from CU_VERSION
   cuda_version=$(echo "[NOVA] ${CU_VERSION}" | cut -c 3-)
diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash
index 09b0b543d5..1bef3cd5ca 100644
--- a/.github/scripts/utils_cuda.bash
+++ b/.github/scripts/utils_cuda.bash
@@ -77,6 +77,14 @@ install_cuda () {
   # Print nvcc version
   # shellcheck disable=SC2086
   print_exec conda run ${env_prefix} nvcc --version
+
+  if which nvidia-smi; then
+    # If nvidia-smi is installed on a machine without GPUs, this will return error
+    (print_exec nvidia-smi) || true
+  else
+    echo "[CHECK] nvidia-smi not found"
+  fi
+
   echo "[INSTALL] Successfully installed CUDA ${cuda_version}"
 }
 
diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash
index 77e88a8130..a25d78c2d1 100644
--- a/.github/scripts/utils_pytorch.bash
+++ b/.github/scripts/utils_pytorch.bash
@@ -146,3 +146,38 @@ install_pytorch_pip () {
 
   echo "[INSTALL] Successfully installed PyTorch through PyTorch PIP"
 }
+
+################################################################################
+# PyTorch Diagnostic Functions
+################################################################################
+
+# Print a PyTorch environment report for Conda environment ENV_NAME ($1); returns 1 if ENV_NAME is missing, the network check fails, or the script run fails.
+collect_pytorch_env_info () {
+  local env_name="$1"
+  if [ "$env_name" == "" ]; then
+    echo "Usage: ${FUNCNAME[0]} ENV_NAME"
+    echo "Example(s):"
+    echo "    ${FUNCNAME[0]} build_env         # Collect PyTorch environment information from Conda environment build_env"
+    return 1
+  else
+    echo "################################################################################"
+    echo "# Collect PyTorch Environment Information (for Reporting Issues)"
+    echo "#"
+    echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
+    echo "################################################################################"
+    echo ""
+  fi
+
+  test_network_connection || return 1
+
+  # shellcheck disable=SC2155
+  local env_prefix=$(env_name_or_prefix "${env_name}")
+
+  # collect_env.py is the script required for collecting info when reporting to https://github.com/pytorch/pytorch/issues/new; wget saves it into the current directory, where it is run below
+  echo "[INFO] Downloading the PyTorch environment info collection script ..."
+  print_exec wget -q "https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py"
+
+  echo "[INFO] Collecting PyTorch environment info (will be needed for reporting issues to PyTorch) ..."
+  # shellcheck disable=SC2086
+  (exec_with_retries 3 conda run ${env_prefix} python collect_env.py) || return 1
+}
diff --git a/.github/scripts/utils_rocm.bash b/.github/scripts/utils_rocm.bash
index beac98d303..ff57758047 100644
--- a/.github/scripts/utils_rocm.bash
+++ b/.github/scripts/utils_rocm.bash
@@ -75,6 +75,8 @@ install_rocm_ubuntu () {
   print_exec rm -f "${package_name}"
 
   echo "[INFO] Check ROCM GPU info ..."
+  # If rocminfo is installed on a machine without GPUs, this will return error
+  (print_exec rocminfo) || true
   print_exec rocm-smi
 
   echo "[INSTALL] Successfully installed ROCm ${rocm_version}"
diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash
index d6be9707ff..b01441eccc 100644
--- a/.github/scripts/utils_system.bash
+++ b/.github/scripts/utils_system.bash
@@ -118,6 +118,12 @@ print_gpu_info () {
       return 1
     fi
   else
+    if which rocminfo; then
+      # If rocminfo is installed on a machine without GPUs, this will return error
+      (print_exec rocminfo) || true
+    else
+      echo "[CHECK] rocminfo not found"
+    fi
     if which rocm-smi; then
       # If rocm-smi is installed on a machine without GPUs, this will return error
       (print_exec rocm-smi) || true
diff --git a/.github/workflows/fbgemm_gpu_cpu_nightly.yml b/.github/workflows/fbgemm_gpu_ci_cpu.yml
similarity index 67%
rename from .github/workflows/fbgemm_gpu_cpu_nightly.yml
rename to .github/workflows/fbgemm_gpu_ci_cpu.yml
index 8c5efd66fe..d331e626b8 100644
--- a/.github/workflows/fbgemm_gpu_cpu_nightly.yml
+++ b/.github/workflows/fbgemm_gpu_ci_cpu.yml
@@ -3,7 +3,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-name: FBGEMM_GPU-CPU Nightly Build
+# This workflow is used for FBGEMM_GPU-CPU CI as well as nightly builds of
+# FBGEMM_GPU-CPU against PyTorch-CPU Nightly.
+name: FBGEMM_GPU-CPU CI
 
 on:
   # PR Trigger (enabled for regression checks and debugging)
@@ -64,7 +66,7 @@ jobs:
           { arch: x86, instance: "linux.4xlarge" },
           { arch: arm, instance: "linux.arm64.2xlarge" },
         ]
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
 
     steps:
     - name: Setup Build Container
@@ -96,10 +98,14 @@ jobs:
     - name: Install PyTorch-CPU Nightly
       run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
-    - name: Build FBGEMM_GPU Nightly (CPU version)
+    - name: Build FBGEMM_GPU Wheel
       run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly cpu
 
     - name: Upload Built Wheel as GHA Artifact
@@ -128,7 +134,7 @@ jobs:
           { arch: x86, instance: "linux.4xlarge" },
           { arch: arm, instance: "linux.arm64.2xlarge" },
         ]
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
     needs: build_artifact
 
     steps:
@@ -164,10 +170,14 @@ jobs:
     - name: Install PyTorch-CPU Nightly
       run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
-    - name: Install FBGEMM_GPU Nightly (CPU version)
+    - name: Install FBGEMM_GPU Wheel
       run: |
         . $PRELUDE
         pwd; ls -la .
@@ -177,8 +187,74 @@ jobs:
       timeout-minutes: 15
       run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
 
-    - name: Push FBGEMM_GPU Nightly (CPU version) Binary to PYPI
+    - name: Push Wheel to PyPI
       if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true') }}
       env:
         PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
       run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly_cpu-*.whl "$PYPI_TOKEN"
+
+
+  build_and_test_ubuntu:
+    runs-on: ${{ matrix.host-machine.instance }}
+    container:
+      image: ${{ matrix.container-image }}
+      options: --user root
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
+    strategy:
+      fail-fast: false
+      matrix:
+        host-machine: [
+          { arch: x86, instance: "linux.4xlarge" },
+          { arch: arm, instance: "linux.arm64.2xlarge" },
+        ]
+        container-image: [ "ubuntu:20.04", "ubuntu:22.04" ]
+        python-version: [ "3.11" ]
+
+    steps:
+    - name: Setup Build Container
+      run: |
+        apt update -y
+        apt install -y binutils build-essential git pciutils sudo wget
+        git config --global --add safe.directory '*'
+
+    - name: Checkout the Repository
+      uses: actions/checkout@v4
+      with:
+        submodules: true
+
+    - name: Display System Info
+      run: . $PRELUDE; print_system_info
+
+    - name: Display GPU Info
+      run: . $PRELUDE; print_gpu_info
+
+    - name: Setup Miniconda
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
+
+    - name: Create Conda Environment
+      run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+    - name: Install Build Tools
+      run: . $PRELUDE; install_build_tools $BUILD_ENV
+
+    - name: Install PyTorch
+      run:  . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu
+
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
+    - name: Prepare FBGEMM_GPU Build
+      run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+    - name: Build + Install FBGEMM_GPU (CPU version)
+      run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu
+
+    - name: Test FBGEMM_GPU-CPU Nightly Installation
+      timeout-minutes: 15
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_ci_cuda.yml
similarity index 90%
rename from .github/workflows/fbgemm_gpu_cuda_nightly.yml
rename to .github/workflows/fbgemm_gpu_ci_cuda.yml
index f5ed26aec3..8eb94da4ac 100644
--- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml
+++ b/.github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -3,7 +3,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-name: FBGEMM_GPU-CUDA Nightly Build
+# This workflow is used for FBGEMM_GPU-CUDA CI as well as nightly builds of
+# FBGEMM_GPU-CUDA against PyTorch-CUDA Nightly.
+name: FBGEMM_GPU-CUDA CI
 
 on:
   # PR Trigger (enabled for regression checks and debugging)
@@ -62,7 +64,7 @@ jobs:
         host-machine: [
           { arch: x86, instance: "linux.24xlarge" },
         ]
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
         cuda-version: [ "11.8.0", "12.1.1" ]
 
     steps:
@@ -99,13 +101,17 @@ jobs:
     - name: Install PyTorch Nightly
       run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }}
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Install cuDNN
       run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}
 
     - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
-    - name: Build FBGEMM_GPU Nightly
+    - name: Build FBGEMM_GPU Wheel
       run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly cuda
 
     - name: Upload Built Wheel as GHA Artifact
@@ -133,7 +139,7 @@ jobs:
         host-machine: [
           { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
         ]
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
         cuda-version: [ "11.8.0", "12.1.1" ]
         # Specify exactly ONE CUDA version for artifact publish
         cuda-version-publish: [ "12.1.1" ]
@@ -174,17 +180,21 @@ jobs:
     - name: Install PyTorch Nightly
       run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }}
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
-    - name: Install FBGEMM_GPU Nightly
+    - name: Install FBGEMM_GPU Wheel
       run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl
 
     - name: Test with PyTest
       timeout-minutes: 15
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
 
-    - name: Push FBGEMM_GPU Nightly Binary to PYPI
+    - name: Push Wheel to PyPI
       if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }}
       env:
         PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci_rocm.yml
similarity index 59%
rename from .github/workflows/fbgemm_gpu_ci.yml
rename to .github/workflows/fbgemm_gpu_ci_rocm.yml
index 43006e5a3e..ebf9c7f532 100644
--- a/.github/workflows/fbgemm_gpu_ci.yml
+++ b/.github/workflows/fbgemm_gpu_ci_rocm.yml
@@ -3,10 +3,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-name: FBGEMM_GPU CI
+# This workflow is used for FBGEMM_GPU-ROCm CI as well as nightly builds of
+# FBGEMM_GPU-ROCm against PyTorch-ROCm Nightly.
+name: FBGEMM_GPU-ROCm CI
 
 on:
-  # PR Trigger
+  # PR Trigger (enabled for regression checks and debugging)
   #
   pull_request:
     branches:
@@ -18,9 +20,23 @@ on:
     branches:
       - main
 
-  # Manual Trigger (for testing only)
+  # Cron Trigger (UTC)
+  #
+  # Based on the Conda page for PyTorch-nightly, the GPU nightly releases appear
+  # around 02:30 PST every day (roughly 2 hours after the CPU releases)
+  #
+  schedule:
+    - cron:  '45 12 * * *'
+
+  # Manual Trigger
   #
   workflow_dispatch:
+    inputs:
+      publish_to_pypi:
+        description: Publish Artifact to PyPI
+        type: boolean
+        required: false
+        default: false
 
 concurrency:
   # Cancel previous runs in the PR if a new commit is pushed
@@ -28,7 +44,8 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  build_and_test_amd:
+  # Build on CPU hosts and upload to GHA
+  build_artifact:
     runs-on: ${{ matrix.host-machine.instance }}
     container:
       image: ${{ matrix.container-image }}
@@ -43,10 +60,10 @@ jobs:
       fail-fast: false
       matrix:
         host-machine: [
-          { arch: x86, instance: "linux.12xlarge" },
+          { arch: x86, instance: "linux.24xlarge" },
         ]
         container-image: [ "ubuntu:20.04" ]
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
         rocm-version: [ "5.7" ]
 
     steps:
@@ -79,24 +96,34 @@ jobs:
     - name: Install ROCm
       run: . $PRELUDE; install_rocm_ubuntu $BUILD_ENV ${{ matrix.rocm-version }}
 
+    - name: Install C/C++ Compilers
+      run: . $PRELUDE; install_cxx_compiler $BUILD_ENV
+
     - name: Install Build Tools
       run: . $PRELUDE; install_build_tools $BUILD_ENV
 
     - name: Install PyTorch-ROCm Nightly
       run:  . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }}
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
-    - name: Build FBGEMM_GPU-ROCm Nightly
-      run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm gfx90a
+    - name: Build FBGEMM_GPU Wheel
+      run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly rocm
 
-    - name: Test FBGEMM_GPU-ROCm Nightly Installation
-      timeout-minutes: 15
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
+    - name: Upload Built Wheel as GHA Artifact
+      uses: actions/upload-artifact@v3
+      with:
+        name: fbgemm_gpu_nightly_rocm_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_rocm${{ matrix.rocm-version }}.whl
+        path: fbgemm_gpu/dist/fbgemm_gpu_nightly_rocm-*.whl
 
 
-  test_amd_gpu:
+  # Download the built artifact from GHA, test on GPU, and push to PyPI
+  test_and_publish_artifact:
     runs-on: ${{ matrix.host-machine.instance }}
     container:
       image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete"
@@ -114,9 +141,10 @@ jobs:
         host-machine: [
           { arch: x86, instance: "rocm" },
         ]
-        # ROCm machines are limited, so we only test against Python 3.10
-        python-version: [ "3.10" ]
+        # ROCm machines are limited, so we only test a subset of Python versions
+        python-version: [ "3.11", "3.12" ]
         rocm-version: [ "5.7" ]
+    needs: build_artifact
 
     steps:
     - name: Setup Build Container
@@ -126,9 +154,12 @@ jobs:
         git config --global --add safe.directory '*'
 
     - name: Checkout the Repository
-      uses: actions/checkout@v4
+      uses: actions/checkout@v3
+
+    - name: Download Wheel Artifact from GHA
+      uses: actions/download-artifact@v3
       with:
-        submodules: true
+        name: fbgemm_gpu_nightly_rocm_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_rocm${{ matrix.rocm-version }}.whl
 
     - name: Display System Info
       run: . $PRELUDE; print_system_info
@@ -151,74 +182,16 @@ jobs:
     - name: Install PyTorch-ROCm Nightly
       run:  . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }}
 
-    - name: Prepare FBGEMM_GPU Build
-      run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
-
-    - name: Build FBGEMM_GPU-ROCm Nightly
-      run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm
-
-    - name: Test FBGEMM_GPU-ROCm Nightly Installation
-      timeout-minutes: 15
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
-
-
-  build_and_test_cpu:
-    runs-on: ${{ matrix.host-machine.instance }}
-    container:
-      image: ${{ matrix.container-image }}
-      options: --user root
-    defaults:
-      run:
-        shell: bash
-    env:
-      PRELUDE: .github/scripts/setup_env.bash
-      BUILD_ENV: build_binary
-    strategy:
-      fail-fast: false
-      matrix:
-        host-machine: [
-          { arch: x86, instance: "linux.4xlarge" },
-          { arch: arm, instance: "linux.arm64.2xlarge" },
-        ]
-        container-image: [ "ubuntu:20.04", "ubuntu:22.04" ]
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
-
-    steps:
-    - name: Setup Build Container
-      run: |
-        apt update -y
-        apt install -y binutils build-essential git pciutils sudo wget
-        git config --global --add safe.directory '*'
-
-    - name: Checkout the Repository
-      uses: actions/checkout@v4
-      with:
-        submodules: true
-
-    - name: Display System Info
-      run: . $PRELUDE; print_system_info
-
-    - name: Display GPU Info
-      run: . $PRELUDE; print_gpu_info
-
-    - name: Setup Miniconda
-      run: . $PRELUDE; setup_miniconda $HOME/miniconda
-
-    - name: Create Conda Environment
-      run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
-
-    - name: Install Build Tools
-      run: . $PRELUDE; install_build_tools $BUILD_ENV
-
-    - name: Install PyTorch
-      run:  . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
 
     - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
-    - name: Build + Install FBGEMM_GPU (CPU version)
-      run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu
+    - name: Install FBGEMM_GPU Wheel
+      run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl
 
-    - name: Test FBGEMM_GPU-CPU Nightly Installation
+    - name: Test with PyTest
       timeout-minutes: 15
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
diff --git a/.github/workflows/fbgemm_gpu_docs.yml b/.github/workflows/fbgemm_gpu_docs.yml
index d3a69bca5b..0dc3d6e890 100644
--- a/.github/workflows/fbgemm_gpu_docs.yml
+++ b/.github/workflows/fbgemm_gpu_docs.yml
@@ -72,6 +72,10 @@ jobs:
     - name: Install PyTorch-CPU Nightly
       run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml
index d9f6dc2ff6..f1e433cc3c 100644
--- a/.github/workflows/fbgemm_gpu_pip.yml
+++ b/.github/workflows/fbgemm_gpu_pip.yml
@@ -3,6 +3,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# This workflow is used for testing the download and installation of FBGEMM_GPU
+# nightly releases published to PyTorch PyPI.
 name: FBGEMM_GPU PIP Install + Test
 
 on:
@@ -59,7 +61,7 @@ jobs:
           { instance: "linux.4xlarge" },
           { instance: "linux.arm64.2xlarge" },
         ]
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
 
     steps:
     - name: Setup Build Container
@@ -86,6 +88,10 @@ jobs:
     - name: Install PyTorch-CPU
       run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_version || 'nightly' }} cpu
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Install FBGEMM_GPU-CPU
       run: . $PRELUDE; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version || 'nightly' }} cpu
 
@@ -110,7 +116,7 @@ jobs:
         host-machine: [
           { instance: "linux.g5.4xlarge.nvidia.gpu" },
         ]
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
         cuda-version: [ "11.8.0", "12.1.1" ]
         # Specify exactly ONE CUDA version for artifact publish
         cuda-version-publish: [ "11.8.0" ]
@@ -144,12 +150,16 @@ jobs:
     - name: Install PyTorch-CUDA
       run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_version || 'nightly' }} cuda ${{ matrix.cuda-version }}
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Install FBGEMM_GPU-CUDA
       run: . $PRELUDE; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version || 'nightly' }} cuda ${{ matrix.cuda-version }}
 
     - name: Test with PyTest
       timeout-minutes: 15
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
 
 
   test_pypi_install_rocm:
@@ -169,10 +179,10 @@ jobs:
       fail-fast: false
       matrix:
         host-machine: [
-          { instance: "rocm" },
+          { arch: x86, instance: "rocm" },
         ]
-        # ROCm machines are limited, so we only test against Python 3.10
-        python-version: [ "3.10" ]
+        # ROCm machines are limited, so we only test a subset of Python versions
+        python-version: [ "3.11", "3.12" ]
         rocm-version: [ "5.7" ]
 
     steps:
@@ -206,6 +216,10 @@ jobs:
     - name: Install PyTorch-ROCm
       run:  . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_version || 'nightly' }} rocm ${{ matrix.rocm-version }}
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Install FBGEMM_GPU-ROCm
       run: . $PRELUDE; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version || 'nightly' }} rocm ${{ matrix.rocm-version }}
 
diff --git a/.github/workflows/fbgemm_gpu_cpu_release.yml b/.github/workflows/fbgemm_gpu_release_cpu.yml
similarity index 89%
rename from .github/workflows/fbgemm_gpu_cpu_release.yml
rename to .github/workflows/fbgemm_gpu_release_cpu.yml
index aba87df783..213164cc59 100644
--- a/.github/workflows/fbgemm_gpu_cpu_release.yml
+++ b/.github/workflows/fbgemm_gpu_release_cpu.yml
@@ -3,6 +3,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# This workflow is used for building and publishing FBGEMM_GPU-CPU release
+# builds against PyTorch-CPU Release to public PyPI.
 name: FBGEMM_GPU-CPU Release Build
 
 on:
@@ -10,13 +12,13 @@ on:
   #
   pull_request:
     branches:
-      - main
+      - v[0-9]+.[0-9]+.[0-9]+-release   # GitHub branch filters are glob patterns, not regexes
 
   # Push Trigger (enable to catch errors coming out of multiple merges)
   #
   push:
     branches:
-      - main
+      - v[0-9]+.[0-9]+.[0-9]+-release   # GitHub branch filters are glob patterns, not regexes
 
   # Manual Trigger
   #
@@ -61,7 +63,7 @@ jobs:
           { arch: x86, instance: "linux.4xlarge" },
           { arch: arm, instance: "linux.arm64.2xlarge" },
         ]
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
 
     steps:
     - name: Setup Build Container
@@ -93,6 +95,10 @@ jobs:
     - name: Install PyTorch-CPU Test
       run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_channel || 'test' }} cpu
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
@@ -125,7 +131,7 @@ jobs:
           { arch: x86, instance: "linux.4xlarge" },
           { arch: arm, instance: "linux.arm64.2xlarge" },
         ]
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
     needs: build_artifact
 
     steps:
@@ -157,6 +163,10 @@ jobs:
     - name: Install PyTorch Test
       run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_channel || 'test' }} cpu
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
diff --git a/.github/workflows/fbgemm_gpu_cuda_release.yml b/.github/workflows/fbgemm_gpu_release_cuda.yml
similarity index 89%
rename from .github/workflows/fbgemm_gpu_cuda_release.yml
rename to .github/workflows/fbgemm_gpu_release_cuda.yml
index 74b79a88dc..72f42db605 100644
--- a/.github/workflows/fbgemm_gpu_cuda_release.yml
+++ b/.github/workflows/fbgemm_gpu_release_cuda.yml
@@ -3,6 +3,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# This workflow is used for building and publishing FBGEMM_GPU-CUDA release
+# builds against PyTorch-CUDA Release to public PyPI.
 name: FBGEMM_GPU-CUDA Release Build
 
 on:
@@ -10,13 +12,13 @@ on:
   #
   pull_request:
     branches:
-      - main
+      - 'v[0-9]+.[0-9]+.[0-9]+-release'  # GitHub branch filters are globs, not regexes
 
   # Push Trigger (enable to catch errors coming out of multiple merges)
   #
   push:
     branches:
-      - main
+      - 'v[0-9]+.[0-9]+.[0-9]+-release'  # GitHub branch filters are globs, not regexes
 
   # Manual Trigger
   #
@@ -66,7 +68,7 @@ jobs:
         host-machine: [
           { arch: x86, instance: "linux.24xlarge" },
         ]
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
         cuda-version: [ "11.8.0", "12.1.1" ]
 
     steps:
@@ -102,6 +104,10 @@ jobs:
     - name: Install PyTorch Test
       run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_channel || 'test' }} cuda ${{ matrix.cuda-version }}
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Install cuDNN
       run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}
 
@@ -134,7 +140,7 @@ jobs:
         host-machine: [
           { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
         ]
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
         cuda-version: [ "11.8.0", "12.1.1" ]
     needs: build_artifact
 
@@ -170,6 +176,10 @@ jobs:
     - name: Install PyTorch Test
       run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_channel || 'test' }} cuda ${{ matrix.cuda-version }}
 
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run:  . $PRELUDE; collect_pytorch_env_info $BUILD_ENV
+
     - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
@@ -178,7 +188,7 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: 15
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
 
     - name: Push FBGEMM_GPU Binary to PYPI
       if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == github.event.inputs.cuda_version }}
diff --git a/fbgemm_gpu/bench/merge_embeddings_benchmark.py b/fbgemm_gpu/bench/merge_embeddings_benchmark.py
index d7f574d6f7..8059c85541 100644
--- a/fbgemm_gpu/bench/merge_embeddings_benchmark.py
+++ b/fbgemm_gpu/bench/merge_embeddings_benchmark.py
@@ -499,7 +499,6 @@ def main(
         "output size (MB), all-to-one BW (GB/s), link BW (GB/s), t (ms)"
     )
     if sweep:
-
         # pyre-fixme[3]: Return type must be annotated.
         # pyre-fixme[2]: Parameter must be annotated.
         def handler(signum, frame):
diff --git a/fbgemm_gpu/bench/sparse_ops_benchmark.py b/fbgemm_gpu/bench/sparse_ops_benchmark.py
index a578f3f40a..0602d0ae82 100644
--- a/fbgemm_gpu/bench/sparse_ops_benchmark.py
+++ b/fbgemm_gpu/bench/sparse_ops_benchmark.py
@@ -878,7 +878,6 @@ def ben(fn, name, ad_indices, ad_lengths, batch_offsets, num_ads_in_batch):
 def block_bucketize_sparse_features_bench(
     row_size: int, batch_size: int, bucket_num: int, input_precision: str, device: str
 ) -> None:
-
     dtype = torch.int
     if input_precision == "int":
         dtype = torch.int
diff --git a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
index 143b8a0e3d..cb7d30a817 100644
--- a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
+++ b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
@@ -785,6 +785,7 @@ def benchmark_cpu_requests(
 @click.option("--output-dtype", type=SparseType, default=SparseType.FP16)
 @click.option("--fp8-exponent-bits", type=int, default=None)
 @click.option("--fp8-exponent-bias", type=int, default=None)
+@click.option("--pooling", type=str, default="sum")
 def nbit_cpu(  # noqa C901
     alpha: float,
     bag_size: int,
@@ -807,6 +808,7 @@ def nbit_cpu(  # noqa C901
     output_dtype: SparseType,
     fp8_exponent_bits: Optional[int],
     fp8_exponent_bias: Optional[int],
+    pooling: str,
 ) -> None:
     np.random.seed(42)
     torch.manual_seed(42)
@@ -825,11 +827,23 @@ def nbit_cpu(  # noqa C901
     else:
         Ds = [D] * T
 
+    if pooling is None or pooling == "sum":
+        pooling = "sum"
+        pooling_mode = PoolingMode.SUM
+        do_pooling = True
+    elif pooling == "mean":
+        pooling_mode = PoolingMode.MEAN
+        do_pooling = True
+    else:  # "none"
+        pooling_mode = PoolingMode.NONE
+        do_pooling = False
+
     emb = IntNBitTableBatchedEmbeddingBagsCodegen(
         [("", E, d, weights_precision, EmbeddingLocation.HOST) for d in Ds],
         device="cpu",
         index_remapping=[torch.arange(E) for _ in Ds] if index_remapping else None,
         output_dtype=output_dtype,
+        pooling_mode=pooling_mode,
         fp8_exponent_bits=fp8_exponent_bits,
         fp8_exponent_bias=fp8_exponent_bias,
     ).cpu()
@@ -839,9 +853,16 @@ def nbit_cpu(  # noqa C901
     nparams_byte = sum(w.numel() for (w, _) in emb.split_embedding_weights())
     param_size_multiplier = weights_precision.bit_rate() / 8.0
     output_size_multiplier = output_dtype.bit_rate() / 8.0
-    read_write_bytes = (
-        output_size_multiplier * B * T * D + param_size_multiplier * B * T * L * D
-    )
+    if do_pooling:
+        read_write_bytes = (
+            output_size_multiplier * B * T * D + param_size_multiplier * B * T * L * D
+        )
+    else:
+        read_write_bytes = (
+            output_size_multiplier * B * T * L * D
+            + param_size_multiplier * B * T * L * D
+        )
+
     logging.info(
         f"{weights_precision} Embedding tables: {E * T} rows, {nparams_byte / param_size_multiplier / 1.0e9: .2f} GParam, "
         f"{nparams_byte / 1.0e9: .2f} GB"  # IntN TBE use byte for storage
diff --git a/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp
index fbdbcf45b4..5f0f445f2f 100644
--- a/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp
+++ b/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp
@@ -403,9 +403,9 @@ for (const auto d : c10::irange(D)) {
 
 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   {% if not dense %}
-  m.def("split_embedding_backward_codegen_{{ optimizer }}_cpu(Tensor grad_output, Tensor(a!) host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets,int pooling_mode, Tensor indice_weights, bool stochastic_rounding, {{ (args.split_function_args | join(", ")).replace("double", "float").replace("int64_t", "int")}}, int output_dtype = 0) -> ()");
+  m.def("split_embedding_backward_codegen_{{ optimizer }}_cpu(Tensor grad_output, Tensor(a!) host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets,int pooling_mode, Tensor indice_weights, bool stochastic_rounding, {{ (args.split_function_args | join(", ")).replace("double", "float").replace("int64_t", "int").replace("Tensor momentum1_host", "Tensor(b!) momentum1_host")}}, int output_dtype = 0) -> ()");
   {% else %}
-  m.def("split_embedding_backward_codegen_{{ optimizer }}_cpu(Tensor grad_output, Tensor(a!) host_weights, Tensor weights_offsets, Tensor D_offsets, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets,int pooling_mode, Tensor indice_weights, {{ (args.split_function_args | join(", ")).replace("double", "float").replace("int64_t", "int")}}) -> Tensor");
+  m.def("split_embedding_backward_codegen_{{ optimizer }}_cpu(Tensor grad_output, Tensor(a!) host_weights, Tensor weights_offsets, Tensor D_offsets, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets,int pooling_mode, Tensor indice_weights, {{ (args.split_function_args | join(", ")).replace("double", "float").replace("int64_t", "int").replace("Tensor momentum1_host", "Tensor(b!) momentum1_host")}}) -> Tensor");
   {% endif %}
   DISPATCH_TO_CPU("split_embedding_backward_codegen_{{ optimizer }}_cpu", split_embedding_backward_codegen_{{ optimizer }}_cpu);
 }
diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp
index 2791dd3c12..2efef10f82 100644
--- a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp
+++ b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp
@@ -10,6 +10,7 @@
 #include <ATen/TypeDefault.h>
 #include <ATen/core/op_registration/op_registration.h>
 #include <torch/script.h>
+#include "fbgemm_gpu/dispatch_macros.h"
 #include "fbgemm_gpu/embedding_common.h"
 #include "fbgemm_gpu/sparse_ops_utils.h"
 
@@ -169,7 +170,8 @@ TORCH_LIBRARY_FRAGMENT(fb, m) {
   // The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd
   // or DCE'd, etc.
   m.def(
-      "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? B_offsets=None, int max_B=-1) -> ()");
+      "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? B_offsets=None, int max_B=-1) -> ()",
+      {PT2_COMPLIANT_TAG});
   DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu);
 }
 
diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp
index 652fc894cf..6540dfc6ca 100644
--- a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp
+++ b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp
@@ -200,6 +200,13 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
         const float* indice_weights_acc = indice_weights.data_ptr<float>();
         {% endif %}
 
+        using float16 = uint16_t;
+        using bfloat16 = uint16_t;
+        using fbgemm_out_t = typename std::conditional<
+            std::is_same<output_t, at::Half>::value,
+            float16,
+            std::conditional<std::is_same<output_t, at::BFloat16>::value, bfloat16, float>::type >::type;
+
         AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_", [&] {
             const auto* indices_acc = indices.data_ptr<index_t>();
             const auto* offsets_acc = offsets.data_ptr<index_t>();
@@ -208,10 +215,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
 
             auto* output_acc = output.data_ptr<output_t>();
             int32_t num_indices_m_1 = indices.numel() - 1;
-
             int32_t D_start_ = 0;
-for (const auto t : c10::irange(T)) {
 
+            for (const auto t : c10::irange(T)) {
                 {% if not nobag %}
                 const auto* D_offsets_acc = D_offsets.data_ptr<int32_t>();
                 const int32_t D_start = D_offsets_acc[t];
@@ -226,164 +232,100 @@ for (const auto t : c10::irange(T)) {
                 const auto& weight_tensor = (placement == PlacementType::HOST) ? dev_weights : uvm_weights;
                 weights_acc = weight_tensor.data_ptr<uint8_t>();
                 const uint8_t* weights = &weights_acc[weights_offsets_acc[t]];
-                auto weight_ty = static_cast<SparseType>(weights_tys_acc[t]);
+                const auto weight_ty = static_cast<SparseType>(weights_tys_acc[t]);
                 // default to 1 byte alignment for CPU TBE
                 const int32_t D_bytes = nbit::padded_row_size_in_bytes(D, weight_ty, row_alignment);
 
                 int tt;
                 for (tt = t + 1; tt < T && weights_offsets_acc[tt] == weights_offsets_acc[t]; ++tt);
-                size_t num_rows = ((tt == T ? weight_tensor.numel() : weights_offsets_acc[tt]) - weights_offsets_acc[t]) / D_bytes;
+                const size_t num_rows = ((tt == T ? weight_tensor.numel() : weights_offsets_acc[tt]) - weights_offsets_acc[t]) / D_bytes;
                 const index_t* offsets_begin_ptr = offsets_acc + t * B;
 
-                using float16 = uint16_t;
-                using bfloat16 = uint16_t;
-                using fbgemm_out_t = typename std::conditional<
-                    std::is_same<output_t, at::Half>::value,
-                    float16,
-                    std::conditional<std::is_same<output_t, at::BFloat16>::value, bfloat16, float>::type >::type;
-
                 bool success = true;
-                bool has_weight = {{ "true" if weighted else "false" }};
-                bool normalize_by_lengths = static_cast<PoolingMode>(pooling_mode) == PoolingMode::MEAN;
+                const bool has_weight = {{ "true" if weighted else "false" }};
+                const bool normalize_by_lengths = static_cast<PoolingMode>(pooling_mode) == PoolingMode::MEAN;
+
+                const index_t index_size = offsets_acc[(t + 1) * B] - *offsets_begin_ptr;
+
+                {% if nobag %}
+                // Create virtual offsets for the nobag case. Lengths are all ones.
+                const auto offsets_nobag = at::arange(*offsets_begin_ptr, offsets_acc[(t + 1) * B] + 1, offsets.options());
+                const index_t* offsets_nobag_ptr = offsets_nobag.data_ptr<index_t>();
+                TORCH_CHECK(offsets_nobag.numel() == index_size + 1);
+                TORCH_CHECK(offsets_nobag_ptr[index_size] - offsets_nobag_ptr[0] == index_size);
+                {% endif %}
 
-                index_t index_size = offsets_acc[(t + 1) * B] - *offsets_begin_ptr;
                 const float* indice_weights_ptr = nullptr;
                 {% if weighted %}
                 indice_weights_ptr = indice_weights_acc + *offsets_begin_ptr;
                 {% endif %}
+
+                {% macro generate_and_exec_kernel(weight_type, use_base, use_nbit, use_fp8) %}
+                {% set has_asmjit = use_base or use_nbit %}
+                {% set kernel_name = "GenerateEmbeddingSpMDMWithStrides"
+                    if use_base else ("GenerateEmbeddingSpMDMNBitWithStrides"
+                    if use_nbit else "GenerateEmbeddingSpMDMFP8WithStrides")
+                 %}
+                const auto kernel = fbgemm::{{ kernel_name }}<
+                    {% if use_base %}
+                    {{ weight_type }},
+                    {% endif %}
+                    index_t,
+                    index_t,
+                    {% if has_asmjit %}
+                    fbgemm_out_t,
+                    /*THREAD_LOCAL=*/true
+                    {% else %}
+                    fbgemm_out_t
+                    {% endif %}
+                >(
+                    {% if use_nbit %}
+                    /*bit_rate=*/bit_rate,
+                    {% endif %}
+                    D,
+                    {% if has_asmjit %}
+                    has_weight,
+                    {% endif %}
+                    normalize_by_lengths,
+                    {% if has_asmjit %}
+                    /*prefetch=*/16,
+                    {% endif %}
+                    /*is_weight_positional=*/false,
+                    /*use_offsets=*/true,
+                    /*output_stride=*/{{ "total_D" if not nobag else "D" }},
+                    /*input_stride=*/D_bytes / sizeof({{ weight_type }}),
+                    {% if use_fp8 %}
+                    /*exponent_bits=*/fp8_exponent_bits,
+                    /*exponent_bias=*/fp8_exponent_bias,
+                    {% endif %}
+                    {% if has_asmjit %}
+                    /*scale_bias_last=*/false,
+                    {% endif %}
+                    {% if use_base %}
+                    /*no_bag=*/false,
+                    {% endif %}
+                    /*is_bf16_out=*/output_is_bf16
+                );
+                success = kernel(
+                    {{ "B" if not nobag else "index_size"}},
+                    index_size,
+                    num_rows,
+                    reinterpret_cast<const {{ weight_type }}*>(weights),
+                    indices_acc + *offsets_begin_ptr,
+                    {{ "offsets_begin_ptr" if not nobag else "offsets_nobag_ptr" }},
+                    indice_weights_ptr,
+                    reinterpret_cast<fbgemm_out_t*>(output_acc + D_start));
+                {% endmacro %}
+
                 if (weight_ty == SparseType::FP32) {
-                    auto kernel = fbgemm::GenerateEmbeddingSpMDMWithStrides<float, index_t, index_t, fbgemm_out_t, /*THREAD_LOCAL=*/true>(
-                        D,
-                        has_weight,
-                        normalize_by_lengths,
-                        /*prefetch=*/16,
-                        /*is_weight_positional=*/false,
-                        /*use_offsets=*/true,
-                        {% if not nobag %}
-                        /*output_stride=*/total_D,
-                        {% else %}
-                        /*output_stride=*/D,
-                        {% endif %}
-                        /*input_stride=*/D_bytes / sizeof(float),
-                        {% if not nobag %}
-                        /*scale_bias_last=*/false,
-                        /*no_bag=*/false,
-                        /*is_bf16_out=*/output_is_bf16);
-                        {% else %}
-                        /*scale_bias_last=*/false,
-                        /*no_bag=*/true,
-                        /*is_bf16_out=*/output_is_bf16);
-                        {% endif %}
-                    success = kernel(
-                        {% if not nobag %}
-                        B,
-                        {% else %}
-                        index_size,
-                        {% endif %}
-                        index_size,
-                        num_rows,
-                        reinterpret_cast<const float*>(weights),
-                        indices_acc + *offsets_begin_ptr,
-                        offsets_begin_ptr,
-                        indice_weights_ptr,
-                        reinterpret_cast<fbgemm_out_t*>(output_acc + D_start));
+                    {{ generate_and_exec_kernel("float", True, False, False) }}
                 } else if (weight_ty == SparseType::FP16) {
-                    auto kernel = fbgemm::GenerateEmbeddingSpMDMWithStrides<float16, index_t, index_t, fbgemm_out_t, /*THREAD_LOCAL=*/true>(
-                        D,
-                        has_weight,
-                        normalize_by_lengths,
-                        /*prefetch=*/16,
-                        /*is_weight_positional=*/false,
-                        /*use_offsets=*/true,
-                        {% if not nobag %}
-                        /*output_stride=*/total_D,
-                        {% else %}
-                        /*output_stride=*/D,
-                        {% endif %}
-                        /*input_stride=*/D_bytes / sizeof(float16),
-                        {% if not nobag %}
-                        /*scale_bias_last=*/false,
-                        /*no_bag=*/false,
-                        /*is_bf16_out=*/output_is_bf16);
-                        {% else %}
-                        /*scale_bias_last=*/false,
-                        /*no_bag=*/true,
-                        /*is_bf16_out=*/output_is_bf16);
-                        {% endif %}
-                    success = kernel(
-                        {% if not nobag %}
-                        B,
-                        {% else %}
-                        index_size,
-                        {% endif %}
-                        index_size,
-                        num_rows,
-                        reinterpret_cast<const float16*>(weights),
-                        indices_acc + *offsets_begin_ptr,
-                        offsets_begin_ptr,
-                        indice_weights_ptr,
-                        reinterpret_cast<fbgemm_out_t*>(output_acc + D_start));
+                    {{ generate_and_exec_kernel("float16", True, False, False) }}
+                } else if (weight_ty == SparseType::INT8) {
+                    {{ generate_and_exec_kernel("uint8_t", True, False, False) }}
                 } else if (weight_ty == SparseType::FP8) {
                     assert(fp8_exponent_bits > 0 && fp8_exponent_bias > 0);
-                    auto kernel = fbgemm::GenerateEmbeddingSpMDMFP8WithStrides<index_t, index_t, fbgemm_out_t>(
-                        D,
-                        normalize_by_lengths,
-                        /*is_weight_positional=*/false,
-                        /*use_offsets=*/true,
-                        {% if not nobag %}
-                        /*output_stride=*/total_D,
-                        {% else %}
-                        /*output_stride=*/D,
-                        {% endif %}
-                        /*input_stride=*/D_bytes / sizeof(uint8_t),
-                        /*exponent_bits=*/fp8_exponent_bits,
-                        /*exponent_bias=*/fp8_exponent_bias,
-                        /*is_bf16_out=*/output_is_bf16);
-                    success = kernel(
-                        B,
-                        index_size,
-                        num_rows,
-                        weights,
-                        indices_acc + *offsets_begin_ptr,
-                        offsets_begin_ptr,
-                        indice_weights_ptr,
-                        reinterpret_cast<fbgemm_out_t*>(output_acc + D_start));
-                } else if (weight_ty == SparseType::INT8) {
-                    auto kernel = fbgemm::GenerateEmbeddingSpMDMWithStrides<uint8_t, index_t, index_t, fbgemm_out_t, /*THREAD_LOCAL=*/true>(
-                        D,
-                        has_weight,
-                        normalize_by_lengths,
-                        /*prefetch=*/16,
-                        /*is_weight_positional=*/false,
-                        /*use_offsets=*/true,
-                        {% if not nobag %}
-                        /*output_stride=*/total_D,
-                        {% else %}
-                        /*output_stride=*/D,
-                        {% endif %}
-                        /*input_stride=*/D_bytes / sizeof(uint8_t),
-                        {% if not nobag %}
-                        /*scale_bias_last=*/false,
-                        /*no_bag=*/false,
-                        /*is_bf16_out=*/output_is_bf16);
-                        {% else %}
-                        /*scale_bias_last=*/false,
-                        /*no_bag=*/true,
-                        /*is_bf16_out=*/output_is_bf16);
-                        {% endif %}
-                    success = kernel(
-                        {% if not nobag %}
-                        B,
-                        {% else %}
-                        index_size,
-                        {% endif %}
-                        index_size,
-                        num_rows,
-                        weights,
-                        indices_acc + *offsets_begin_ptr,
-                        offsets_begin_ptr,
-                        indice_weights_ptr,
-                        reinterpret_cast<fbgemm_out_t*>(output_acc + D_start));
+                    {{ generate_and_exec_kernel("uint8_t", False, False, True) }}
                 } else if (weight_ty == SparseType::INT4 || weight_ty == SparseType::INT2) {
                     int bit_rate;
                     switch (weight_ty) {
@@ -394,35 +336,13 @@ for (const auto t : c10::irange(T)) {
                           bit_rate = 2;
                           break;
                         default:
-                          throw std::logic_error("Unsupported SparseType: " + std::to_string(static_cast<int>(weight_ty)));
+                          throw std::logic_error(
+                              "Unsupported SparseType: " + std::to_string(static_cast<int>(weight_ty)));
                     }
-                    auto kernel = fbgemm::GenerateEmbeddingSpMDMNBitWithStrides<index_t, index_t, fbgemm_out_t, /*THREAD_LOCAL=*/true>(
-                        /*bit_rate=*/bit_rate,
-                        D,
-                        has_weight,
-                        normalize_by_lengths,
-                        /*prefetch=*/16,
-                        /*is_weight_positional=*/false,
-                        /*use_offsets=*/true,
-                        {% if not nobag %}
-                        /*output_stride=*/total_D,
-                        {% else %}
-                        /*output_stride=*/D,
-                        {% endif %}
-                        /*input_stride=*/D_bytes / sizeof(uint8_t),
-                        /*scale_bias_last=*/false,
-                        /*is_bf16_out=*/output_is_bf16);
-                    success = kernel(
-                        B,
-                        index_size,
-                        num_rows,
-                        weights,
-                        indices_acc + *offsets_begin_ptr,
-                        offsets_begin_ptr,
-                        indice_weights_ptr,
-                        reinterpret_cast<fbgemm_out_t*>(output_acc + D_start));
+                    {{ generate_and_exec_kernel("uint8_t", False, True, False) }}
                 } else {
-                    throw std::logic_error("Unsupported SparseType: " + std::to_string(static_cast<int>(weight_ty)));
+                    throw std::logic_error(
+                        "Unsupported SparseType: " + std::to_string(static_cast<int>(weight_ty)));
                 }
                 if (!success) {
                     fbgemm_gpu::report_embedding_error(
diff --git a/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp b/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp
index b440652b28..7d14b52664 100644
--- a/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp
+++ b/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp
@@ -234,6 +234,42 @@ Tensor split_embedding_codegen_forward_cpu(
   return output;
 }
 
+Tensor split_embedding_codegen_forward_cpu_meta(
+    Tensor weights,
+    Tensor weights_offsets,
+    Tensor D_offsets,
+    int64_t total_D,
+    Tensor hash_size_cumsum,
+    Tensor indices,
+    Tensor offsets,
+    int64_t pooling_mode,
+    Tensor indice_weights,
+    int64_t output_dtype) {
+  c10::SymInt T = D_offsets.sym_numel() - 1;
+  TORCH_CHECK_GT(T, 0);
+  // offsets = [T x B  + 1]
+  c10::SymInt B = (offsets.sym_size(0) - 1) / T;
+  TORCH_CHECK_GE(B, 0);
+
+  Tensor output;
+  if (output_dtype == static_cast<int64_t>(SparseType::FP32)) {
+    output =
+        at::empty_symint({B, total_D}, weights.options().dtype(at::kFloat));
+  } else if (output_dtype == static_cast<int64_t>(SparseType::FP16)) {
+    output = at::empty_symint({B, total_D}, weights.options().dtype(at::kHalf));
+  } else if (output_dtype == static_cast<int64_t>(SparseType::BF16)) {
+    output =
+        at::empty_symint({B, total_D}, weights.options().dtype(at::kBFloat16));
+  } else {
+    output = at::empty_symint({B, total_D}, weights.options());
+  }
+
+  // It is assumed that the indice_weights will always be float
+  TORCH_CHECK(
+      !indice_weights.defined() || indice_weights.scalar_type() != at::kHalf);
+  return output;
+}
+
 template <typename weights_t, typename grad_t>
 void split_embedding_grad_indice_weights_cpu_kernel(
     Tensor grad_output,
@@ -632,4 +668,10 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
       split_embedding_codegen_forward_cpu);
 }
 
+TORCH_LIBRARY_IMPL(fbgemm, Meta, m) {
+  m.impl(
+      "split_embedding_codegen_forward_cpu",
+      &split_embedding_codegen_forward_cpu_meta);
+}
+
 } // namespace
diff --git a/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp b/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp
index 0c8c930ecd..f9067f0a88 100644
--- a/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp
+++ b/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp
@@ -179,6 +179,10 @@ Tensor
         return output;
     }
 
+    {%- if not nobag and vbe %}
+    output = output.reshape({-1});
+    {%- endif %}
+
     return output;
 }
 
diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py
index 0979959a53..c6bfa60e0a 100644
--- a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py
+++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py
@@ -326,20 +326,6 @@ def merge_pooled_embeddings(
     )
 
 
-@impl_abstract("fbgemm::bounds_check_indices")
-def bounds_check_indices(
-    rows_per_table: torch.Tensor,
-    indices: torch.Tensor,
-    offsets: torch.Tensor,
-    bounds_check_mode: int,
-    warning: torch.Tensor,
-    weights: Optional[torch.Tensor] = None,
-    B_offsets: Optional[torch.Tensor] = None,
-    max_B: int = -1,
-) -> None:
-    pass
-
-
 @impl_abstract("fbgemm::permute_sparse_features")
 def permute_sparse_features_abstract(
     permute: Tensor, lengths: Tensor, indices: Tensor, weights: Optional[Tensor] = None
@@ -371,3 +357,31 @@ def segment_sum_csr_abstract(
     output_size = csr_seg.numel() - 1
     output = values.new_empty(output_size)
     return output
+
+
+@impl_abstract("fbgemm::dense_to_jagged_forward")
+def dense_to_jagged_forward(
+    dense: torch.Tensor,
+    offsets: List[torch.Tensor],
+    total_L: Optional[torch.SymInt] = None,
+) -> torch.Tensor:
+    if total_L is None:  # only None means unknown; avoids bool() on a SymInt
+        total_L = torch.library.get_ctx().new_dynamic_size()
+    return dense.new_zeros(
+        total_L,
+        dense.size()[-1],
+        dtype=dense.dtype,
+        device=dense.device,
+        layout=dense.layout,
+    )
+
+
+@impl_abstract("fbgemm::dense_to_jagged")
+def dense_to_jagged(
+    dense: torch.Tensor,
+    offsets: List[torch.Tensor],
+    total_L: Optional[torch.SymInt] = None,
+) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+    if total_L is None:  # only None means unknown; avoids bool() on a SymInt
+        total_L = torch.library.get_ctx().new_dynamic_size()
+    return (dense_to_jagged_forward(dense, offsets, total_L), offsets)
diff --git a/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h b/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h
index 834a226ce4..05f391e597 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h
+++ b/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h
@@ -203,8 +203,10 @@
       TYPE, NAME, FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16_CASE(__VA_ARGS__))
 
 // We can cleanup the following once fbgemm uses PyTorch 2.2 in January 2024.
+#ifndef PT2_COMPLIANT_TAG
 #ifdef HAS_PT2_COMPLIANT_TAG
 #define PT2_COMPLIANT_TAG at::Tag::pt2_compliant_tag
 #else
 #define PT2_COMPLIANT_TAG
 #endif
+#endif
diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py
index cb7ab36475..deee823190 100644
--- a/fbgemm_gpu/setup.py
+++ b/fbgemm_gpu/setup.py
@@ -393,18 +393,17 @@ def main(argv: List[str]) -> None:
         cmdclass={
             "install": FbgemmGpuInstaller,
         },
-        # PyPI package information.
+        # PyPI package information
         classifiers=[
             "Development Status :: 4 - Beta",
             "Intended Audience :: Developers",
             "Intended Audience :: Science/Research",
             "License :: OSI Approved :: BSD License",
-            "Programming Language :: Python :: 3",
-            "Programming Language :: Python :: 3.8",
-            "Programming Language :: Python :: 3.9",
-            "Programming Language :: Python :: 3.10",
-            "Programming Language :: Python :: 3.11",
             "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        ]
+        + [
+            f"Programming Language :: Python :: {x}"
+            for x in ["3", "3.8", "3.9", "3.10", "3.11", "3.12"]
         ],
     )
 
diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
index 5a1753b239..fb5ba53798 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
+++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
@@ -1635,6 +1635,9 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   // SymInt is a new PyTorch 2.0 feature to support dynamic shape. See more
   // details at https://pytorch.org/get-started/pytorch-2.0/#dynamic-shapes. If
   // you find it doesn't compile, please pull the new PyTorch 2.0 code
+  m.impl_abstract_pystub(
+      "fbgemm_gpu.sparse_ops",
+      "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py");
   m.def(
       "dense_to_jagged(Tensor dense, Tensor[] x_offsets, SymInt? total_L=None) -> (Tensor, Tensor[])",
       {PT2_COMPLIANT_TAG});
diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp
index b9e249cb90..fabcd6455b 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp
+++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp
@@ -92,28 +92,6 @@ Tensor jagged_dense_elementwise_add_meta(
   return at::empty_like(y);
 }
 
-Tensor dense_to_jagged_forward_meta(
-    const Tensor& dense,
-    const std::vector<Tensor>& offsets,
-    c10::optional<at::SymInt> total_L) {
-  auto dense_values = dense;
-  at::SymInt D = dense_values.sym_size(-1);
-  TORCH_CHECK_NOT_IMPLEMENTED(
-      total_L.has_value(), "total_L is required for meta backend");
-  auto& total_L_computed = total_L.value();
-  auto values = at::zeros_symint({total_L_computed, D}, dense_values.options());
-
-  TORCH_CHECK(values.is_meta());
-  return values;
-}
-
-std::tuple<Tensor, std::vector<Tensor>> dense_to_jagged_meta(
-    const Tensor& dense,
-    const std::vector<Tensor>& offsets,
-    c10::optional<at::SymInt> total_L) {
-  return {dense_to_jagged_forward_meta(dense, offsets, total_L), offsets};
-}
-
 std::tuple<Tensor, std::vector<Tensor>> jagged_dense_elementwise_mul_meta(
     const Tensor& x_values,
     const std::vector<Tensor>& x_offsets,
@@ -241,10 +219,6 @@ TORCH_LIBRARY_IMPL(fbgemm, Meta, m) {
   m.impl(
       "jagged_to_padded_dense_backward",
       TORCH_FN(fbgemm_gpu::jagged_to_padded_dense_backward_meta));
-  m.impl(
-      "dense_to_jagged_forward",
-      TORCH_FN(fbgemm_gpu::dense_to_jagged_forward_meta));
-  m.impl("dense_to_jagged", TORCH_FN(fbgemm_gpu::dense_to_jagged_meta));
   m.impl(
       "jagged_dense_dense_elementwise_add_jagged_output_forward",
       TORCH_FN(
diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp
index a6ff0d5dce..09f5c011ad 100644
--- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp
+++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp
@@ -148,7 +148,8 @@ at::Tensor permute_pooled_embs_auto_grad_meta(
 
 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def(
-      "permute_pooled_embs(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor");
+      "permute_pooled_embs(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor",
+      {PT2_COMPLIANT_TAG});
   m.def(
       "permute_pooled_embs_auto_grad(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor",
       {PT2_COMPLIANT_TAG});
diff --git a/fbgemm_gpu/src/sparse_ops/common.cuh b/fbgemm_gpu/src/sparse_ops/common.cuh
index 5cfca60e23..021736a675 100644
--- a/fbgemm_gpu/src/sparse_ops/common.cuh
+++ b/fbgemm_gpu/src/sparse_ops/common.cuh
@@ -32,7 +32,7 @@
 #include "fbgemm_gpu/split_embeddings_utils.cuh"
 
 #ifdef USE_ROCM
-#include <hipblas.h>
+#include <hipblas/hipblas.h>
 #endif
 
 #ifdef USE_ROCM
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
index ae17a393c5..7f3922417c 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
+++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
@@ -2769,7 +2769,8 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def(
       "generic_histogram_binning_calibration_by_feature(Tensor logit, Tensor segment_value, Tensor segment_lengths, SymInt num_segments, Tensor bin_num_examples, Tensor bin_num_positives, Tensor bin_boundaries, float positive_weight, SymInt bin_ctr_in_use_after, float bin_ctr_weight_value) -> (Tensor, Tensor)");
   m.def(
-      "segment_sum_csr(SymInt batch_size, Tensor csr_seg, Tensor values) -> Tensor");
+      "segment_sum_csr(SymInt batch_size, Tensor csr_seg, Tensor values) -> Tensor",
+      {PT2_COMPLIANT_TAG});
   m.def(
       "embedding_bag_rowwise_prune(Tensor weight, Tensor indicator, float threshold, ScalarType compressed_indices_dtype, bool abs=True, SymInt min_num_rows=0, float? min_save_ratio=1.0) -> (Tensor, Tensor)");
   m.def("lengths_range(Tensor t_in, SymInt[]? shape=None) -> Tensor");
diff --git a/fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils.cpp b/fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils.cpp
index e464c879d3..38195ece18 100644
--- a/fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils.cpp
+++ b/fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils.cpp
@@ -12,6 +12,30 @@
 #include <ATen/ATen.h>
 #include <torch/library.h>
 
+using Tensor = at::Tensor;
+using namespace fbgemm_gpu;
+
+namespace {
+
+// Meta kernel: allocates both outputs with total_B elements; computes nothing.
+std::tuple<Tensor /*row_output_offsets*/, Tensor /*b_t_map*/>
+generate_vbe_metadata_meta(
+    const Tensor& B_offsets,
+    const Tensor& B_offsets_rank_per_feature,
+    const Tensor& output_offsets_feature_rank,
+    const Tensor& D_offsets,
+    const int64_t D,
+    const bool nobag,
+    const int64_t max_B_feature_rank,
+    const int64_t info_B_num_bits,
+    const c10::SymInt total_B) {
+  return {
+      at::empty_symint({total_B}, output_offsets_feature_rank.options()),
+      at::empty_symint({total_B}, B_offsets.options())};
+}
+
+} // namespace
+
 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def(
       "transpose_embedding_input("
@@ -40,9 +64,13 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
       "    bool nobag, "
       "    int max_B_feature_rank, "
       "    int info_B_num_bits, "
-      "    int total_B"
+      "    SymInt total_B"
       ") -> (Tensor, Tensor)");
   DISPATCH_TO_CUDA("transpose_embedding_input", transpose_embedding_input);
   DISPATCH_TO_CUDA("get_infos_metadata", get_infos_metadata);
   DISPATCH_TO_CUDA("generate_vbe_metadata", generate_vbe_metadata);
 }
+
+TORCH_LIBRARY_IMPL(fbgemm, Meta, m) {
+  m.impl("generate_vbe_metadata", &generate_vbe_metadata_meta);
+}
diff --git a/fbgemm_gpu/test/batched_unary_embeddings_test.py b/fbgemm_gpu/test/batched_unary_embeddings_test.py
index 3d63aff90f..1577a11f3b 100644
--- a/fbgemm_gpu/test/batched_unary_embeddings_test.py
+++ b/fbgemm_gpu/test/batched_unary_embeddings_test.py
@@ -7,9 +7,10 @@
 
 
 import random
+import sys
 import unittest
 from math import sqrt
-from typing import List, Tuple
+from typing import Callable, List, Tuple
 
 import fbgemm_gpu.batched_unary_embeddings_ops as batched_unary_embeddings_ops
 import numpy as np
@@ -45,6 +46,15 @@
 }
 
 
+# pyre-fixme[2]
+# pyre-fixme[24]
+def torch_compiled(model: Callable, **kwargs) -> Callable:
+    """Return torch.compile(model, **kwargs); on Python 3.12+ return model as-is."""
+    if sys.version_info >= (3, 12, 0):
+        return model
+    return torch.compile(model, **kwargs)
+
+
 class TableBatchedEmbeddingsTest(unittest.TestCase):
     class RefEmb(torch.nn.Module):
         def __init__(self, num_tasks: int, hash_sizes: List[int]) -> None:
@@ -147,7 +157,7 @@ def _test_main(
             param.detach().copy_(ref_emb.emb_modules[i].weight)
         output_ref = ref_emb(offsets, indices)
         if torch_compile:
-            unary_emb = torch.compile(unary_emb, dynamic=True, fullgraph=True)
+            unary_emb = torch_compiled(unary_emb, dynamic=True, fullgraph=True)
         output = unary_emb(offsets_tensor, indices_tensor)
         torch.testing.assert_close(
             output_ref,
@@ -169,7 +179,7 @@ def _test_main(
             param.detach().copy_(ref_emb.emb_modules[i].weight)
         output_ref = ref_emb(offsets, indices)
         if torch_compile:
-            unary_emb = torch.compile(unary_emb, dynamic=True, fullgraph=True)
+            unary_emb = torch_compiled(unary_emb, dynamic=True, fullgraph=True)
         output = unary_emb(offsets_tensor.long(), indices_tensor.long())
         torch.testing.assert_close(
             output_ref,
diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json
index 43efa968c9..eaf0ba2389 100644
--- a/fbgemm_gpu/test/failures_dict.json
+++ b/fbgemm_gpu/test/failures_dict.json
@@ -115,40 +115,7 @@
         "status": "xfail"
       }
     },
-    "fbgemm::dense_to_jagged": {
-      "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_meta_backend": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_opt": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_opt_large_batch": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_meta_backend": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_opt": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_opt_large_batch": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
+    "fbgemm::dense_to_jagged": {},
     "fbgemm::expand_into_jagged_permute": {},
     "fbgemm::generic_histogram_binning_calibration_by_feature": {
       "SparseOpsTest.test_aot_dispatch_dynamic__test_generic_histogram_binning_calibration_by_feature": {
@@ -438,6 +405,18 @@
     },
     "fbgemm::permute_1D_sparse_data": {},
     "fbgemm::permute_2D_sparse_data": {},
+    "fbgemm::permute_duplicate_pooled_embs_auto_grad": {
+      "PooledEmbeddingModulesTest.test_aot_dispatch_dynamic__test_duplicate_permutations": {
+        "comment": "",
+        "status": "xfail"
+      },
+      "PooledEmbeddingModulesTest.test_faketensor__test_duplicate_permutations": {
+        "comment": "",
+        "status": "xfail"
+      }
+    },
+    "fbgemm::permute_pooled_embs": {},
+    "fbgemm::permute_pooled_embs_auto_grad": {},
     "fbgemm::permute_sequence_embeddings": {
       "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_embeddings": {
         "comment": "",
diff --git a/fbgemm_gpu/test/failures_dict_fast.json b/fbgemm_gpu/test/failures_dict_fast.json
index cec2a1adc7..5676072e0b 100644
--- a/fbgemm_gpu/test/failures_dict_fast.json
+++ b/fbgemm_gpu/test/failures_dict_fast.json
@@ -35,10 +35,6 @@
         "comment": "",
         "status": "xfail"
       },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_cpu_bf16_out": {
-        "comment": "",
-        "status": "xfail"
-      },
       "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache": {
         "comment": "",
         "status": "xfail"
@@ -75,10 +71,6 @@
         "comment": "",
         "status": "xfail"
       },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_cpu_bf16_out": {
-        "comment": "",
-        "status": "xfail"
-      },
       "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache": {
         "comment": "",
         "status": "xfail"
@@ -88,120 +80,7 @@
         "status": "xfail"
       }
     },
-    "fbgemm::bounds_check_indices": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_none": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_none_with_rowwise_adagrad": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adam": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_lamb": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_lars": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_bounds_check": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_miss_counter": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_fused_pooled_emb_quant": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_cache_miss_counter": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_fused_pooled_emb_quant": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_uvm_cache_stats": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_stb_uvm_cache_stats": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
+    "fbgemm::bounds_check_indices": {},
     "fbgemm::dense_embedding_codegen_lookup_function": {
       "SplitTableBatchedEmbeddingsTest.test_autograd_registration__test_backward_dense": {
         "comment": "",
@@ -212,16 +91,7 @@
         "status": "xfail"
       }
     },
-    "fbgemm::direct_mapped_lru_cache_populate_byte": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
+    "fbgemm::direct_mapped_lru_cache_populate_byte": {},
     "fbgemm::direct_mapped_lxu_cache_lookup": {
       "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": {
         "comment": "",
@@ -232,12 +102,7 @@
         "status": "xfail"
       }
     },
-    "fbgemm::emb_inplace_update": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_embedding_inplace_update": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
+    "fbgemm::emb_inplace_update": {},
     "fbgemm::get_unique_indices": {
       "SplitTableBatchedEmbeddingsTest.test_faketensor__test_unique_lxu_cache_lookup": {
         "comment": "",
@@ -470,80 +335,9 @@
         "status": "skip"
       }
     },
-    "fbgemm::lru_cache_populate_byte": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_cache_miss_counter": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_uvm_cache_stats": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
-    "fbgemm::lxu_cache_flush": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
-    "fbgemm::lxu_cache_locking_counter_decrement": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_lxu_cache_locking_counter_decrement": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
+    "fbgemm::lru_cache_populate_byte": {},
+    "fbgemm::lxu_cache_flush": {},
+    "fbgemm::lxu_cache_locking_counter_decrement": {},
     "fbgemm::lxu_cache_lookup": {
       "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": {
         "comment": "",
@@ -671,20 +465,7 @@
         "status": "xfail"
       }
     },
-    "fbgemm::pruned_hashmap_insert": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_pruning": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
+    "fbgemm::pruned_hashmap_insert": {},
     "fbgemm::pruned_hashmap_lookup": {
       "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": {
         "comment": "",
@@ -706,115 +487,17 @@
       }
     },
     "fbgemm::split_embedding_codegen_lookup_adagrad_function": {},
-    "fbgemm::split_embedding_codegen_lookup_adagrad_function_cpu": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
-    "fbgemm::split_embedding_codegen_lookup_adam_function": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adam": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
+    "fbgemm::split_embedding_codegen_lookup_adagrad_function_cpu": {},
+    "fbgemm::split_embedding_codegen_lookup_adam_function": {},
     "fbgemm::split_embedding_codegen_lookup_lamb_function": {},
     "fbgemm::split_embedding_codegen_lookup_lars_sgd_function": {},
     "fbgemm::split_embedding_codegen_lookup_none_function": {},
     "fbgemm::split_embedding_codegen_lookup_partial_rowwise_adam_function": {},
     "fbgemm::split_embedding_codegen_lookup_partial_rowwise_lamb_function": {},
-    "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": {
-        "comment": "",
-        "status": "skip"
-      }
-    },
-    "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function_cpu": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": {
-        "comment": "",
-        "status": "skip"
-      }
-    },
+    "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function": {},
+    "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function_cpu": {},
     "fbgemm::split_embedding_codegen_lookup_rowwise_weighted_adagrad_function": {},
-    "fbgemm::split_embedding_codegen_lookup_sgd_function": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": {
-        "comment": "",
-        "status": "skip"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": {
-        "comment": "",
-        "status": "skip"
-      }
-    },
-    "fbgemm::split_embedding_codegen_lookup_sgd_function_cpu": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": {
-        "comment": "",
-        "status": "xfail"
-      }
-    }
+    "fbgemm::split_embedding_codegen_lookup_sgd_function": {},
+    "fbgemm::split_embedding_codegen_lookup_sgd_function_cpu": {}
   }
 }
diff --git a/fbgemm_gpu/test/input_combine_test.py b/fbgemm_gpu/test/input_combine_test.py
index db1f0e8928..32e1dd2465 100644
--- a/fbgemm_gpu/test/input_combine_test.py
+++ b/fbgemm_gpu/test/input_combine_test.py
@@ -132,10 +132,6 @@ def forward(  # noqa C901
 # skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json
 # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
 additional_decorators: Dict[str, List[Callable]] = {
-    "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [
-        # This operator has been grandfathered in. We need to fix this test failure.
-        unittest.expectedFailure,
-    ],
     "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [
         # This operator has been grandfathered in. We need to fix this test failure.
         unittest.expectedFailure,
diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py
index 8465490282..01e4333db4 100644
--- a/fbgemm_gpu/test/jagged_tensor_ops_test.py
+++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py
@@ -9,6 +9,7 @@
 
 import itertools
 import random
+import sys
 import unittest
 from typing import Callable, Dict, List, Tuple
 
@@ -27,7 +28,6 @@
         gpu_available,
         gpu_unavailable,
         gradcheck,
-        on_arm_platform,
         optests,
         symint_vector_unsupported,
         TEST_WITH_ROCM,
@@ -40,7 +40,6 @@
         gpu_available,
         gpu_unavailable,
         gradcheck,
-        on_arm_platform,
         optests,
         symint_vector_unsupported,
         TEST_WITH_ROCM,
@@ -127,15 +126,20 @@ def hash_size_cumsum_to_offsets(hash_size_cum_sum_list: List[int]) -> List[int]:
     return hash_size_offsets_list
 
 
+# pyre-fixme[2]
+# pyre-fixme[24]
+def torch_compiled(model: Callable, **kwargs) -> Callable:
+    """Apply torch.compile(**kwargs) to model, unless running on Python 3.12+."""
+    running_on_312_plus = sys.version_info >= (3, 12, 0)
+    compiled = model if running_on_312_plus else torch.compile(model, **kwargs)
+    return compiled
+
+
 # e.g. "test_faketensor__test_cumsum": [unittest.expectedFailure]
 # Please avoid putting tests here, you should put operator-specific
 # skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json
 # pyre-ignore[24]: Generic type `Callable` expects 2 type parameters.
 additional_decorators: Dict[str, List[Callable]] = {
-    "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [
-        # This operator has been grandfathered in. We need to fix this test failure.
-        unittest.expectedFailure,
-    ],
     "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [
         # This operator has been grandfathered in. We need to fix this test failure.
         unittest.expectedFailure,
@@ -381,7 +385,7 @@ def test_jagged_2d_to_dense_dynamic_shape(
         values = ref_values.clone().to(dtype).detach().requires_grad_(True)
         offsets = offsets.to(device_type)
         ref_output_values = ref_output_values.to(device_type)
-        output_values = torch.compile(
+        output_values = torch_compiled(
             torch.ops.fbgemm.jagged_2d_to_dense, dynamic=True, fullgraph=True
         )(
             values=values,
@@ -597,7 +601,7 @@ def lengths_to_segment_ids(lengths: torch.Tensor) -> torch.Tensor:
         values = ref_values.clone().detach().requires_grad_(False)
         offsets = offsets.to(device_type)
         ref_output_values = ref_output_values.to(device_type)
-        output_values = torch.compile(
+        output_values = torch_compiled(
             torch.ops.fbgemm.jagged_1d_to_dense, dynamic=True, fullgraph=True
         )(
             values=values,
@@ -977,9 +981,10 @@ def test_dense_to_jagged_dynamic_shape(
         )
         values_2d = values_2d.clone().detach().requires_grad_(True)
 
-        @torch.compile(fullgraph=True, dynamic=True)
         def jagged_to_dense(
-            values: torch.Tensor, offsets: torch.Tensor, max_lengths: List[int]
+            values: torch.Tensor,
+            offsets: List[torch.LongTensor],
+            max_lengths: List[int],
         ) -> torch.Tensor:
             return torch.ops.fbgemm.jagged_to_padded_dense(values, offsets, max_lengths)
 
@@ -993,15 +998,13 @@ def jagged_to_dense(
         torch._dynamo.mark_dynamic(dense, 0)
         torch._dynamo.mark_dynamic(dense, -1)
 
-        @torch.compile(fullgraph=True, dynamic=True)
         def dense_to_jagged_withL(
-            dense: torch.Tensor, offsets: torch.Tensor, total_L: List[int]
+            dense: torch.Tensor, offsets: List[torch.LongTensor], total_L: List[int]
         ) -> Tuple[torch.Tensor, torch.Tensor]:
             return torch.ops.fbgemm.dense_to_jagged(dense, offsets, total_L)
 
-        @torch.compile(fullgraph=False, dynamic=True)
         def dense_to_jagged_noL(
-            dense: torch.Tensor, offsets: torch.Tensor
+            dense: torch.Tensor, offsets: List[torch.LongTensor]
         ) -> Tuple[torch.Tensor, torch.Tensor]:
             return torch.ops.fbgemm.dense_to_jagged(dense, offsets)
 
@@ -1325,24 +1328,21 @@ def test_jagged_elementwise_binary_dynamic_shape(
 
         x_padded = self._to_padded_dense(x_values, x_offsets, max_lengths)
 
-        @torch.compile(fullgraph=True, dynamic=True)
         def jagged_dense_elementwise_add(
-            x_values: torch.Tensor, x_offsets: torch.Tensor, y: torch.Tensor
+            x_values: torch.Tensor, x_offsets: List[torch.LongTensor], y: torch.Tensor
         ) -> torch.Tensor:
             return torch.ops.fbgemm.jagged_dense_elementwise_add(x_values, x_offsets, y)
 
-        @torch.compile(fullgraph=True, dynamic=True)
         def jagged_dense_elementwise_add_jagged_output(
-            x_values: torch.Tensor, x_offsets: torch.Tensor, y: torch.Tensor
-        ) -> Tuple[torch.Tensor, torch.Tensor]:
+            x_values: torch.Tensor, x_offsets: List[torch.LongTensor], y: torch.Tensor
+        ) -> Tuple[torch.Tensor, List[torch.LongTensor]]:
             return torch.ops.fbgemm.jagged_dense_elementwise_add_jagged_output(
                 x_values, x_offsets, y
             )
 
-        @torch.compile(fullgraph=True, dynamic=True)
         def jagged_dense_elementwise_mul(
-            x_values: torch.Tensor, x_offsets: torch.Tensor, y: torch.Tensor
-        ) -> Tuple[torch.Tensor, torch.Tensor]:
+            x_values: torch.Tensor, x_offsets: List[torch.LongTensor], y: torch.Tensor
+        ) -> Tuple[torch.Tensor, List[torch.LongTensor]]:
             return torch.ops.fbgemm.jagged_dense_elementwise_mul(x_values, x_offsets, y)
 
         if operation == "add":
@@ -1614,7 +1614,7 @@ def test_jagged_dense_dense_elementwise_add_jagged_output_dynamic_shape(
         )
         output_ref = x_padded + y_0 + y_1
         x_values.to(device_type)
-        (output, output_offsets) = torch.compile(
+        (output, output_offsets) = torch_compiled(
             torch.ops.fbgemm.jagged_dense_dense_elementwise_add_jagged_output,
             fullgraph=True,
             dynamic=True,
@@ -1825,7 +1825,7 @@ def test_batched_dense_vec_jagged_2d_mul_dynamic_shape(
         torch._dynamo.mark_dynamic(values, 1)
         torch._dynamo.mark_dynamic(offsets, 0)
 
-        output = torch.compile(
+        output = torch_compiled(
             torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul,
             fullgraph=True,
             dynamic=True,
@@ -2363,7 +2363,6 @@ def test_jagged_softmax(
         if gpu_available
         else st.just("cpu"),
     )
-    @unittest.skipIf(*on_arm_platform)
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
     def test_jagged_jagged_bmm(
         self,
@@ -2429,7 +2428,6 @@ def test_jagged_jagged_bmm(
         if gpu_available
         else st.just("cpu"),
     )
-    @unittest.skipIf(*on_arm_platform)
     @settings(verbosity=Verbosity.verbose, max_examples=2, deadline=None)
     def test_jagged_dense_bmm(
         self,
@@ -2492,7 +2490,6 @@ def test_jagged_dense_bmm(
         dtype=st.sampled_from([torch.float, torch.double]),
         device_type=st.just("cpu"),
     )
-    @unittest.skipIf(*on_arm_platform)
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
     def test_jagged_dense_bmm_dynamic_shape(
         self,
@@ -2521,7 +2518,7 @@ def test_jagged_dense_bmm_dynamic_shape(
         torch._dynamo.mark_dynamic(x_values, 1)
         torch._dynamo.mark_dynamic(lengths, 0)  # offsets = lengths + 1
 
-        output, _ = torch.compile(
+        output, _ = torch_compiled(
             torch.ops.fbgemm.jagged_dense_bmm, fullgraph=True, dynamic=True
         )(
             x_values,
diff --git a/fbgemm_gpu/test/permute_pooled_embedding_test.py b/fbgemm_gpu/test/permute_pooled_embedding_test.py
index 3723ee76c4..ff7575477b 100644
--- a/fbgemm_gpu/test/permute_pooled_embedding_test.py
+++ b/fbgemm_gpu/test/permute_pooled_embedding_test.py
@@ -8,9 +8,10 @@
 import sys
 import unittest
 from itertools import accumulate
-from typing import List, Tuple
+from typing import Any, Callable, Dict, List, Tuple
 
 import fbgemm_gpu
+import hypothesis.strategies as st
 import torch
 import torch._dynamo
 from fbgemm_gpu.permute_pooled_embedding_modules import PermutePooledEmbeddings
@@ -20,12 +21,13 @@
 # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
 if getattr(fbgemm_gpu, "open_source", False):
     # pyre-ignore[21]
-    from test_utils import cpu_and_maybe_gpu, gpu_unavailable, on_arm_platform
+    from test_utils import cpu_and_maybe_gpu, gpu_unavailable, on_arm_platform, optests
 else:
     from fbgemm_gpu.test.test_utils import (
         cpu_and_maybe_gpu,
         gpu_unavailable,
         on_arm_platform,
+        optests,
     )
 
 typed_gpu_unavailable: Tuple[bool, str] = gpu_unavailable
@@ -66,12 +68,29 @@
 )
 
 
+class PermutePooledEmbeddingsFwdOnly(PermutePooledEmbeddings):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+
+    def __call__(self, pooled_embs: torch.Tensor) -> torch.Tensor:
+        result = torch.ops.fbgemm.permute_pooled_embs(
+            pooled_embs,
+            self._offset_dim_list.to(device=pooled_embs.device),
+            self._permute.to(device=pooled_embs.device),
+            self._inv_offset_dim_list.to(device=pooled_embs.device),
+            self._inv_permute.to(device=pooled_embs.device),
+        )
+        return result
+
+
 class Net(torch.nn.Module):
-    def __init__(self) -> None:
+    def __init__(self, fwd_only: bool = False) -> None:
         super(Net, self).__init__()
         self.fc1 = torch.nn.Linear(1, 10, bias=False)
-        self.permute_pooled_embeddings = PermutePooledEmbeddings(
-            [2, 3, 1, 4], [3, 0, 2, 1]
+        op_cls = PermutePooledEmbeddingsFwdOnly if fwd_only else PermutePooledEmbeddings
+        self.permute_pooled_embeddings: PermutePooledEmbeddings = op_cls(
+            [2, 3, 1, 4],
+            [3, 0, 2, 1],
         )
         self.fc2 = torch.nn.Linear(10, 1, bias=False)
 
@@ -82,7 +101,24 @@ def forward(self, x: Tensor) -> Tensor:
         return x
 
 
+# e.g. "test_faketensor__test_cumsum": [unittest.expectedFailure]
+# Please avoid putting tests here; instead, put operator-specific
+# skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json
+# pyre-ignore[24]: Generic type `Callable` expects 2 type parameters.
+additional_decorators: Dict[str, List[Callable]] = {
+    "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [
+        # This operator has been grandfathered in. We need to fix this test failure.
+        unittest.expectedFailure,
+    ],
+    "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add_jagged_output": [
+        # This operator has been grandfathered in. We need to fix this test failure.
+        unittest.expectedFailure,
+    ],
+}
+
+
 # @parameterized_class([{"device_type": "cpu"}, {"device_type": "cuda"}])
+@optests.generate_opcheck_tests(additional_decorators=additional_decorators)
 class PooledEmbeddingModulesTest(unittest.TestCase):
     @settings(deadline=10000, suppress_health_check=suppressed_list)
     # pyre-fixme[56]: Pyre was not able to infer the type of argument
@@ -90,8 +126,11 @@ class PooledEmbeddingModulesTest(unittest.TestCase):
     def setUp(self, device_type: torch.device) -> None:
         self.device = device_type
 
-    def test_permutation(self) -> None:
-        net = Net().to(self.device)
+    @settings(deadline=500)
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    @given(fwd_only=st.booleans())
+    def test_permutation(self, fwd_only: bool) -> None:
+        net = Net(fwd_only=fwd_only).to(self.device)
 
         input = torch.Tensor([range(10)]).to(self.device)
         self.assertEqual(
diff --git a/fbgemm_gpu/test/quantize_ops_test.py b/fbgemm_gpu/test/quantize_ops_test.py
index 493f35f9f8..2b1d6dcde5 100644
--- a/fbgemm_gpu/test/quantize_ops_test.py
+++ b/fbgemm_gpu/test/quantize_ops_test.py
@@ -7,6 +7,7 @@
 import logging
 import os
 import random
+import sys
 import unittest
 from ctypes import c_float, c_int32, cast, POINTER, pointer
 from typing import Callable, Dict, List, Tuple
@@ -1006,10 +1007,6 @@ def test_quantize_and_dequantize_op_cuda_large_nrows_bf16(
 # skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json
 # pyre-ignore[24]: Generic type `Callable` expects 2 type parameters.
 additional_decorators: Dict[str, List[Callable]] = {
-    "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [
-        # This operator has been grandfathered in. We need to fix this test failure.
-        unittest.expectedFailure,
-    ],
     "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [
         # This operator has been grandfathered in. We need to fix this test failure.
         unittest.expectedFailure,
@@ -1084,7 +1081,7 @@ def test_quantize_and_dequantize_op_fp8_rowwise(
                 dynamic=True,
                 fullgraph=True,
             )
-            if test_compile
+            if test_compile and sys.version_info < (3, 12, 0)
             else torch.ops.fbgemm.FP8RowwiseQuantizedToFloat
         )
 
diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py
index d06b7988d0..8f64837016 100644
--- a/fbgemm_gpu/test/sparse_ops_test.py
+++ b/fbgemm_gpu/test/sparse_ops_test.py
@@ -13,6 +13,7 @@
 import logging
 import os
 import random
+import sys
 import unittest
 from itertools import accumulate
 from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type, Union
@@ -103,6 +104,15 @@ def permute_scripted(
     )
 
 
+# pyre-fixme[2]
+# pyre-fixme[24]
+def torch_compiled(model: Callable, **kwargs) -> Callable:
+    if sys.version_info < (3, 12, 0):
+        return torch.compile(model, **kwargs)
+    else:
+        return model
+
+
 class SparseOpsTest(unittest.TestCase):
     @staticmethod
     @settings(suppress_health_check=suppressed_list)
@@ -2018,7 +2028,7 @@ def test_pack_segments(
             pack_segments_fun = torch.ops.fbgemm.pack_segments
 
             if torch_compile:
-                pack_segments_fun = torch.compile(pack_segments_fun, dynamic=True)
+                pack_segments_fun = torch_compiled(pack_segments_fun, dynamic=True)
 
             packed_cuda = pack_segments_fun(
                 t_in=input_data.cuda(),
@@ -2114,7 +2124,7 @@ def test_pack_segments_smaller_max_len(
         if gpu_available:
             pack_segments_fun = torch.ops.fbgemm.pack_segments
             if torch_compile:
-                pack_segments_fun = torch.compile(pack_segments_fun)
+                pack_segments_fun = torch_compiled(pack_segments_fun)
 
             packed_cuda = pack_segments_fun(
                 t_in=input_data.cuda(),
@@ -2721,10 +2731,6 @@ def test_permute_sparse_features_with_repeats(
     "test_faketensor__test_index_select_dim0": [unittest.skip("hangs")],
     "test_autograd_registration__test_index_select_dim0": [unittest.skip("hangs")],
     "test_schema__test_index_select_dim0": [unittest.skip("hangs")],
-    "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [
-        # This operator has been grandfathered in. We need to fix this test failure.
-        unittest.expectedFailure,
-    ],
     "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [
         # This operator has been grandfathered in. We need to fix this test failure.
         unittest.expectedFailure,
diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py
index ce8e41e630..037de61815 100644
--- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py
+++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py
@@ -4526,6 +4526,12 @@ def execute_nbit_forward_(  # noqa C901
         nbit_weights_ty=get_nbit_weights_ty(),
         use_array_for_index_remapping=st.booleans(),
         do_pruning=st.booleans(),
+        pooling_mode=st.sampled_from(
+            [PoolingMode.SUM, PoolingMode.NONE, PoolingMode.MEAN]
+        ),
+        output_dtype=st.sampled_from(
+            [SparseType.FP32, SparseType.FP16, SparseType.BF16]
+        ),
     )
     @settings(
         verbosity=VERBOSITY,
@@ -4537,6 +4543,8 @@ def test_nbit_forward_cpu(
         nbit_weights_ty: Optional[SparseType],
         use_array_for_index_remapping: bool,
         do_pruning: bool,
+        pooling_mode: PoolingMode,
+        output_dtype: SparseType,
     ) -> None:
         use_cpu = True
         T = random.randint(1, 50)
@@ -4549,27 +4557,7 @@ def test_nbit_forward_cpu(
         # cache_algorithm is don't care as we don't use cache.
         cache_algorithm = CacheAlgorithm.LRU
 
-        pooling_mode = random.choice(
-            [
-                PoolingMode.SUM,
-                PoolingMode.MEAN,
-                PoolingMode.NONE,
-            ]
-        )
         mixed = random.choice([True, False])
-        if pooling_mode == PoolingMode.NONE:
-            nbit_weights_ty = random.choice(
-                [
-                    SparseType.FP32,
-                    SparseType.FP16,
-                    # CPU sequence embedding does not support FP8/INT4/INT2 yet
-                    # SparseType.FP8,
-                    SparseType.INT8,
-                    # SparseType.INT4,
-                    # SparseType.INT2,
-                ]
-            )
-
         if pooling_mode == PoolingMode.SUM:
             weighted = random.choice([True, False])
         else:
@@ -4582,81 +4570,7 @@ def test_nbit_forward_cpu(
         else:
             weights_ty: SparseType = nbit_weights_ty
             mixed_weights_ty = False
-        output_dtype = random.choice(
-            (
-                [SparseType.BF16]
-                if weights_ty in [SparseType.INT4, SparseType.INT2]
-                else []
-            )
-            + [SparseType.FP32, SparseType.FP16]
-        )
-        self.execute_nbit_forward_(
-            T,
-            D,
-            B,
-            log_E,
-            L,
-            weighted,
-            mixed,
-            pooling_mode,
-            weights_ty,
-            use_cache,
-            cache_algorithm,
-            use_cpu,
-            use_array_for_index_remapping,
-            do_pruning,
-            mixed_weights_ty,
-            output_dtype,
-        )
-
-    @given(
-        nbit_weights_ty=get_nbit_weights_ty(),
-        use_array_for_index_remapping=st.booleans(),
-        do_pruning=st.booleans(),
-    )
-    @settings(
-        verbosity=VERBOSITY,
-        max_examples=MAX_EXAMPLES_LONG_RUNNING,
-        deadline=None,
-    )
-    def test_nbit_forward_cpu_bf16_out(
-        self,
-        nbit_weights_ty: Optional[SparseType],
-        use_array_for_index_remapping: bool,
-        do_pruning: bool,
-    ) -> None:
-        use_cpu = True
-        T = random.randint(1, 50)
-        B = random.randint(0, 128)
-        L = random.randint(0, 32)
-        D = random.randint(2, 2048)
-        log_E = random.randint(2, 4)
-
-        use_cache = False
-        # cache_algorithm is don't care as we don't use cache.
-        cache_algorithm = CacheAlgorithm.LRU
-
-        pooling_mode = random.choice(
-            [
-                PoolingMode.SUM,
-                PoolingMode.MEAN,
-            ]
-        )
-        mixed = random.choice([True, False])
-
-        if pooling_mode == PoolingMode.SUM:
-            weighted = random.choice([True, False])
-        else:
-            weighted = False
 
-        if nbit_weights_ty is None:
-            # don't care when mixed type is used.
-            weights_ty: SparseType = SparseType.INT8
-            mixed_weights_ty = True
-        else:
-            weights_ty: SparseType = nbit_weights_ty
-            mixed_weights_ty = False
-        output_dtype = SparseType.BF16
         self.execute_nbit_forward_(
             T,
             D,
diff --git a/src/DirectConv.h b/src/DirectConv.h
index ded8c2c62d..e10597e759 100644
--- a/src/DirectConv.h
+++ b/src/DirectConv.h
@@ -224,4 +224,4 @@ CodeCache<
     typename DirectConvCodeGenBase<TA, TB, TC, accT>::jit_micro_kernel_fp_convT>
     DirectConvCodeGenBase<TA, TB, TC, accT>::codeCacheT_;
 
-}; // namespace fbgemm
+} // namespace fbgemm