From 0da4234ad0e79bbf08bd6b796cf7494fb82f3154 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Thu, 31 Aug 2023 12:17:49 -0700 Subject: [PATCH 01/94] Enable FBGEMM_GPU_MEMCHECK on TBE cache ops (#1977) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1977 As titled This is helpful for code debugging. Reviewed By: q10 Differential Revision: D48727799 fbshipit-source-id: 1820b9dc18909600a1586cd2ab5789ab53a7d6d2 --- fbgemm_gpu/src/split_embeddings_cache_cuda.cu | 706 +++++++++--------- 1 file changed, 358 insertions(+), 348 deletions(-) diff --git a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu index fa7d94e262..9eb21b6e60 100644 --- a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu +++ b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu @@ -33,6 +33,7 @@ #include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/embedding_common.h" #include "fbgemm_gpu/fbgemm_cuda_utils.cuh" +#include "fbgemm_gpu/fbgemm_tensor_accessor.h" #include "fbgemm_gpu/ops_utils.h" #include "fbgemm_gpu/sparse_ops_utils.h" #include "fbgemm_gpu/split_embeddings_utils.cuh" @@ -103,18 +104,18 @@ constexpr int64_t kCacheStateInvalid = -1; template __global__ __launch_bounds__(kMaxThreads) void lxu_cache_flush_kernel( - at::PackedTensorAccessor64 weights, - const at::PackedTensorAccessor32 + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 cache_hash_size_cumsum, - const at::PackedTensorAccessor64 + const pta::PackedTensorAccessor64 cache_index_table_map, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 weights_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 D_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 lxu_cache_state, - at::PackedTensorAccessor64 + pta::PackedTensorAccessor64 lxu_cache_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args) { @@ -209,22 +210,21 @@ DLL_PUBLIC void lxu_cache_flush_cuda( rng_engine_inputs = at::check_generator(gen) ->philox_cuda_state(4); } +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lxu_cache_flush_kernel"; +#endif lxu_cache_flush_kernel <<>>( - uvm_weights - .packed_accessor64(), - cache_hash_size_cumsum - .packed_accessor32(), - cache_index_table_map - .packed_accessor64(), - weights_offsets - .packed_accessor32(), - D_offsets - .packed_accessor32(), - lxu_cache_state - .packed_accessor32(), - lxu_cache_weights - .packed_accessor64(), + MAKE_PTA_WITH_NAME(func_name, uvm_weights, emb_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_weights, cache_t, 2, 64), stochastic_rounding, rng_engine_inputs); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -235,12 +235,13 @@ namespace { template __global__ __launch_bounds__(kMaxThreads) void linearize_cache_indices_kernel( - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 cache_hash_size_cumsum, - const at::PackedTensorAccessor32 indices, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 + indices, + const pta::PackedTensorAccessor32 table_offsets, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 linear_cache_indices) { const index_t index = 
blockIdx.x * blockDim.x + threadIdx.x; if (index >= indices.size(0)) { @@ -298,20 +299,23 @@ DLL_PUBLIC Tensor linearize_cache_indices_cuda( } auto table_offsets = offsets.slice(0, B, B * T, B); + AT_DISPATCH_INDEX_TYPES( indices.scalar_type(), "linearize_cache_indices_kernel", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "linearize_cache_indices_kernel"; +#endif linearize_cache_indices_kernel<<< div_round_up(num_indices, kMaxThreads), kMaxThreads, 0, at::cuda::getCurrentCUDAStream()>>>( - cache_hash_size_cumsum - .packed_accessor32(), - indices.packed_accessor32(), - table_offsets - .packed_accessor32(), - linear_cache_indices - .packed_accessor32()); + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, table_offsets, index_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, linear_cache_indices, index_t, 1, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); return linear_cache_indices; @@ -322,13 +326,13 @@ namespace { template __global__ __launch_bounds__(kMaxThreads) void linearize_cache_indices_from_row_idx_kernel( - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 cache_hash_size_cumsum, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 update_table_indices, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 update_row_indices, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 linear_cache_indices) { const index_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index >= update_row_indices.size(0)) { @@ -373,19 +377,20 @@ DLL_PUBLIC Tensor linearize_cache_indices_from_row_idx_cuda( update_row_indices.scalar_type(), "linearize_cache_indices_from_row_idx_kernel", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "linearize_cache_indices_from_row_idx_kernel"; +#endif linearize_cache_indices_from_row_idx_kernel<<< div_round_up(num_indices, kMaxThreads), kMaxThreads, 0, at::cuda::getCurrentCUDAStream()>>>( - cache_hash_size_cumsum - .packed_accessor32(), - update_table_indices - .packed_accessor32(), - update_row_indices - .packed_accessor32(), - linear_cache_indices - .packed_accessor32()); + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, update_table_indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, update_row_indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, linear_cache_indices, index_t, 1, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); return linear_cache_indices; @@ -499,11 +504,11 @@ get_unique_indices_cuda( namespace { __global__ __launch_bounds__(kMaxThreads) void emulate_cache_miss_kernel( - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_locations, const int64_t enforced_misses_per_256, const bool gather_cache_stats, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 uvm_cache_stats) { const int32_t N = lxu_cache_locations.size(0); int64_t n_enforced_misses = 0; @@ -544,16 +549,19 @@ DLL_PUBLIC Tensor emulate_cache_miss( div_round_up(N, kMaxThreads), get_max_thread_blocks_for_cache_kernels_())); +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "emulate_cache_miss_kernel"; +#endif + emulate_cache_miss_kernel<<< blocks, kMaxThreads, 0, at::cuda::getCurrentCUDAStream()>>>( - lxu_cache_locations - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), enforced_misses_per_256, gather_cache_stats, - uvm_cache_stats.packed_accessor32()); + 
MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); return lxu_cache_locations; } @@ -562,9 +570,9 @@ namespace { // count the number of times that a cache_slot appears in lxu_cache_locations // we actually only care about whether the number is 0 or > 0. __global__ __launch_bounds__(kMaxThreads) void lxu_cache_locations_count_kernel( - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_locations, - at::PackedTensorAccessor32 count, + pta::PackedTensorAccessor32 count, FixedDivisor fd) { const int32_t N = lxu_cache_locations.size(0); CUDA_KERNEL_LOOP(n, N) { @@ -581,9 +589,9 @@ __global__ __launch_bounds__(kMaxThreads) void lxu_cache_locations_count_kernel( // decrement the counter of that cache_slot. __global__ __launch_bounds__(kMaxThreads) void lxu_cache_locking_counter_decrement_kernel( - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_locking_counter, - at::PackedTensorAccessor32 count) { + pta::PackedTensorAccessor32 count) { const int32_t C = lxu_cache_locking_counter.size(0); for (int32_t i = blockIdx.x * blockDim.y + threadIdx.y; i < C; i += gridDim.x * blockDim.y) { @@ -621,17 +629,24 @@ void lxu_cache_locking_counter_decrement_cuda( div_round_up(N, kMaxThreads), get_max_thread_blocks_for_cache_kernels_())); +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lxu_cache_locations_count_kernel"; +#endif + lxu_cache_locations_count_kernel<<< blocks, kMaxThreads, 0, at::cuda::getCurrentCUDAStream()>>>( - lxu_cache_locations - .packed_accessor32(), - count.packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, count, int32_t, 2, 32), fd); C10_CUDA_KERNEL_LAUNCH_CHECK(); +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name2 = "lxu_cache_locking_counter_decrement_kernel"; +#endif + lxu_cache_locking_counter_decrement_kernel<<< std::min( div_round_up(C, kMaxThreads / kWarpSize), @@ -639,29 +654,28 @@ void lxu_cache_locking_counter_decrement_cuda( dim3(kWarpSize, kMaxThreads / kWarpSize), 0, at::cuda::getCurrentCUDAStream()>>>( - lxu_cache_locking_counter - .packed_accessor32(), - count.packed_accessor32()); + MAKE_PTA_WITH_NAME(func_name2, lxu_cache_locking_counter, int32_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name2, count, int32_t, 2, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); } namespace { template __global__ __launch_bounds__(kMaxThreads) void lru_cache_find_uncached_kernel( - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 unique_indices, const int32_t* __restrict__ N_unique, int64_t max_indices, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 lxu_cache_state, - at::PackedTensorAccessor32 cache_sets, + pta::PackedTensorAccessor32 cache_sets, int64_t time_stamp, - at::PackedTensorAccessor32 lru_state, + pta::PackedTensorAccessor32 lru_state, const bool gather_cache_stats, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 uvm_cache_stats, const bool lock_cache_line, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_locking_counter) { if (gather_cache_stats) { if (blockIdx.x == 0 && threadIdx.x == 0 && threadIdx.y == 0) { @@ -720,15 +734,15 @@ __global__ __launch_bounds__(kMaxThreads) void lru_cache_find_uncached_kernel( template __global__ __launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_find_uncached_kernel( - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 linear_cache_indices, - at::PackedTensorAccessor32 cache_sets, + 
pta::PackedTensorAccessor32 cache_sets, const int64_t max_indices, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 lxu_cache_state, const int64_t time_stamp, - at::PackedTensorAccessor32 lru_state, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lru_state, + pta::PackedTensorAccessor32 lxu_cache_miss_timestamp) { const int32_t N = linear_cache_indices.size(0); const int32_t C = lxu_cache_state.size(0); @@ -814,6 +828,9 @@ DLL_PUBLIC std::pair lru_cache_find_uncached_cuda( AT_DISPATCH_INDEX_TYPES( unique_indices.scalar_type(), "lru_cache_find_uncached_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lru_cache_find_uncached_kernel"; +#endif // Find uncached indices lru_cache_find_uncached_kernel<<< std::min( @@ -822,21 +839,18 @@ DLL_PUBLIC std::pair lru_cache_find_uncached_cuda( dim3(kWarpSize, kMaxThreads / kWarpSize), 0, at::cuda::getCurrentCUDAStream()>>>( - unique_indices - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), unique_indices_length.data_ptr(), max_indices, - lxu_cache_state - .packed_accessor32(), - cache_sets.packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, cache_sets, int32_t, 1, 32), time_stamp, - lru_state.packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), gather_cache_stats, - uvm_cache_stats - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), lock_cache_line, - lxu_cache_locking_counter - .packed_accessor32()); + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_locking_counter, int32_t, 2, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); // Sort the cache sets and ids size_t temp_storage_bytes = 0; @@ -898,6 +912,9 @@ Tensor direct_mapped_lru_cache_find_uncached_cuda( linear_cache_indices.scalar_type(), "direct_mapped_lru_cache_find_uncached_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "direct_mapped_lru_cache_find_uncached_kernel"; +#endif // Find uncached indices direct_mapped_lru_cache_find_uncached_kernel<<< std::min( @@ -906,16 +923,14 @@ Tensor direct_mapped_lru_cache_find_uncached_cuda( kMaxThreads, 0, at::cuda::getCurrentCUDAStream()>>>( - linear_cache_indices - .packed_accessor32(), - cache_sets.packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, cache_sets, int32_t, 1, 32), max_indices, - lxu_cache_state - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), time_stamp, - lru_state.packed_accessor32(), - lxu_cache_miss_timestamp - .packed_accessor32()); + MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_miss_timestamp, int64_t, 2, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -924,33 +939,33 @@ Tensor direct_mapped_lru_cache_find_uncached_cuda( template __global__ __launch_bounds__(kMaxThreads) void lru_cache_insert_kernel( - at::PackedTensorAccessor64 weights, - const at::PackedTensorAccessor32 + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 cache_hash_size_cumsum, - const at::PackedTensorAccessor64 + const pta::PackedTensorAccessor64 cache_index_table_map, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 weights_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 D_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 sorted_cache_sets, - const 
at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 cache_set_sorted_indices, const int32_t* __restrict__ N_unique, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_state, - at::PackedTensorAccessor64 + pta::PackedTensorAccessor64 lxu_cache_weights, const int64_t time_stamp, - at::PackedTensorAccessor32 lru_state, + pta::PackedTensorAccessor32 lru_state, const bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const bool gather_cache_stats, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 uvm_cache_stats, const bool lock_cache_line, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_locking_counter) { const int32_t C = lxu_cache_state.size(0); int32_t n_conflict_misses = 0; @@ -1145,40 +1160,38 @@ void lru_cache_insert_cuda( ? div_round_up(get_device_sm_cnt_(), ALL_TO_PREFETCH_SM_RATIO) : div_round_up(N, kMaxThreads / kWarpSize); +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lru_cache_insert_kernel"; +#endif lru_cache_insert_kernel <<>>( - weights.packed_accessor64(), - cache_hash_size_cumsum - .packed_accessor32(), - cache_index_table_map - .packed_accessor64(), - weights_offsets - .packed_accessor32(), - D_offsets - .packed_accessor32(), - sorted_cache_sets - .packed_accessor32(), - cache_set_sorted_unique_indices - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, weights, emb_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, sorted_cache_sets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_set_sorted_unique_indices, int64_t, 1, 32), unique_indices_length.data_ptr(), - lxu_cache_state - .packed_accessor32(), - lxu_cache_weights - .packed_accessor64(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_weights, cache_t, 2, 64), time_stamp, - lru_state - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), stochastic_rounding, rng_engine_inputs, gather_cache_stats, - uvm_cache_stats - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), lock_cache_line, - lxu_cache_locking_counter - .packed_accessor32()); + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_locking_counter, int32_t, 2, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); })); } @@ -1285,30 +1298,30 @@ namespace { template __global__ __launch_bounds__(kMaxThreads) void lru_cache_insert_byte_kernel( - at::PackedTensorAccessor64 weights, - const at::PackedTensorAccessor32 + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 cache_hash_size_cumsum, - const at::PackedTensorAccessor64 + const pta::PackedTensorAccessor64 cache_index_table_map, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 weights_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 weights_tys, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 D_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 sorted_cache_sets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 cache_set_sorted_indices, const int32_t* __restrict__ N_unique, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_state, - at::PackedTensorAccessor64 + 
pta::PackedTensorAccessor64 lxu_cache_weights, int64_t time_stamp, - at::PackedTensorAccessor32 lru_state, + pta::PackedTensorAccessor32 lru_state, const bool gather_cache_stats, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 uvm_cache_stats, const int64_t row_alignment) { const int32_t C = lxu_cache_state.size(0); @@ -1396,28 +1409,28 @@ __global__ __launch_bounds__(kMaxThreads) void lru_cache_insert_byte_kernel( template __global__ __launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_insert_byte_kernel( - at::PackedTensorAccessor64 weights, - const at::PackedTensorAccessor32 + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 cache_hash_size_cumsum, - const at::PackedTensorAccessor64 + const pta::PackedTensorAccessor64 cache_index_table_map, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 weights_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 weights_tys, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 D_offsets, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_state, - at::PackedTensorAccessor64 + pta::PackedTensorAccessor64 lxu_cache_weights, int64_t time_stamp, - at::PackedTensorAccessor32 lru_state, - const at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lru_state, + const pta::PackedTensorAccessor32 linear_cache_indices, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_miss_timestamp, - at::PackedTensorAccessor32 cache_sets, + pta::PackedTensorAccessor32 cache_sets, const int64_t row_alignment) { const int32_t N = cache_sets.size(0); @@ -1519,6 +1532,9 @@ void lru_cache_insert_byte_cuda( cache_set_sorted_unique_indices.scalar_type(), "lru_cache_insert_byte_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lru_cache_insert_byte_kernel"; +#endif lru_cache_insert_byte_kernel<<< std::min( div_round_up(N, kMaxThreads / kWarpSize), @@ -1526,29 +1542,24 @@ void lru_cache_insert_byte_cuda( dim3(kWarpSize, kMaxThreads / kWarpSize), 0, at::cuda::getCurrentCUDAStream()>>>( - weights.packed_accessor64(), - cache_hash_size_cumsum - .packed_accessor32(), - cache_index_table_map - .packed_accessor64(), - weights_offsets - .packed_accessor32(), - weights_tys.packed_accessor32(), - D_offsets.packed_accessor32(), - sorted_cache_sets - .packed_accessor32(), - cache_set_sorted_unique_indices - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, weights, uint8_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, weights_tys, uint8_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, sorted_cache_sets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_set_sorted_unique_indices, index_t, 1, 32), unique_indices_length.data_ptr(), - lxu_cache_state - .packed_accessor32(), - lxu_cache_weights - .packed_accessor64(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_weights, uint8_t, 2, 64), time_stamp, - lru_state.packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), gather_cache_stats, - uvm_cache_stats - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), row_alignment); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -1591,6 +1602,9 @@ void 
direct_mapped_lru_cache_insert_byte_cuda( linear_cache_indices.scalar_type(), "direct_mapped_lru_cache_insert_byte_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "direct_mapped_lru_cache_insert_byte_kernel"; +#endif direct_mapped_lru_cache_insert_byte_kernel<<< std::min( div_round_up(N, kMaxThreads / kWarpSize), @@ -1598,26 +1612,22 @@ void direct_mapped_lru_cache_insert_byte_cuda( dim3(kWarpSize, kMaxThreads / kWarpSize), 0, at::cuda::getCurrentCUDAStream()>>>( - weights.packed_accessor64(), - cache_hash_size_cumsum - .packed_accessor32(), - cache_index_table_map - .packed_accessor64(), - weights_offsets - .packed_accessor32(), - weights_tys.packed_accessor32(), - D_offsets.packed_accessor32(), - lxu_cache_state - .packed_accessor32(), - lxu_cache_weights - .packed_accessor64(), + MAKE_PTA_WITH_NAME(func_name, weights, uint8_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, weights_tys, uint8_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_weights, uint8_t, 2, 64), time_stamp, - lru_state.packed_accessor32(), - linear_cache_indices - .packed_accessor32(), - lxu_cache_miss_timestamp - .packed_accessor32(), - cache_sets.packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_miss_timestamp, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, cache_sets, int32_t, 1, 32), row_alignment); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -1809,12 +1819,12 @@ namespace { template __global__ __launch_bounds__(kMaxThreads) void lfu_update_counts_kernel( - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 unique_indices, const int32_t* __restrict__ N_unique, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 unique_indices_count, - at::PackedTensorAccessor64 lfu_state) { + pta::PackedTensorAccessor64 lfu_state) { CUDA_KERNEL_LOOP(n, *N_unique) { const auto idx = unique_indices[n]; lfu_state[idx] += unique_indices_count[n]; @@ -1835,6 +1845,9 @@ void lfu_update_counts_cuda( const int32_t N = unique_indices.size(0); AT_DISPATCH_INDEX_TYPES( unique_indices.scalar_type(), "lfu_update_counts_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lfu_update_counts_kernel"; +#endif lfu_update_counts_kernel<<< std::min( div_round_up(N, kMaxThreads), @@ -1842,12 +1855,10 @@ void lfu_update_counts_cuda( kMaxThreads, 0, at::cuda::getCurrentCUDAStream()>>>( - unique_indices - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), unique_indices_length.data_ptr(), - unique_indices_count - .packed_accessor32(), - lfu_state.packed_accessor64()); + MAKE_PTA_WITH_NAME(func_name, unique_indices_count, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -1858,14 +1869,14 @@ static_assert(kCacheSetBits + kLFUCounterBits == 8 * sizeof(int64_t), ""); template __global__ __launch_bounds__(kMaxThreads) void lfu_cache_find_uncached_kernel( - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 unique_indices, const int32_t* __restrict__ N_unique, int64_t max_indices, - const 
at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 lxu_cache_state, uint64_t* __restrict__ cache_sets, - const at::PackedTensorAccessor64 + const pta::PackedTensorAccessor64 lfu_state) { const int32_t C = lxu_cache_state.size(0); @@ -1922,6 +1933,9 @@ std::pair lfu_cache_find_uncached_cuda( AT_DISPATCH_INDEX_TYPES( unique_indices.scalar_type(), "lfu_cache_find_uncached_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lfu_cache_find_uncached_kernel"; +#endif // Find uncached indices lfu_cache_find_uncached_kernel<<< std::min( @@ -1930,14 +1944,12 @@ std::pair lfu_cache_find_uncached_cuda( dim3(kWarpSize, kMaxThreads / kWarpSize), 0, at::cuda::getCurrentCUDAStream()>>>( - unique_indices - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), unique_indices_length.data_ptr(), max_indices, - lxu_cache_state - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), (uint64_t*)cache_sets.data_ptr(), - lfu_state.packed_accessor64()); + MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64)); C10_CUDA_KERNEL_LAUNCH_CHECK(); // Sort the cache sets and ids size_t temp_storage_bytes = 0; @@ -1974,24 +1986,24 @@ std::pair lfu_cache_find_uncached_cuda( template __global__ __launch_bounds__(kCacheMaxThreads) void lfu_cache_insert_kernel( - at::PackedTensorAccessor64 weights, - const at::PackedTensorAccessor32 + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 cache_hash_size_cumsum, - const at::PackedTensorAccessor64 + const pta::PackedTensorAccessor64 cache_index_table_map, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 weights_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 D_offsets, const uint64_t* __restrict__ sorted_cache_sets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 cache_set_sorted_indices, const int32_t* __restrict__ N_unique, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_state, - at::PackedTensorAccessor64 + pta::PackedTensorAccessor64 lxu_cache_weights, - const at::PackedTensorAccessor64 + const pta::PackedTensorAccessor64 lfu_state, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args) { @@ -2164,6 +2176,10 @@ void lfu_cache_insert_cuda( ->philox_cuda_state(4); } +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lfu_cache_insert_kernel"; +#endif + lfu_cache_insert_kernel <<>>( - weights.packed_accessor64(), - cache_hash_size_cumsum - .packed_accessor32(), - cache_index_table_map - .packed_accessor64(), - weights_offsets - .packed_accessor32(), - D_offsets - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, weights, emb_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), (uint64_t*)sorted_cache_sets.data_ptr(), - cache_set_sorted_unique_indices - .packed_accessor32(), + MAKE_PTA_WITH_NAME( + func_name, cache_set_sorted_unique_indices, int64_t, 1, 32), unique_indices_length.data_ptr(), - lxu_cache_state - .packed_accessor32(), - lxu_cache_weights - .packed_accessor64(), - lfu_state - .packed_accessor64(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_weights, cache_t, 2, 64), + MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64), 
stochastic_rounding, rng_engine_inputs); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -2290,26 +2302,26 @@ namespace { template __global__ __launch_bounds__(kCacheMaxThreads) void lfu_cache_insert_byte_kernel( - at::PackedTensorAccessor64 weights, - const at::PackedTensorAccessor32 + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 cache_hash_size_cumsum, - const at::PackedTensorAccessor64 + const pta::PackedTensorAccessor64 cache_index_table_map, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 weights_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 weights_tys, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 D_offsets, const uint64_t* __restrict__ sorted_cache_sets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 cache_set_sorted_indices, const int32_t* __restrict__ N_unique, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_state, - at::PackedTensorAccessor64 + pta::PackedTensorAccessor64 lxu_cache_weights, - const at::PackedTensorAccessor64 + const pta::PackedTensorAccessor64 lfu_state, const int64_t row_alignment) { const int32_t C = lxu_cache_state.size(0); @@ -2433,6 +2445,9 @@ void lfu_cache_insert_byte_cuda( cache_set_sorted_unique_indices.scalar_type(), "lfu_cache_insert_byte_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lfu_cache_insert_byte_kernel"; +#endif lfu_cache_insert_byte_kernel<<< std::min( div_round_up(N, kCacheMaxThreads / kWarpSize), @@ -2440,24 +2455,21 @@ void lfu_cache_insert_byte_cuda( dim3(kWarpSize, kCacheMaxThreads / kWarpSize), 0, at::cuda::getCurrentCUDAStream()>>>( - weights.packed_accessor64(), - cache_hash_size_cumsum - .packed_accessor32(), - cache_index_table_map - .packed_accessor64(), - weights_offsets - .packed_accessor32(), - weights_tys.packed_accessor32(), - D_offsets.packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, weights, uint8_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, weights_tys, uint8_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), (uint64_t*)sorted_cache_sets.data_ptr(), - cache_set_sorted_unique_indices - .packed_accessor32(), + MAKE_PTA_WITH_NAME( + func_name, cache_set_sorted_unique_indices, index_t, 1, 32), unique_indices_length.data_ptr(), - lxu_cache_state - .packed_accessor32(), - lxu_cache_weights - .packed_accessor64(), - lfu_state.packed_accessor64(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_weights, uint8_t, 2, 64), + MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64), row_alignment); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -2544,15 +2556,15 @@ namespace { template __global__ __launch_bounds__(kMaxThreads) void lxu_cache_lookup_kernel( - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 linear_cache_indices, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 lxu_cache_state, int64_t invalid_index, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_locations, const bool gather_cache_stats, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 uvm_cache_stats) { const int32_t C = lxu_cache_state.size(0); const int32_t N = linear_cache_indices.size(0); @@ -2614,12 +2626,12 @@ __global__ 
__launch_bounds__(kMaxThreads) void lxu_cache_lookup_kernel( template __global__ __launch_bounds__(kMaxThreads) void direct_mapped_lxu_cache_lookup_kernel( - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 linear_cache_indices, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 lxu_cache_state, int64_t invalid_index, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_locations) { const int32_t C = lxu_cache_state.size(0); const int32_t N = linear_cache_indices.size(0); @@ -2676,21 +2688,20 @@ DLL_PUBLIC Tensor lxu_cache_lookup_cuda( AT_DISPATCH_INDEX_TYPES( linear_cache_indices.scalar_type(), "lxu_cache_lookup_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lxu_cache_lookup_kernel"; +#endif lxu_cache_lookup_kernel<<< blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>( - linear_cache_indices - .packed_accessor32(), - lxu_cache_state - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), invalid_index, - lxu_cache_locations - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), gather_cache_stats, - uvm_cache_stats_ - .packed_accessor32()); + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats_, int32_t, 1, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -2701,9 +2712,9 @@ namespace { __global__ __launch_bounds__(kMaxThreads) void lxu_cache_locations_update_kernel( - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 lxu_cache_locations, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 lxu_cache_locations_new) { const int32_t N = lxu_cache_locations.size(0); CUDA_KERNEL_LOOP(n, N) { @@ -2734,15 +2745,17 @@ DLL_PUBLIC void lxu_cache_locations_update_cuda( div_round_up(N, kMaxThreads), get_max_thread_blocks_for_cache_kernels_())); +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lxu_cache_locations_update_kernel"; +#endif + lxu_cache_locations_update_kernel<<< blocks, kMaxThreads, 0, at::cuda::getCurrentCUDAStream()>>>( - lxu_cache_locations - .packed_accessor32(), - lxu_cache_locations_new - .packed_accessor32()); + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations_new, int32_t, 1, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); return; @@ -2772,18 +2785,18 @@ DLL_PUBLIC Tensor direct_mapped_lxu_cache_lookup_cuda( linear_cache_indices.scalar_type(), "direct_mapped_lxu_cache_lookup_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "direct_mapped_lxu_cache_lookup_kernel"; +#endif direct_mapped_lxu_cache_lookup_kernel<<< blocks, kMaxThreads, 0, at::cuda::getCurrentCUDAStream()>>>( - linear_cache_indices - .packed_accessor32(), - lxu_cache_state - .packed_accessor32(), + MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), invalid_index, - lxu_cache_locations - .packed_accessor32()); + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -2798,17 +2811,17 @@ int get_sm_count_() { __global__ __launch_bounds__(kMaxThreads) void get_cache_indices_kernel( int32_t blocks_per_table, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 cache_hash_size_cumsum, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 pruned_indices, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 
pruned_indices_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 logical_table_ids, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 buffer_ids, - at::PackedTensorAccessor32 + pta::PackedTensorAccessor32 linear_cache_indices) { const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; @@ -2855,37 +2868,37 @@ __global__ __launch_bounds__(kMaxThreads) void get_cache_indices_kernel( template __global__ __launch_bounds__(kMaxThreads) void reset_weight_momentum_kernel( int32_t blocks_per_table, - at::PackedTensorAccessor64 dev_weights, - at::PackedTensorAccessor64 uvm_weights, - at::PackedTensorAccessor64 + pta::PackedTensorAccessor64 dev_weights, + pta::PackedTensorAccessor64 uvm_weights, + pta::PackedTensorAccessor64 lxu_cache_weights, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 weights_placements, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 weights_offsets, - at::PackedTensorAccessor64< + pta::PackedTensorAccessor64< at::acc_type, 1, at::RestrictPtrTraits> momentum1_dev, - at::PackedTensorAccessor64< + pta::PackedTensorAccessor64< at::acc_type, 1, at::RestrictPtrTraits> momentum1_uvm, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 momentum1_placements, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 momentum1_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 D_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 pruned_indices, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 pruned_indices_offsets, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 logical_table_ids, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 buffer_ids, - const at::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 lxu_cache_locations) { const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; @@ -3025,22 +3038,22 @@ DLL_PUBLIC void reset_weight_momentum_cuda( auto linear_cache_indices = at::zeros( {num_pruned_indices}, pruned_indices.options().dtype(at::kLong)); +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "get_cache_indices_kernel"; +#endif + get_cache_indices_kernel<<< num_pruned_tables * blocks_per_table, kMaxThreads, 0, at::cuda::getCurrentCUDAStream()>>>( blocks_per_table, - cache_hash_size_cumsum - .packed_accessor32(), - pruned_indices.packed_accessor32(), - pruned_indices_offsets - .packed_accessor32(), - logical_table_ids - .packed_accessor32(), - buffer_ids.packed_accessor32(), - linear_cache_indices - .packed_accessor32()); + MAKE_PTA_WITH_NAME(func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, pruned_indices, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, pruned_indices_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, logical_table_ids, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, buffer_ids, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, int64_t, 1, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); // Look up cache locations @@ -3060,42 +3073,39 @@ DLL_PUBLIC void reset_weight_momentum_cuda( lxu_cache_weights.scalar_type(), "reset_weight_momentum_kernel", ([&] { - reset_weight_momentum_kernel<<< - num_pruned_tables * blocks_per_table, - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - blocks_per_table, - dev_weights.packed_accessor64(), - uvm_weights.packed_accessor64(), - lxu_cache_weights - .packed_accessor64(), - weights_placements - 
.packed_accessor32(), - weights_offsets - .packed_accessor32(), - momentum1_dev.packed_accessor64< - at::acc_type, - 1, - at::RestrictPtrTraits>(), - momentum1_uvm.packed_accessor64< - at::acc_type, - 1, - at::RestrictPtrTraits>(), - momentum1_placements - .packed_accessor32(), - momentum1_offsets - .packed_accessor32(), - D_offsets.packed_accessor32(), - pruned_indices - .packed_accessor32(), - pruned_indices_offsets - .packed_accessor32(), - logical_table_ids - .packed_accessor32(), - buffer_ids.packed_accessor32(), - lxu_cache_locations - .packed_accessor32()); +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name2 = "get_cache_indices_kernel"; +#endif + reset_weight_momentum_kernel + <<>>( + blocks_per_table, + MAKE_PTA_WITH_NAME(func_name2, dev_weights, emb_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name2, uvm_weights, emb_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name2, lxu_cache_weights, cache_t, 2, 64), + MAKE_PTA_WITH_NAME( + func_name2, weights_placements, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name2, weights_offsets, int64_t, 1, 32), + MAKE_PTA_ACC_WITH_NAME( + func_name2, momentum1_dev, cache_t, 1, 64), + MAKE_PTA_ACC_WITH_NAME( + func_name2, momentum1_uvm, cache_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name2, momentum1_placements, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name2, momentum1_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name2, D_offsets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name2, pruned_indices, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name2, pruned_indices_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name2, logical_table_ids, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name2, buffer_ids, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name2, lxu_cache_locations, int32_t, 1, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); })); } From 6f0abb0502d4782d0489890216e8927254d1ebfd Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 31 Aug 2023 14:49:43 -0700 Subject: [PATCH 02/94] Remove debug_synchronous from CUB call sites in FBGEMM ops (#1973) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1973 - Remove debug_synchronous from CUB call sites in FBGEMM ops Reviewed By: sryap Differential Revision: D48722495 fbshipit-source-id: 47a92dc82e9fe271d719913f8a842f7fa2c8f36f --- .../embedding_backward_split_template.cu | 12 +++---- .../fbgemm_gpu/split_embeddings_utils.cuh | 3 +- fbgemm_gpu/src/split_embeddings_utils.cu | 36 +++++++++++++++++-- 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_template.cu index a762c163bf..3a3cfb1983 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_backward_split_template.cu @@ -524,8 +524,7 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e linear_indices.numel(), 0, total_hash_size_bits, - at::cuda::getCurrentCUDAStream(), - false)); + at::cuda::getCurrentCUDAStream())); auto temp_storage = at::empty( {static_cast(temp_storage_bytes)}, indices.options().dtype(at::kByte)); @@ -539,8 +538,7 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e linear_indices.numel(), 0, total_hash_size_bits, - at::cuda::getCurrentCUDAStream(), - false)); + at::cuda::getCurrentCUDAStream())); } {%- endif %} @@ -568,8 +566,7 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e linear_indices.numel(), 0, total_hash_size_bits, - at::cuda::getCurrentCUDAStream(), - 
false)); + at::cuda::getCurrentCUDAStream())); auto temp_storage = at::empty( {static_cast(temp_storage_bytes)}, indices.options().dtype(at::kByte)); @@ -583,8 +580,7 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e linear_indices.numel(), 0, total_hash_size_bits, - at::cuda::getCurrentCUDAStream(), - false)); + at::cuda::getCurrentCUDAStream())); } {%- endif %} diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh index 7cd1301c9c..45c08e1028 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh @@ -69,8 +69,7 @@ std::tuple adjust_info_B_num_bits(int32_t B, int32_t T); int num_items, \ int begin_bit = 0, \ int end_bit = sizeof(KeyT) * 8, \ - cudaStream_t stream = 0, \ - bool debug_synchronous = false) + cudaStream_t stream = 0) DECL_RADIX_SORT_PAIRS_FN(int64_t, float); DECL_RADIX_SORT_PAIRS_FN(int64_t, double); diff --git a/fbgemm_gpu/src/split_embeddings_utils.cu b/fbgemm_gpu/src/split_embeddings_utils.cu index 0b887f7238..b9fba4b5f5 100644 --- a/fbgemm_gpu/src/split_embeddings_utils.cu +++ b/fbgemm_gpu/src/split_embeddings_utils.cu @@ -21,6 +21,10 @@ #include "fbgemm_gpu/cub_namespace_postfix.cuh" // clang-format on +#ifdef __HIP_PLATFORM_HCC__ +#include +#endif + inline at::Tensor asynchronous_complete_cumsum(at::Tensor t_in) { at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(t_in.get_device()); @@ -442,6 +446,32 @@ DLL_PUBLIC std::tuple adjust_info_B_num_bits( return {info_B_num_bits, info_B_mask}; } +#if defined(CUDA_VERSION) && CUDA_VERSION >= 12000 +#define DEF_RADIX_SORT_PAIRS_FN(KeyT, ValueT) \ + DLL_PUBLIC cudaError_t radix_sort_pairs( \ + void* d_temp_storage, \ + size_t& temp_storage_bytes, \ + const KeyT* d_keys_in, \ + KeyT* d_keys_out, \ + const ValueT* d_values_in, \ + ValueT* d_values_out, \ + const int num_items, \ + const int begin_bit, \ + const int end_bit, \ + cudaStream_t stream) { \ + return FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( \ + d_temp_storage, \ + temp_storage_bytes, \ + d_keys_in, \ + d_keys_out, \ + d_values_in, \ + d_values_out, \ + num_items, \ + begin_bit, \ + end_bit, \ + stream); \ + } +#else #define DEF_RADIX_SORT_PAIRS_FN(KeyT, ValueT) \ DLL_PUBLIC cudaError_t radix_sort_pairs( \ void* d_temp_storage, \ @@ -453,8 +483,7 @@ DLL_PUBLIC std::tuple adjust_info_B_num_bits( const int num_items, \ const int begin_bit, \ const int end_bit, \ - cudaStream_t stream, \ - const bool debug_synchronous) { \ + cudaStream_t stream) { \ return FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( \ d_temp_storage, \ temp_storage_bytes, \ @@ -466,8 +495,9 @@ DLL_PUBLIC std::tuple adjust_info_B_num_bits( begin_bit, \ end_bit, \ stream, \ - debug_synchronous); \ + false); \ } +#endif DEF_RADIX_SORT_PAIRS_FN(int64_t, float); DEF_RADIX_SORT_PAIRS_FN(int64_t, double); From 196ad13ed0af57f21d913b5bb75a89738427df45 Mon Sep 17 00:00:00 2001 From: Mengchi Zhang Date: Sun, 3 Sep 2023 10:52:11 -0700 Subject: [PATCH 03/94] Add meta backend for new_managed_tensor (#1990) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1990 As titled. 
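A minimal usage sketch of what the meta backend enables (assuming `fbgemm_gpu` is installed so the `torch.ops.fbgemm` namespace is registered), mirroring the `test_new_managed_tensor_meta` test added below: dispatching the op on a meta tensor now returns a shape-only meta tensor without touching CUDA or UVM.

```python
import torch
import fbgemm_gpu  # noqa: F401  (assumed: registers the torch.ops.fbgemm ops)

sizes = [4, 8]
meta_input = torch.empty(sizes).to("meta")
# The meta kernel only propagates shape/dtype; no UVM allocation happens.
meta_out = torch.ops.fbgemm.new_managed_tensor(meta_input, sizes)
assert meta_out.shape == meta_input.shape
```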
Reviewed By: jspark1105 Differential Revision: D48927429 fbshipit-source-id: dc5c11471ba2bb8bbc330d25b49c839deb14ac5c --- fbgemm_gpu/src/cumem_utils.cu | 6 ++++++ fbgemm_gpu/src/cumem_utils.h | 4 ++++ fbgemm_gpu/src/cumem_utils_host.cpp | 1 + fbgemm_gpu/test/uvm_test.py | 12 ++++++++++++ 4 files changed, 23 insertions(+) diff --git a/fbgemm_gpu/src/cumem_utils.cu b/fbgemm_gpu/src/cumem_utils.cu index ab52c76d9e..9f7ecc308d 100644 --- a/fbgemm_gpu/src/cumem_utils.cu +++ b/fbgemm_gpu/src/cumem_utils.cu @@ -188,6 +188,12 @@ Tensor new_managed_tensor( return t; } +Tensor new_managed_tensor_meta( + const Tensor& self, + const std::vector& sizes) { + return at::empty(sizes, self.options()); +} + // Allocate a cuda Tensor with unified managed memory (UVM) without the // additional steps taked by new_managed_tensor above Tensor new_vanilla_managed_tensor( diff --git a/fbgemm_gpu/src/cumem_utils.h b/fbgemm_gpu/src/cumem_utils.h index 793174eabf..93debaeed5 100644 --- a/fbgemm_gpu/src/cumem_utils.h +++ b/fbgemm_gpu/src/cumem_utils.h @@ -23,6 +23,10 @@ Tensor new_managed_tensor( const Tensor& self, const std::vector& sizes); +Tensor new_managed_tensor_meta( + const Tensor& self, + const std::vector& sizes); + ///@ingroup cumem-utils // Allocate the ATen Tensor with host-mapped memory Tensor new_host_mapped_tensor( diff --git a/fbgemm_gpu/src/cumem_utils_host.cpp b/fbgemm_gpu/src/cumem_utils_host.cpp index da865fea84..a3e32483ce 100644 --- a/fbgemm_gpu/src/cumem_utils_host.cpp +++ b/fbgemm_gpu/src/cumem_utils_host.cpp @@ -27,6 +27,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA("uvm_to_cpu", uvm_to_cpu); m.def("new_managed_tensor(Tensor self, int[] sizes) -> Tensor"); DISPATCH_TO_CUDA("new_managed_tensor", new_managed_tensor); + DISPATCH_TO_META("new_managed_tensor", new_managed_tensor_meta); m.def("new_host_mapped_tensor(Tensor self, int[] sizes) -> Tensor"); DISPATCH_TO_CUDA("new_host_mapped_tensor", new_host_mapped_tensor); m.def( diff --git a/fbgemm_gpu/test/uvm_test.py b/fbgemm_gpu/test/uvm_test.py index 677dea48c6..89ce026d7f 100644 --- a/fbgemm_gpu/test/uvm_test.py +++ b/fbgemm_gpu/test/uvm_test.py @@ -323,6 +323,18 @@ def test_uvm_to_cpu_clone(self, sizes: List[int], uvm_op) -> None: assert not torch.ops.fbgemm.is_uvm_tensor(cpu_clone) assert not torch.ops.fbgemm.uvm_storage(cpu_clone) + @unittest.skipIf(*gpu_unavailable) + @given( + sizes=st.lists( + st.integers(min_value=1, max_value=(512)), min_size=1, max_size=3 + ), + ) + @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + def test_new_managed_tensor_meta(self, sizes: List[int]) -> None: + cpu_tensor = torch.empty(sizes).to("meta") + cpu_tensor_meta = torch.ops.fbgemm.new_managed_tensor(cpu_tensor, sizes) + assert cpu_tensor.shape == cpu_tensor_meta.shape + if __name__ == "__main__": unittest.main() From 76330fa2ef917b2348eecf5f85ae8e2d696fe4cb Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Sun, 3 Sep 2023 23:53:08 -0700 Subject: [PATCH 04/94] Clear torch.compile cache between tests (#1992) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1992 Preparing for reducing cache size work. 
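The pattern this diff inserts throughout `jagged_tensor_ops_test.py`, shown as a small standalone sketch (names here are illustrative): reset Dynamo state at the start of each parametrized case so every parameter set compiles against an empty cache.

```python
import torch

def run_case(batch_size: int) -> torch.Tensor:
    # Start a fresh compile for each parameter of the test case.
    torch._dynamo.reset()

    @torch.compile
    def f(x: torch.Tensor) -> torch.Tensor:
        return x + 1

    return f(torch.ones(batch_size))

run_case(2)
run_case(3)  # compiles from scratch instead of reusing the previous case's state
```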
Reviewed By: jspark1105 Differential Revision: D48940544 fbshipit-source-id: e0047c784918d886a935f2ec059a05fbc5064e6f --- fbgemm_gpu/test/jagged_tensor_ops_test.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index ee868ff100..f8cd1cd6ff 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -320,6 +320,9 @@ def test_jagged_2d_to_dense_dynamic_shape( dtype: torch.dtype, device_type: str, ) -> None: + # Start a fresh compile for each parameter of the test case + torch._dynamo.reset() + D = D * 4 lengths_ = np.random.randint(low=0, high=max_sequence_length, size=B) total_lengths = lengths_.sum() @@ -523,6 +526,9 @@ def test_jagged_1d_to_dense_truncation(self) -> None: def test_jagged_1d_to_dense_dynamic_shape( self, B: int, max_sequence_length: int, padding_value: int, device_type: str ) -> None: + # Start a fresh compile for each parameter of the test case + torch._dynamo.reset() + def lengths_to_segment_ids(lengths: torch.Tensor) -> torch.Tensor: return torch.repeat_interleave( torch._dim_arange(lengths, 0).long(), @@ -912,6 +918,9 @@ def test_dense_to_jagged_dynamic_shape( dtype: torch.dtype, device_type: str, ) -> None: + # Start a fresh compile for each parameter of the test case + torch._dynamo.reset() + values_2d, offsets, max_lengths = self._generate_jagged_tensor( num_jagged_dim, outer_dense_size, @@ -1248,6 +1257,9 @@ def test_jagged_elementwise_binary_dynamic_shape( dtype: torch.dtype, device_type: str, ) -> None: + # Start a fresh compile for each parameter of the test case + torch._dynamo.reset() + device = torch.device(device_type) x_values, x_offsets, max_lengths = self._generate_jagged_tensor( @@ -1514,6 +1526,9 @@ def test_jagged_dense_dense_elementwise_add_jagged_output_dynamic_shape( dtype: torch.dtype, device_type: str, ) -> None: + # Start a fresh compile for each parameter of the test case + torch._dynamo.reset() + x_values, x_offsets, max_lengths = self._generate_jagged_tensor( num_jagged_dim, outer_dense_size, @@ -1720,6 +1735,9 @@ def test_batched_dense_vec_jagged_2d_mul_dynamic_shape( dtype: torch.dtype, device_type: str, ) -> None: + # Start a fresh compile for each parameter of the test case + torch._dynamo.reset() + assume(H == 1 or B != 0) device = torch.device(device_type) @@ -2405,6 +2423,9 @@ def test_jagged_dense_bmm_dynamic_shape( dtype: torch.dtype, device_type: str, ) -> None: + # Start a fresh compile for each parameter of the test case + torch._dynamo.reset() + assume(B != 0) device = torch.device(device_type) torch.backends.cuda.matmul.allow_tf32 = False From 9ed959f1d4e5b6b753359e5797a7c00c956d21dc Mon Sep 17 00:00:00 2001 From: Sungmin Cho Date: Mon, 4 Sep 2023 01:55:09 -0700 Subject: [PATCH 05/94] uvm_cache_stats for direct mapped (#1951) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1951 D40518654 introduced `uvm_cache_stats` to provide cache metrics for FBGEMM 32way cache. This diff expands its usage to also provide cache metrics for direct mapped cache. 
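A hedged sketch of how a caller can collect these stats from the direct-mapped lookup, based on the updated `direct_mapped_lxu_cache_lookup` schema in this diff; the cache shapes and the stats tensor width below are illustrative assumptions, and `fbgemm_gpu` plus a CUDA device are assumed available.

```python
import torch
import fbgemm_gpu  # noqa: F401  (assumed: registers the torch.ops.fbgemm ops)

C = 128  # number of direct-mapped cache sets (one slot per set)
linear_cache_indices = torch.randint(0, 10000, (64,), dtype=torch.long, device="cuda")
lxu_cache_state = torch.full((C, 1), -1, dtype=torch.long, device="cuda")  # empty cache
uvm_cache_stats = torch.zeros(6, dtype=torch.int32, device="cuda")  # assumed width

# gather_cache_stats / uvm_cache_stats are the arguments added in this diff.
locations = torch.ops.fbgemm.direct_mapped_lxu_cache_lookup(
    linear_cache_indices, lxu_cache_state, -1, True, uvm_cache_stats
)
# The lookup kernel accumulates, e.g., conflict-miss counts into uvm_cache_stats.
```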
Reviewed By: sryap, doehyun Differential Revision: D48023956 fbshipit-source-id: d183cf39ee848e9cb4e27686b7f45ca0a162370b --- .../split_embeddings_cache_cuda.cuh | 8 +- fbgemm_gpu/src/split_embeddings_cache_cuda.cu | 102 ++++++++++++++++-- .../src/split_table_batched_embeddings.cpp | 4 +- 3 files changed, 104 insertions(+), 10 deletions(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh index 581a85d2d4..be71c24baf 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh @@ -114,7 +114,9 @@ void direct_mapped_lru_cache_populate_byte_cuda( int64_t time_stamp, at::Tensor lru_state, at::Tensor lxu_cache_miss_timestamp, - int64_t row_alignment); + int64_t row_alignment, + bool gather_cache_stats, + c10::optional uvm_cache_stats); ///@ingroup table-batched-embed-cuda /// LFU cache: fetch the rows corresponding to `linear_cache_indices` from @@ -174,7 +176,9 @@ at::Tensor emulate_cache_miss( at::Tensor direct_mapped_lxu_cache_lookup_cuda( at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, - int64_t invalid_index); + int64_t invalid_index, + bool gather_cache_stats, + c10::optional uvm_cache_stats); //////@ingroup table-batched-embed-cuda /// Flush the cache: store the weights from the cache to the backing storage. diff --git a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu index 9eb21b6e60..ce33c5fcd7 100644 --- a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu +++ b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu @@ -11,6 +11,7 @@ #include #include #include +#include #include "fbgemm_gpu/cub_namespace_postfix.cuh" // clang-format on @@ -742,11 +743,24 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_find_uncached_kernel lxu_cache_state, const int64_t time_stamp, pta::PackedTensorAccessor32 lru_state, + const bool gather_cache_stats, + pta::PackedTensorAccessor32 + uvm_cache_stats, pta::PackedTensorAccessor32 lxu_cache_miss_timestamp) { const int32_t N = linear_cache_indices.size(0); const int32_t C = lxu_cache_state.size(0); + if (gather_cache_stats) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_calls], 1); // N_called. + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_requested_indices], + N); // N_requested_indices. 
+ } + } + CUDA_KERNEL_LOOP(n, N) { int64_t idx = linear_cache_indices[n]; if (idx == max_indices) { @@ -893,7 +907,9 @@ Tensor direct_mapped_lru_cache_find_uncached_cuda( Tensor lxu_cache_state, int64_t time_stamp, Tensor lru_state, - Tensor lxu_cache_miss_timestamp) { + Tensor lxu_cache_miss_timestamp, + bool gather_cache_stats, + Tensor uvm_cache_stats) { TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( linear_cache_indices, lxu_cache_state, @@ -929,6 +945,8 @@ Tensor direct_mapped_lru_cache_find_uncached_cuda( MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), time_stamp, MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), + gather_cache_stats, + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), MAKE_PTA_WITH_NAME( func_name, lxu_cache_miss_timestamp, int64_t, 2, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -1431,6 +1449,9 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_insert_byte_kernel( pta::PackedTensorAccessor32 lxu_cache_miss_timestamp, pta::PackedTensorAccessor32 cache_sets, + const bool gather_cache_stats, + pta::PackedTensorAccessor32 + uvm_cache_stats, const int64_t row_alignment) { const int32_t N = cache_sets.size(0); @@ -1458,6 +1479,24 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_insert_byte_kernel( // continue; // } + if (gather_cache_stats && threadIdx.x == 0) { + // We are using this slot for a slightly different purpose. + // In 32 way: + // UVM traffic for insert + // = # of inserted rows + // = # of unique misses - # of unique misses that were not inserted + // = uvm_cache_stats_index::num_unique_misses + // - uvm_cache_stats_index::num_conflict_unique_misses + // In Direct Mapped (here): + // UVM traffic for insert + // = # of inserted rows + // = uvm_cache_stats_index::num_conflict_unique_misses + // (just store here directly) + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_conflict_unique_misses], + 1); + } + // insert the index in the buffer into our only slot const int32_t insert_slot = 0; @@ -1579,6 +1618,8 @@ void direct_mapped_lru_cache_insert_byte_cuda( Tensor linear_cache_indices, Tensor lxu_cache_miss_timestamp, Tensor cache_sets, + bool gather_cache_stats, + Tensor uvm_cache_stats, int64_t row_alignment) { TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( weights, @@ -1628,6 +1669,8 @@ void direct_mapped_lru_cache_insert_byte_cuda( MAKE_PTA_WITH_NAME( func_name, lxu_cache_miss_timestamp, int64_t, 2, 32), MAKE_PTA_WITH_NAME(func_name, cache_sets, int32_t, 1, 32), + gather_cache_stats, + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), row_alignment); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -1739,7 +1782,9 @@ DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cuda( int64_t time_stamp, Tensor lru_state, Tensor lxu_cache_miss_timestamp, - int64_t row_alignment) { + int64_t row_alignment, + bool gather_cache_stats, + c10::optional uvm_cache_stats) { TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( weights, cache_hash_size_cumsum, @@ -1753,6 +1798,14 @@ DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cuda( lru_state, lxu_cache_miss_timestamp); + if (gather_cache_stats) { + TORCH_CHECK(uvm_cache_stats.has_value()); + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + uvm_cache_stats, lxu_cache_weights); + } + auto uvm_cache_stats_ = uvm_cache_stats.value_or( + at::empty({0}, weights.options().dtype(at::kInt))); + at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(weights.get_device()); @@ -1795,7 +1848,9 @@ DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cuda( lxu_cache_state, 
time_stamp, lru_state, - lxu_cache_miss_timestamp); + lxu_cache_miss_timestamp, + gather_cache_stats, + uvm_cache_stats_); // insert caching weights direct_mapped_lru_cache_insert_byte_cuda( @@ -1812,6 +1867,8 @@ DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cuda( linear_cache_indices, lxu_cache_miss_timestamp, cache_sets, + gather_cache_stats, + uvm_cache_stats_, row_alignment); } @@ -2632,10 +2689,16 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lxu_cache_lookup_kernel( lxu_cache_state, int64_t invalid_index, pta::PackedTensorAccessor32 - lxu_cache_locations) { + lxu_cache_locations, + const bool gather_cache_stats, + pta::PackedTensorAccessor32 + uvm_cache_stats) { const int32_t C = lxu_cache_state.size(0); const int32_t N = linear_cache_indices.size(0); + int32_t n_indices = 0; + int32_t n_hits = 0; + CUDA_KERNEL_LOOP(n, N) { int32_t cache_location = kCacheLocationMissing; const auto slot = 0; @@ -2646,13 +2709,29 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lxu_cache_lookup_kernel( } const int32_t cache_set = cache_slot(idx, C); + n_indices++; const bool found = (::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx); if (found) { cache_location = cache_set; + n_hits++; } lxu_cache_locations[n] = cache_location; } + + if (gather_cache_stats) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp; + + const int32_t conflict_miss = n_indices - n_hits; + const int32_t conflict_miss_sum = BlockReduce(temp).Sum(conflict_miss); + + if (threadIdx.x == 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_conflict_misses], + conflict_miss_sum); + } + } } } // namespace @@ -2764,9 +2843,18 @@ DLL_PUBLIC void lxu_cache_locations_update_cuda( DLL_PUBLIC Tensor direct_mapped_lxu_cache_lookup_cuda( Tensor linear_cache_indices, Tensor lxu_cache_state, - int64_t invalid_index) { + int64_t invalid_index, + bool gather_cache_stats, + c10::optional uvm_cache_stats) { TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( linear_cache_indices, lxu_cache_state); + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(uvm_cache_stats, lxu_cache_state); + + if (gather_cache_stats) { + TORCH_CHECK(uvm_cache_stats.has_value()); + } + auto uvm_cache_stats_ = uvm_cache_stats.value_or( + at::empty({0}, linear_cache_indices.options().dtype(at::kInt))); at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(linear_cache_indices.get_device()); @@ -2796,7 +2884,9 @@ DLL_PUBLIC Tensor direct_mapped_lxu_cache_lookup_cuda( MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), invalid_index, - MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32)); + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), + gather_cache_stats, + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats_, int32_t, 1, 32)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); diff --git a/fbgemm_gpu/src/split_table_batched_embeddings.cpp b/fbgemm_gpu/src/split_table_batched_embeddings.cpp index e32126f40b..31a330fe73 100644 --- a/fbgemm_gpu/src/split_table_batched_embeddings.cpp +++ b/fbgemm_gpu/src/split_table_batched_embeddings.cpp @@ -31,7 +31,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "lru_cache_populate_byte(Tensor weights, Tensor hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) lxu_cache_weights, int time_stamp, Tensor(c!) 
lru_state, int row_alignment=16, bool gather_cache_stats=False, Tensor(d!)? uvm_cache_stats=None) -> ()"); DISPATCH_TO_CUDA("lru_cache_populate_byte", lru_cache_populate_byte_cuda); m.def( - "direct_mapped_lru_cache_populate_byte(Tensor weights, Tensor hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) lxu_cache_weights, int time_stamp, Tensor(c!) lru_state, Tensor(d!) lxu_cache_miss_timestamp, int row_alignment=16) -> ()"); + "direct_mapped_lru_cache_populate_byte(Tensor weights, Tensor hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) lxu_cache_weights, int time_stamp, Tensor(c!) lru_state, Tensor(d!) lxu_cache_miss_timestamp, int row_alignment=16, bool gather_cache_stats=False, Tensor(e!)? uvm_cache_stats=None) -> ()"); DISPATCH_TO_CUDA( "direct_mapped_lru_cache_populate_byte", direct_mapped_lru_cache_populate_byte_cuda); @@ -45,7 +45,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "lxu_cache_lookup(Tensor linear_cache_indices, Tensor lxu_cache_state, int invalid_index = -1, bool gather_cache_stats=False, Tensor(a!)? uvm_cache_stats=None) -> Tensor"); DISPATCH_TO_CUDA("lxu_cache_lookup", lxu_cache_lookup_cuda); m.def( - "direct_mapped_lxu_cache_lookup(Tensor linear_cache_indices, Tensor lxu_cache_state, int invalid_index = -1) -> Tensor"); + "direct_mapped_lxu_cache_lookup(Tensor linear_cache_indices, Tensor lxu_cache_state, int invalid_index = -1, bool gather_cache_stats=False, Tensor(a!)? uvm_cache_stats=None) -> Tensor"); DISPATCH_TO_CUDA( "direct_mapped_lxu_cache_lookup", direct_mapped_lxu_cache_lookup_cuda); m.def( From 50f831390efc1939d43ff97b84225a63d8e1d60f Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Wed, 6 Sep 2023 12:02:47 -0700 Subject: [PATCH 06/94] Skip PooledEmbeddingModulesTest until FailedHealthCheck is fixed (#1999) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1999 Hypothesis version 6.83.2 onwards introduces `HealthCheck.differing_executors` that causes tests in`permute_pooled_embedding_test.py` to fail with error: `The method PooledEmbeddingModulesTest.setUp was called from multiple different executors. This may lead to flaky tests and nonreproducible errors when replaying from database`. Currently, we're using the latest version of hypothesis on CI: https://github.com/pytorch/FBGEMM/actions/runs/6084855480/job/16515052387 Current hypothesis on FBCode is 6.70.1 which does not have `HealthCheck.differing_executors`. 
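As a hedged sketch (not the exact change in this diff), one way to keep a single test file compatible with both Hypothesis versions is to suppress a health check only when the installed version actually defines it; the test body and strategy below are illustrative:

```python
from typing import List

from hypothesis import HealthCheck, given, settings, strategies as st

# Collect only the health checks that exist in the installed Hypothesis, so the
# same settings work on 6.70.1 (no differing_executors) and on >= 6.83.2.
suppressed: List[HealthCheck] = [
    check
    for name in ("differing_executors", "not_a_test_method")
    if (check := getattr(HealthCheck, name, None)) is not None
]

@settings(deadline=None, suppress_health_check=suppressed)
@given(x=st.integers())
def test_roundtrip(x: int) -> None:  # illustrative test, not from this diff
    assert isinstance(x, int)
```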
Reviewed By: shintaro-iwasaki Differential Revision: D49020046 fbshipit-source-id: 8ab1350411260c771baf05efe607f91c12df2385 --- .../test/permute_pooled_embedding_test.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fbgemm_gpu/test/permute_pooled_embedding_test.py b/fbgemm_gpu/test/permute_pooled_embedding_test.py index e4858f09ed..53fab069a5 100644 --- a/fbgemm_gpu/test/permute_pooled_embedding_test.py +++ b/fbgemm_gpu/test/permute_pooled_embedding_test.py @@ -26,6 +26,11 @@ typed_gpu_unavailable: Tuple[bool, str] = gpu_unavailable +if getattr(HealthCheck, "not_a_test_method", False): + suppressed_list: List[HealthCheck] = [HealthCheck.not_a_test_method] +else: + suppressed_list = [] + INTERN_MODULE = "fbgemm_gpu.permute_pooled_embedding_modules" FIXED_EXTERN_API = { "PermutePooledEmbeddings": { @@ -68,13 +73,13 @@ def forward(self, x: Tensor) -> Tensor: # @parameterized_class([{"device_type": "cpu"}, {"device_type": "cuda"}]) class PooledEmbeddingModulesTest(unittest.TestCase): - @settings(deadline=10000, suppress_health_check=[HealthCheck.not_a_test_method]) + @settings(deadline=10000, suppress_health_check=suppressed_list) # pyre-fixme[56]: Pyre was not able to infer the type of argument @given(device_type=cpu_and_maybe_gpu()) def setUp(self, device_type: torch.device) -> None: self.device = device_type - @unittest.skipIf(*typed_gpu_unavailable) + @unittest.skipIf(True, "Skip until FailedHealthCheck is fixed") def test_permutation(self) -> None: net = Net().to(self.device) @@ -84,7 +89,7 @@ def test_permutation(self) -> None: [6, 7, 8, 9, 0, 1, 5, 2, 3, 4], ) - @unittest.skipIf(*typed_gpu_unavailable) + @unittest.skipIf(True, "Skip until FailedHealthCheck is fixed") def test_permutation_autograd(self) -> None: net = Net().to(self.device) @@ -117,7 +122,7 @@ def test_compatibility(self) -> None: f"{FWD_COMPAT_MSG}", ) - @unittest.skipIf(*typed_gpu_unavailable) + @unittest.skipIf(True, "Skip until FailedHealthCheck is fixed") def test_pooled_table_batched_embedding(self) -> None: num_emb_bags = 5 num_embeddings = 10 @@ -160,7 +165,7 @@ def test_pooled_table_batched_embedding(self) -> None: ref_permuted_pooled_emb.to(self.device), permuted_pooled_emb ) - @unittest.skipIf(*typed_gpu_unavailable) + @unittest.skipIf(True, "Skip until FailedHealthCheck is fixed") def test_permutation_autograd_meta(self) -> None: """ Test that permute_pooled_embeddings_autograd works with meta tensor and @@ -175,7 +180,7 @@ def test_permutation_autograd_meta(self) -> None: assert output_meta.shape == output_cpu.shape assert input.shape == output_meta.shape - @unittest.skipIf(*typed_gpu_unavailable) + @unittest.skipIf(True, "Skip until FailedHealthCheck is fixed") def test_duplicate_permutations(self) -> None: embs_dims = [2, 3, 1, 4] permute = [3, 0, 2, 0, 1, 3] From a695e775a13ff2ecddae31a62956c3411ba7cf14 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 6 Sep 2023 15:13:26 -0700 Subject: [PATCH 07/94] Build scripts for release testing (#1998) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1998 - Assemble build script for flexible release testing Reviewed By: spcyppt Differential Revision: D49011206 fbshipit-source-id: bb08e76803ac6f9e6d644fd04db9f24a25e0fce1 --- .github/scripts/fbgemm_gpu_test.bash | 78 ++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 88b0561519..4e37a1a17f 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ 
b/.github/scripts/fbgemm_gpu_test.bash @@ -113,3 +113,81 @@ run_fbgemm_gpu_tests () { fi done } + + +################################################################################ +# FBGEMM_GPU Test Bulk-Combination Functions +################################################################################ + +test_setup_conda_environment () { + local python_version="$1" + local pytorch_installer="$2" + local pytorch_version="$3" + local pytorch_variant_type="$4" + local pytorch_variant_version="$5" + if [ "$pytorch_variant_type" == "" ]; then + echo "Usage: ${FUNCNAME[0]} PYTHON_VERSION PYTORCH_INSTALLER PYTORCH_VERSION PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env 3.8 pip test cuda 11.8.0 # Setup environment with pytorch-test for Python 3.8 + CUDA 11.8.0" + return 1 + else + echo "################################################################################" + echo "# Setup FBGEMM-GPU Build Container (All Steps)" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + local env_name="test_py${python_version}_${pytorch_installer}_pytorch_${pytorch_version}_${pytorch_variant_type}" + if [ "$pytorch_variant_version" != "" ]; then + local env_name="${env_name}_${pytorch_variant_version}" + fi + + echo "Creating the Build Environment: ${env_name} ..." + create_conda_environment "${env_name}" "${python_version}" || return 1 + + # Set up the build tools and/or GPU runtimes + if [ "$pytorch_variant_type" == "cuda" ]; then + install_cxx_compiler "${env_name}" || return 1 + install_build_tools "${env_name}" || return 1 + install_cuda "${env_name}" "${pytorch_variant_version}" || return 1 + install_cudnn "${env_name}" "${HOME}/cudnn-${pytorch_variant_version}" "${pytorch_variant_version}" || return 1 + + elif [ "$pytorch_variant_type" == "rocm" ]; then + install_rocm_ubuntu "${env_name}" "${pytorch_variant_version}" || return 1 + install_build_tools "${env_name}" || return 1 + return 1 + + else + install_cxx_compiler "${env_name}" || return 1 + install_build_tools "${env_name}" || return 1 + fi + + # Install PyTorch + if [ "$pytorch_installer" == "conda" ]; then + install_pytorch_conda "${env_name}" "${pytorch_version}" "${pytorch_variant_type}" "${pytorch_variant_version}" || return 1 + else + install_pytorch_pip "${env_name}" "${pytorch_version}" "${pytorch_variant_type}" "${pytorch_variant_version}" || return 1 + fi + + return "${env_name}" +} + +test_fbgemm_gpu_build_and_install () { + local env_name="$1" + local pytorch_variant_type="$2" + + # Assume we are starting from the repository root directory + cd fbgemm_gpu || return 1 + prepare_fbgemm_gpu_build "${env_name}" || return 1 + build_fbgemm_gpu_package "${env_name}" release "${pytorch_variant_type}" || return 1 + # shellcheck disable=SC2164 + cd - + install_fbgemm_gpu_package "${env_name}" fbgemm_gpu/dist/*.whl || return 1 + + cd fbgemm_gpu/test || return 1 + run_fbgemm_gpu_tests "${env_name}" || return 1 + # shellcheck disable=SC2164 + cd - +} From 9d6ba13d3444b3be8a7932045cf458db36721ec8 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 6 Sep 2023 18:19:27 -0700 Subject: [PATCH 08/94] =?UTF-8?q?Make=20publishing=20to=20PyPI=20optional?= =?UTF-8?q?=20in=20the=20OSS=20=E2=80=A6=20(#2003)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: …workflows - Include the option to disable publishing to PyPI when running the 
Release and Nightly OSS workflows Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2003 Reviewed By: spcyppt Differential Revision: D49040359 Pulled By: q10 fbshipit-source-id: 8cede1906e3b827a22522305ce31e9505383c4fd --- .github/workflows/fbgemm_gpu_cpu_nightly.yml | 8 +++++++- .github/workflows/fbgemm_gpu_cpu_release.yml | 8 +++++++- .github/workflows/fbgemm_gpu_cuda_nightly.yml | 8 +++++++- .github/workflows/fbgemm_gpu_cuda_release.yml | 8 +++++++- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/.github/workflows/fbgemm_gpu_cpu_nightly.yml b/.github/workflows/fbgemm_gpu_cpu_nightly.yml index 299f1c6556..32c4efd7f8 100644 --- a/.github/workflows/fbgemm_gpu_cpu_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cpu_nightly.yml @@ -29,6 +29,12 @@ on: # Manual Trigger # workflow_dispatch: + inputs: + publish_to_pypi: + description: Publish Artifact to PyPI + type: boolean + required: false + default: false concurrency: # Cancel previous runs in the PR if a new commit is pushed @@ -172,7 +178,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu - name: Push FBGEMM_GPU Nightly (CPU version) Binary to PYPI - if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' }} + if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' && github.event.inputs.publish_to_pypi == 'true' }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly_cpu-*.whl "$PYPI_TOKEN" diff --git a/.github/workflows/fbgemm_gpu_cpu_release.yml b/.github/workflows/fbgemm_gpu_cpu_release.yml index 95f18934cb..ba65181d37 100644 --- a/.github/workflows/fbgemm_gpu_cpu_release.yml +++ b/.github/workflows/fbgemm_gpu_cpu_release.yml @@ -21,6 +21,12 @@ on: # Manual Trigger # workflow_dispatch: + inputs: + publish_to_pypi: + description: Publish Artifact to PyPI + type: boolean + required: false + default: false concurrency: # Cancel previous runs in the PR if a new commit is pushed @@ -159,7 +165,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu - name: Push FBGEMM_GPU (CPU version) Binary to PYPI - if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' }} + if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' && github.event.inputs.publish_to_pypi == 'true' }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_cpu-*.whl "$PYPI_TOKEN" diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml index 29e89b4ff4..f1c81ebfd7 100644 --- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -29,6 +29,12 @@ on: # Manual Trigger # workflow_dispatch: + inputs: + publish_to_pypi: + description: Publish Artifact to PyPI + type: boolean + required: false + default: false concurrency: # Cancel previous runs in the PR if a new commit is pushed @@ -178,7 +184,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV - name: Push FBGEMM_GPU Nightly Binary to PYPI - if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' && matrix.cuda-version == matrix.cuda-version-publish }} + if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: . 
$PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly-*.whl "$PYPI_TOKEN" diff --git a/.github/workflows/fbgemm_gpu_cuda_release.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml index f5e3bdcb11..a273dae065 100644 --- a/.github/workflows/fbgemm_gpu_cuda_release.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -21,6 +21,12 @@ on: # Manual Trigger # workflow_dispatch: + inputs: + publish_to_pypi: + description: Publish Artifact to PyPI + type: boolean + required: false + default: false concurrency: # Cancel previous runs in the PR if a new commit is pushed @@ -165,7 +171,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV - name: Push FBGEMM_GPU Binary to PYPI - if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' && matrix.cuda-version == matrix.cuda-version-publish }} + if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu-*.whl "$PYPI_TOKEN" From d3fe19983e14bb5a9ce1bf0375f1425e1589d932 Mon Sep 17 00:00:00 2001 From: Jun Luo Date: Thu, 7 Sep 2023 10:18:32 -0700 Subject: [PATCH 09/94] Support MTIA device type in FBGEEM TBE training (#1994) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1994 For the MTIA device type, we have to rely on the CPU fallback. So we let the FBGEMM chose CPU path when the device type == MTIA. Reviewed By: jackm321 Differential Revision: D48809630 fbshipit-source-id: 15bec60be6efe4c8b1ad4f9d46da39ce58e36a40 --- .../split_table_batched_embeddings_ops_training.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py index c161363f08..bedf38f5c8 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py @@ -54,6 +54,7 @@ class DoesNotHavePrefix(Exception): class ComputeDevice(enum.IntEnum): CPU = 0 CUDA = 1 + MTIA = 2 class WeightDecayMode(enum.IntEnum): @@ -366,7 +367,13 @@ def __init__( # noqa C901 assert all( cd == compute_devices[0] for cd in compute_devices ), "Heterogenous compute_devices are NOT supported!" - self.use_cpu: bool = all(cd == ComputeDevice.CPU for cd in compute_devices) + # Split TBE has different function schemas for CUDA and CPU. + # For MTIA device type, it uses the CPU one. + self.use_cpu: bool = ( + compute_devices[0] == ComputeDevice.CPU + or compute_devices[0] == ComputeDevice.MTIA + ) + assert not self.use_cpu or all( loc == EmbeddingLocation.HOST for loc in locations ), "ComputeDevice.CPU is only for EmbeddingLocation.HOST!" @@ -998,7 +1005,7 @@ def forward( # noqa: C901 placements=self.momentum2_placements, ) # Ensure iter is always on CPU so the increment doesn't synchronize. - if self.iter.is_cuda: + if not self.iter.is_cpu: self.iter = self.iter.cpu() self.iter[0] += 1 From ce3943c6b8cca71d1343b882e228be853e5b780f Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 7 Sep 2023 15:10:13 -0700 Subject: [PATCH 10/94] =?UTF-8?q?Make=20publishing=20to=20PyPI=20optional?= =?UTF-8?q?=20in=20the=20OSS=20=E2=80=A6=20(#2004)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: …workflows, pt. 
2 - Fix the logic around the Publish to PyPI toggle to allow for publishing in more scenarios Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2004 Reviewed By: spcyppt Differential Revision: D49042306 Pulled By: q10 fbshipit-source-id: 757fd3db27ed1dab332f20c417a26600a0beef20 --- .github/workflows/fbgemm_gpu_cpu_nightly.yml | 2 +- .github/workflows/fbgemm_gpu_cpu_release.yml | 2 +- .github/workflows/fbgemm_gpu_cuda_nightly.yml | 2 +- .github/workflows/fbgemm_gpu_cuda_release.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/fbgemm_gpu_cpu_nightly.yml b/.github/workflows/fbgemm_gpu_cpu_nightly.yml index 32c4efd7f8..1f6547848c 100644 --- a/.github/workflows/fbgemm_gpu_cpu_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cpu_nightly.yml @@ -178,7 +178,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu - name: Push FBGEMM_GPU Nightly (CPU version) Binary to PYPI - if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' && github.event.inputs.publish_to_pypi == 'true' }} + if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true') }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly_cpu-*.whl "$PYPI_TOKEN" diff --git a/.github/workflows/fbgemm_gpu_cpu_release.yml b/.github/workflows/fbgemm_gpu_cpu_release.yml index ba65181d37..42193182a4 100644 --- a/.github/workflows/fbgemm_gpu_cpu_release.yml +++ b/.github/workflows/fbgemm_gpu_cpu_release.yml @@ -165,7 +165,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu - name: Push FBGEMM_GPU (CPU version) Binary to PYPI - if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' && github.event.inputs.publish_to_pypi == 'true' }} + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_cpu-*.whl "$PYPI_TOKEN" diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml index f1c81ebfd7..c3aeef8ba8 100644 --- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -184,7 +184,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV - name: Push FBGEMM_GPU Nightly Binary to PYPI - if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish }} + if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly-*.whl "$PYPI_TOKEN" diff --git a/.github/workflows/fbgemm_gpu_cuda_release.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml index a273dae065..bb4ad8fa67 100644 --- a/.github/workflows/fbgemm_gpu_cuda_release.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -171,7 +171,7 @@ jobs: run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV - name: Push FBGEMM_GPU Binary to PYPI - if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish }} + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu-*.whl "$PYPI_TOKEN" From f664fd908dc0f0f29d69f358cfd5d71126761000 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 7 Sep 2023 16:39:47 -0700 Subject: [PATCH 11/94] Remove unsupported type dispatch from FBGEMM ops, pt. 1 (#1989) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1989 - Remove unsupported type dispatch from FBGEMM ops, pt. 1 Reviewed By: sryap Differential Revision: D48895311 fbshipit-source-id: 05a0ee7e3db10bbe720b01457a08b29de6f8afdc --- .../embedding_backward_split_cpu_approx_template.cpp | 5 +++-- .../codegen/embedding_backward_split_cpu_template.cpp | 7 ++++--- fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp | 7 ++++--- fbgemm_gpu/src/jagged_tensor_ops/common.cuh | 1 + fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp | 1 + .../jagged_tensor_ops/keyed_jagged_index_select_dim1.cu | 2 +- fbgemm_gpu/src/layout_transform_ops.cu | 7 ++++--- fbgemm_gpu/src/layout_transform_ops_cpu.cpp | 3 ++- fbgemm_gpu/src/metric_ops.cu | 3 ++- 9 files changed, 22 insertions(+), 14 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_backward_split_cpu_approx_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_cpu_approx_template.cpp index 992ca87eaf..5dedfd0f57 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_cpu_approx_template.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_split_cpu_approx_template.cpp @@ -17,6 +17,7 @@ #include "fbgemm/FbgemmEmbedding.h" #include "fbgemm_gpu/cpu_utils.h" #include "fbgemm_gpu/embedding_common.h" +#include "fbgemm_gpu/dispatch_macros.h" using Tensor = at::Tensor; using namespace fbgemm_gpu; @@ -193,10 +194,10 @@ for (const auto t : c10::irange(t_begin,t_end)) { {% endif %} - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( grad_output.scalar_type(), "split_embedding_backward_cpu", [&] { using grad_t = scalar_t; - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( host_weights.scalar_type(), "split_embedding_backward_cpu_inner", [&] { diff --git a/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp index 2bf098498d..fbdbcf45b4 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp @@ -20,6 +20,7 @@ #include "fbgemm/FbgemmEmbedding.h" #include "fbgemm/Types.h" #include "fbgemm_gpu/embedding_common.h" +#include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/cpu_utils.h" #include "fbgemm_gpu/sparse_ops_utils.h" @@ -344,11 +345,11 @@ for (const auto d : c10::irange(D)) { grad_output = grad_output.contiguous(); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( grad_output.scalar_type(), "split_embedding_backward_exact_cpu_outer", [&]() { using grad_t = scalar_t; - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( host_weights.scalar_type(), "split_embedding_backward_exact_cpu", [&] { split_embedding_backward_exact_cpu_kernel( 
grad_output, @@ -379,7 +380,7 @@ for (const auto d : c10::irange(D)) { // When input is dense enough, avoid sorting and just treat as dense. auto grad = zeros_like(host_weights, grad_output.dtype()); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( grad_output.scalar_type(), "split_embedding_backward_exact_cpu", [&] { split_embedding_backward_exact_cpu_dense_kernel( diff --git a/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp b/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp index fc879b525a..b440652b28 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp @@ -11,6 +11,7 @@ #include "fbgemm/Types.h" #include "fbgemm/Utils.h" #include "fbgemm_gpu/cpu_utils.h" +#include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/embedding_common.h" #include "fbgemm_gpu/sparse_ops_utils.h" #ifdef FBCODE_CAFFE2 @@ -201,7 +202,7 @@ Tensor split_embedding_codegen_forward_cpu( // It is assumed that the indice_weights will always be float TORCH_CHECK( !indice_weights.defined() || indice_weights.scalar_type() != at::kHalf); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( output.scalar_type(), "split_embedding_cpu_forward", [&]() { using output_t = scalar_t; AT_DISPATCH_FLOATING_TYPES_AND2( @@ -298,12 +299,12 @@ Tensor split_embedding_codegen_grad_indice_weights_cpu( indices, indices.options().dtype( at::toAccumulateType(grad_output.scalar_type(), true))); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( grad_output.scalar_type(), "split_embedding_grad_indice_weights_cpu_outer", [&] { using grad_t = scalar_t; - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( weights.scalar_type(), "split_embedding_grad_indice_weights_cpu", [&] { diff --git a/fbgemm_gpu/src/jagged_tensor_ops/common.cuh b/fbgemm_gpu/src/jagged_tensor_ops/common.cuh index 1134b85cc5..8a9f529a9b 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/common.cuh +++ b/fbgemm_gpu/src/jagged_tensor_ops/common.cuh @@ -24,6 +24,7 @@ #include "fbgemm_gpu/cub_namespace_postfix.cuh" // clang-format on +#include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/fbgemm_cuda_utils.cuh" #include "fbgemm_gpu/fbgemm_tensor_accessor.h" #include "fbgemm_gpu/ops_utils.h" diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index 852a37b2b9..f73cded5f3 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -13,6 +13,7 @@ #include #include "ATen/Parallel.h" +#include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/sparse_ops.h" #include "fbgemm_gpu/sparse_ops_utils.h" diff --git a/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu b/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu index e9efb1e988..89948c418a 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu @@ -298,7 +298,7 @@ class KeyedJaggedIndexSelectDim1GPUOp "keyed_jagged_index_select_dim1_warpper_3", [&] { if (weights.has_value()) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( weights.value().scalar_type(), "keyed_jagged_index_select_dim1_warpper_4", [&] { diff --git a/fbgemm_gpu/src/layout_transform_ops.cu b/fbgemm_gpu/src/layout_transform_ops.cu index 9f75b23a91..3b76e5b32c 100644 --- 
a/fbgemm_gpu/src/layout_transform_ops.cu +++ b/fbgemm_gpu/src/layout_transform_ops.cu @@ -9,6 +9,7 @@ // clang-format off #include "fbgemm_gpu/cub_namespace_prefix.cuh" #include +#include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/cub_namespace_postfix.cuh" // clang-format on @@ -49,7 +50,7 @@ Tensor recat_embedding_grad_output_cuda( Tensor sharded_grad_output = at::empty({grad_output.numel()}, grad_output.options()); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( grad_output.scalar_type(), "recat_embedding_gradients", [&] { const auto go = grad_output.accessor(); auto sgo = sharded_grad_output.accessor(); @@ -93,7 +94,7 @@ Tensor recat_embedding_grad_output_mixed_D_cuda( Tensor sharded_grad_output = at::empty({grad_output.numel()}, grad_output.options()); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( grad_output.scalar_type(), "recat_embedding_gradients", [&] { const auto go = grad_output.accessor(); auto sgo = sharded_grad_output.accessor(); @@ -145,7 +146,7 @@ Tensor recat_embedding_grad_output_mixed_D_batch_cuda( const dim3 blocks(fbgemm_gpu::div_round_up( (B_local * dim_num), fbgemm_gpu::kMaxThreads / fbgemm_gpu::kWarpSize)); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( grad_output.scalar_type(), "recat_embedding_gradients", [&] { recat_copy_async_kernel <<>>( diff --git a/fbgemm_gpu/src/layout_transform_ops_cpu.cpp b/fbgemm_gpu/src/layout_transform_ops_cpu.cpp index 22fb882b48..8adda99dd9 100644 --- a/fbgemm_gpu/src/layout_transform_ops_cpu.cpp +++ b/fbgemm_gpu/src/layout_transform_ops_cpu.cpp @@ -10,6 +10,7 @@ #include #include #include "ATen/Parallel.h" +#include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; @@ -37,7 +38,7 @@ Tensor recat_embedding_grad_output_mixed_D_cpu( const auto global_dim_sum = accum_dim_sum[n]; TORCH_CHECK(B_local * global_dim_sum == grad_output.numel()); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( grad_output.scalar_type(), "recat_embedding_gradients", [&] { const auto go = grad_output.accessor(); auto sgo = sharded_grad_output.accessor(); diff --git a/fbgemm_gpu/src/metric_ops.cu b/fbgemm_gpu/src/metric_ops.cu index 58f9ad79fe..6bb867be9a 100644 --- a/fbgemm_gpu/src/metric_ops.cu +++ b/fbgemm_gpu/src/metric_ops.cu @@ -13,6 +13,7 @@ #include #include +#include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/fbgemm_cuda_utils.cuh" #include "metric_ops.h" @@ -276,7 +277,7 @@ at::Tensor batch_auc( AT_DISPATCH_ALL_TYPES_AND( at::ScalarType::Half, labels.scalar_type(), "auc_wrapper_2", [&] { using label_t = scalar_t; - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + FBGEMM_DISPATCH_FLOAT_AND_HALF( weights.scalar_type(), "auc_wrapper_3", [&] { using acc_t = at::acc_type; if (padded_section_size == 1) { From 067698c949793d57a73d730b6f71527cd36c5735 Mon Sep 17 00:00:00 2001 From: Sungmin Cho Date: Fri, 8 Sep 2023 03:27:40 -0700 Subject: [PATCH 12/94] uvm_cache_stats for direct mapped (#1952) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1952 - Implement python frontend for the previous diff (split for backward compatibility) - Revise the existing benchmark to use the uvm_cache_stats for stats instead of cache_miss_counter. - Implement unit test for uvm_cache_stats for direct mapped. 
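For reference, a hedged sketch of how a caller might decode the accumulated stats tensor (the helper name is made up; the field order mirrors the benchmark and unit-test changes below):

```python
from typing import Dict

import torch

def summarize_uvm_cache_stats(stats: torch.Tensor) -> Dict[str, int]:
    # Field order follows uvm_cache_stats as read back in the benchmark below.
    names = (
        "num_calls",
        "num_requested_indices",
        "num_unique_indices",          # stays 0 for direct mapped (N/A)
        "num_unique_misses",           # stays 0 for direct mapped (N/A)
        "num_conflict_unique_misses",  # rows actually inserted into the cache
        "num_conflict_misses",
    )
    return {name: int(v) for name, v in zip(names, stats.cpu().tolist())}

# Usage (hypothetical): summarize_uvm_cache_stats(emb.get_uvm_cache_stats()),
# where `emb` is an IntNBitTableBatchedEmbeddingBagsCodegen constructed with
# gather_uvm_cache_stats=True and cache_assoc=1.
```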
Reviewed By: doehyun Differential Revision: D48439568 fbshipit-source-id: cc3b36402e8038b44c83d1e701004b68129563d4 --- ...plit_table_batched_embeddings_benchmark.py | 33 +++-- ..._table_batched_embeddings_ops_inference.py | 24 +++- .../split_table_batched_embeddings_test.py | 123 ++++++++++++++++++ 3 files changed, 162 insertions(+), 18 deletions(-) diff --git a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py index 39d056c566..ec4d265535 100644 --- a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py @@ -1727,6 +1727,7 @@ def nbit_uvm( @click.option("--fp8-exponent-bits", type=int, default=None) @click.option("--fp8-exponent-bias", type=int, default=None) @click.option("--record-cache", is_flag=True, default=False) +@click.option("--uvm-host-mapped", is_flag=True, default=False) @click.option( "--dump-requests", type=int, default=0, help="number of reqs to dump (0=no dump)" ) @@ -1753,6 +1754,7 @@ def nbit_uvm_compare_direct_mapped( fp8_exponent_bits: Optional[int], fp8_exponent_bias: Optional[int], record_cache: bool, + uvm_host_mapped: bool, dump_requests: int, ) -> None: logging.info(json.dumps({k: str(v) for k, v in locals().items()}, indent=2)) @@ -1837,18 +1839,21 @@ def bench_uvm_cls( enforce_hbm=enforce_hbm, fp8_exponent_bits=fp8_exponent_bits, fp8_exponent_bias=fp8_exponent_bias, - record_cache_metrics=RecordCacheMetrics(record_cache, record_cache), + gather_uvm_cache_stats=record_cache, + uvm_host_mapped=uvm_host_mapped, ).cuda() emb.fill_random_weights() - # label nvtx only when cache counter is off - nvtx_range = "" if record_cache else f"UVM-{name.upper()}" - callback_after_warmup = emb.reset_cache_miss_counter if record_cache else None - requests = requests_uvm[:1] if record_cache else requests_uvm + nvtx_range = ( + f"UVM-RECORD-CACHE-{name.upper()}" + if record_cache + else f"UVM-{name.upper()}" + ) + callback_after_warmup = emb.reset_uvm_cache_stats if record_cache else None torch.cuda.cudart().cudaProfilerStart() time_per_iter = benchmark_requests( - requests, + requests_uvm, lambda indices, offsets, per_sample_weights: emb.forward( indices.int(), offsets.int(), @@ -1881,12 +1886,14 @@ def bench_uvm_cls( ) if record_cache: - cmc = emb.cache_miss_counter.detach().cpu().numpy().tolist() + ucs = emb.uvm_cache_stats.detach().cpu().numpy().tolist() cache_stats = { - "miss_forward_count": cmc[0], - "unique_miss": cmc[1], - "unique_req": cmc[2], - "nondedup_req": cmc[3], + "num_calls": ucs[0], + "num_requested_indices": ucs[1], + "num_unique_indices": ucs[2], + "num_unique_misses": ucs[3], + "num_conflict_unique_misses": ucs[4], + "num_conflict_misses": ucs[5], } stats[name]["cache_stats"] = cache_stats logging.info(f"[{name:>8s}] cache stats {cache_stats}") @@ -1932,6 +1939,7 @@ def bench_uvm_cls( @click.option("--batch-size", default=512) @click.option("--cache-algorithm", default="lru") @click.option("--cache-load-factor", default=0.2) +@click.option("--cache-assoc", default=32) @click.option("--embedding-dim", default=128) @click.option("--weights-precision", type=SparseType, default=SparseType.INT4) @click.option("--iters", default=100) @@ -1954,6 +1962,7 @@ def nbit_cache( # noqa C901 batch_size: int, cache_algorithm: str, cache_load_factor: float, + cache_assoc: int, embedding_dim: int, weights_precision: SparseType, iters: int, @@ -2003,6 +2012,7 @@ def nbit_cache( # noqa C901 enforce_hbm=enforce_hbm, 
fp8_exponent_bits=fp8_exponent_bits, fp8_exponent_bias=fp8_exponent_bias, + cache_assoc=cache_assoc, ).cuda() emb_nc.fill_random_weights() @@ -2027,6 +2037,7 @@ def nbit_cache( # noqa C901 enforce_hbm=enforce_hbm, fp8_exponent_bits=fp8_exponent_bits, fp8_exponent_bias=fp8_exponent_bias, + cache_assoc=cache_assoc, ).cuda() emb.fill_random_weights() diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py index eeb6bc749b..0fb8188553 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py @@ -594,13 +594,7 @@ def prefetch_32way(self, linear_cache_indices: Tensor) -> None: ) ) if self.gather_uvm_cache_stats: - # Accumulate local_uvm_cache_stats (int32) into uvm_cache_stats (int64). - # We may wanna do this accumulation atomically, but as it's only for monitoring, - # slightly inaccurate result may be acceptable. - self.uvm_cache_stats = torch.add( - self.uvm_cache_stats, self.local_uvm_cache_stats - ) - self.local_uvm_cache_stats.zero_() + self._accumulate_uvm_cache_stats() def prefetch_1way(self, linear_cache_indices: Tensor) -> None: if self.cache_algorithm == CacheAlgorithm.LRU: @@ -618,6 +612,9 @@ def prefetch_1way(self, linear_cache_indices: Tensor) -> None: self.timestep_counter.get(), self.lxu_state, self.lxu_cache_miss_timestamp, + 16, # row_alignment; using default value. + self.gather_uvm_cache_stats, + self.local_uvm_cache_stats, ) else: raise ValueError("Direct Mapped for LRU only") @@ -630,8 +627,21 @@ def prefetch_1way(self, linear_cache_indices: Tensor) -> None: linear_cache_indices, self.lxu_cache_state, self.total_cache_hash_size, + self.gather_uvm_cache_stats, + self.local_uvm_cache_stats, ) ) + if self.gather_uvm_cache_stats: + self._accumulate_uvm_cache_stats() + + def _accumulate_uvm_cache_stats(self) -> None: + # Accumulate local_uvm_cache_stats (int32) into uvm_cache_stats (int64). + # We may wanna do this accumulation atomically, but as it's only for monitoring, + # slightly inaccurate result may be acceptable. 
+ self.uvm_cache_stats = torch.add( + self.uvm_cache_stats, self.local_uvm_cache_stats + ) + self.local_uvm_cache_stats.zero_() def _update_cache_miss_counter( self, diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index c05d7f85fe..ddce4c0ea5 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -5575,6 +5575,129 @@ def test_nbit_uvm_cache_stats(self, N: int, dtype: SparseType) -> None: self.assertEqual(num_conflict_miss, e[1]) cc1.reset_uvm_cache_stats() + @unittest.skipIf(*gpu_unavailable) + @given( + N=st.integers(min_value=1, max_value=8), + dtype=st.sampled_from([SparseType.INT8, SparseType.INT4, SparseType.INT2]), + ) + @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + def test_nbit_direct_mapped_uvm_cache_stats( + self, N: int, dtype: SparseType + ) -> None: + # Create an abstract split table + D = 8 + T = 2 + E = 10**3 + Ds = [D] * T + Es = [E] * T + cc = IntNBitTableBatchedEmbeddingBagsCodegen( + embedding_specs=[ + ( + "", + E, + D, + dtype, + EmbeddingLocation.MANAGED_CACHING, + ) + for (E, D) in zip(Es, Ds) + ], + device=torch.cuda.current_device(), + gather_uvm_cache_stats=True, + cache_assoc=1, # Direct Mapped + ) + cc.fill_random_weights() + + # Create fake input data and the target output + x1 = torch.Tensor([[[1], [1]], [[3], [4]]]).cuda() + x2 = torch.Tensor([[[2], [1]], [[3], [4]]]).cuda() + x3 = torch.Tensor([[[5], [6]], [[7], [8]]]).cuda() + + xs = [x1, x2, x3] + # num_unique_indices, num_unique_misses + # note that these are cumulative over calls; and also "unique" is per batch. + target_counter_list = [[3, 3], [4, 4], [4, 8]] + num_calls_expected = 0 + num_indices_expcted = 0 + num_unique_indices_expected = 0 + for x, t_counter in zip(xs, target_counter_list): + (indices, offsets) = get_table_batched_offsets_from_dense(x, use_cpu=False) + for _ in range(N): + num_calls_expected = num_calls_expected + 1 + num_indices_expcted = num_indices_expcted + len(indices) + cc(indices.int(), offsets.int()) + ( + num_calls, + num_indices, + num_unique_indices, + num_unique_misses, + num_conflict_unique_miss, + num_conflict_miss, + ) = cc.get_uvm_cache_stats().cpu() + # Note num_unique_indices is cumulative stats. + num_unique_indices_expected = num_unique_indices_expected + t_counter[0] + self.assertEqual(num_calls, num_calls_expected) + self.assertEqual(num_indices, num_indices_expcted) + self.assertEqual(num_unique_indices, 0) # N/A for Direct Mapped + self.assertEqual(num_unique_misses, 0) # N/A for Direct Mapped + self.assertEqual( + num_conflict_unique_miss, t_counter[1] + ) # number of actually inserted rows for Direct Mapped + self.assertEqual(num_conflict_miss, 0) + + T = 1 # for simplicity + Ds = [D] * T + Es = [E] * T + cc1 = IntNBitTableBatchedEmbeddingBagsCodegen( + embedding_specs=[ + ( + "", + E, + D, + SparseType.INT8, + EmbeddingLocation.MANAGED_CACHING, + ) + for (E, D) in zip(Es, Ds) + ], + device=torch.cuda.current_device(), + gather_uvm_cache_stats=True, + cache_sets=1, # Only one set. 
+ cache_assoc=1, # Direct Mapped + ) + cc1.fill_random_weights() + + associativty = 1 # Direct-Mapped + repetition = 17 + indices1 = torch.Tensor( + [[list(range(0, associativty))] * repetition] + ).cuda() # no conflict miss + indices2 = torch.Tensor( + [[list(range(0, associativty + 1))] * repetition] + ).cuda() # 1 * 17 conflict miss per request + indices3 = torch.Tensor( + [[list(range(0, associativty + 10))] * repetition] + ).cuda() # 10 * 17 conflict misses per request + + # num_conflict_unique_miss, num_conflict_miss + expected = [[1, 0], [1, 17], [1, 170]] + + accum_num_conflict_miss = 0 + for x, e in zip((indices1, indices2, indices3), expected): + (indices, offsets) = get_table_batched_offsets_from_dense(x, use_cpu=False) + for _ in range(N): + cc1(indices.int(), offsets.int()) + ( + _, + _, + _, + _, + num_conflict_unique_miss, + num_conflict_miss, + ) = cc1.get_uvm_cache_stats().cpu() + # for DM this represents number of actually inserted rows + self.assertEqual(num_conflict_unique_miss, e[0]) + accum_num_conflict_miss += e[1] + self.assertEqual(num_conflict_miss, accum_num_conflict_miss) + @given( T=st.integers(min_value=1, max_value=64), B=st.integers(min_value=1, max_value=64), From 117bc3e344b56a4bcbceb02609f145afd1028c69 Mon Sep 17 00:00:00 2001 From: Abdul Zainul-Abedin Date: Fri, 8 Sep 2023 07:58:44 -0700 Subject: [PATCH 13/94] Add permute_duplicate_pooled_embeddings op for CPU (#1939) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1939 This diff builds ontop of the pervious diff and adds support for permute_duplicate_pooled_embeddings for CPU. # Background Currently permute_pooled_embs_gpu does not support duplicates in a permutation, this poses a problem with passing the same embeddings to multiple modules. This doc proposes a solution to allow duplicate subsets in the resultant permutation. # Details The required implementation of permute_duplicate_pooled_embs_gpu should support a subset being repeated. This is represented by having duplicates in the permute list. This also results in the output list size being greater than the input list. 
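As a hedged sketch, the CPU path added in this diff could be driven roughly as follows; the offsets and permute list mirror the worked Input/Offset_dims/Permute example listed just below, while the inverse lists are illustrative placeholders (only the offsets and permute list drive the forward output in the CPU path):

```python
# Hedged sketch: the op name comes from the registrations added in this diff;
# the tensor values and inverse lists are illustrative only.
import torch
import fbgemm_gpu  # noqa: F401  # loads the fbgemm operator library

pooled = torch.arange(10, dtype=torch.float32).unsqueeze(0)  # [B=1][sum(D)=10]
offset_dims = torch.tensor([0, 2, 5, 6, 10])   # 4 groups of widths 2, 3, 1, 4
permute = torch.tensor([3, 0, 2, 1, 3])        # group 3 requested twice
inv_offset_dims = torch.tensor([0, 4, 6, 7, 10, 14])  # placeholder (output offsets)
inv_permute = torch.tensor([1, 3, 2, 0])              # placeholder

out = torch.ops.fbgemm.permute_duplicate_pooled_embs(
    pooled, offset_dims, permute, inv_offset_dims, inv_permute
)
assert out.shape[1] == 14  # wider than the 10 input columns: group 3 appears twice
```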
Input: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] Offset_dims: [0, 2, 5, 6, 10] Permute: [3, 0, 2, 1, 3] Output: [6, 7, 8, 9, 0, 1, 5, 2, 3, 4, 6, 7, 8, 9] Reviewed By: sryap Differential Revision: D48305145 fbshipit-source-id: e308338f105eb95f8066f554b8143dfb4524c7e6 --- .../fbgemm_gpu/permute_pooled_embedding_ops.h | 16 +++++ .../permute_pooled_embedding_ops_gpu.cpp | 66 +++++++++++++++++-- .../test/permute_pooled_embedding_test.py | 13 ++++ 3 files changed, 90 insertions(+), 5 deletions(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h b/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h index 4cff655b0f..e62c4105e4 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h @@ -11,6 +11,22 @@ #include namespace fbgemm_gpu { + +at::Tensor permute_pooled_embs_cpu_impl( + const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const at::Tensor& offset_dim_list, + const at::Tensor& permute_list, + const at::Tensor& inv_offset_dim_list, + const at::Tensor& inv_permute_list, + const bool& allow_duplicates); + +at::Tensor permute_duplicate_pooled_embs_cpu( + const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const at::Tensor& offset_dim_list, + const at::Tensor& permute_list, + const at::Tensor& inv_offset_dim_list, + const at::Tensor& inv_permute_list); + at::Tensor permute_pooled_embs_cpu( const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] const at::Tensor& offset_dim_list, diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp index 98eab9d698..f0a02b9a4c 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp @@ -22,13 +22,14 @@ using Tensor = at::Tensor; namespace fbgemm_gpu { -///@ingroup permute-pooled-embs-cpu -Tensor permute_pooled_embs_cpu( +///@ingroup permute-pooled-embs-cpu-impl +Tensor permute_pooled_embs_cpu_impl( const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] const Tensor& offset_dim_list, const Tensor& permute_list, const Tensor& inv_offset_dim_list, - const Tensor& inv_permute_list) { + const Tensor& inv_permute_list, + const bool& allow_duplicates) { TORCH_CHECK( offset_dim_list.scalar_type() == at::ScalarType::Long, "offset_dim_list needs to have long/int64 type") @@ -37,9 +38,10 @@ Tensor permute_pooled_embs_cpu( "permute_list needs to have long/int64 type") auto permute = permute_list.data_ptr(); const auto n = permute_list.numel(); + const auto dims_size = allow_duplicates ? 
offset_dim_list.numel() : n; std::vector dims; - dims.reserve(n - 1); - for (const auto i : c10::irange(1, n)) { + dims.reserve(dims_size - 1); + for (const auto i : c10::irange(1, dims_size)) { dims.push_back(offset_dim_list[i].item()); } auto ts = pooled_embs.tensor_split(dims, 1); @@ -51,6 +53,38 @@ Tensor permute_pooled_embs_cpu( return at::cat(permuted_ts, 1); } +///@ingroup permute-pooled-embs-cpu +at::Tensor permute_pooled_embs_cpu( + const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const at::Tensor& offset_dim_list, + const at::Tensor& permute_list, + const at::Tensor& inv_offset_dim_list, + const at::Tensor& inv_permute_list) { + return permute_pooled_embs_cpu_impl( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list, + false); +} + +///@ingroup permute-duplicate-pooled-embs-cpu +at::Tensor permute_duplicate_pooled_embs_cpu( + const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const at::Tensor& offset_dim_list, + const at::Tensor& permute_list, + const at::Tensor& inv_offset_dim_list, + const at::Tensor& inv_permute_list) { + return permute_pooled_embs_cpu_impl( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list, + true); +} + using torch::autograd::AutogradContext; using torch::autograd::Variable; using torch::autograd::variable_list; @@ -201,6 +235,22 @@ Tensor permute_duplicate_pooled_embs_auto_grad_gpu( inv_permute_list, true); } + +///@ingroup permute-duplicate-pooled-embs-cpu +Tensor permute_duplicate_pooled_embs_auto_grad_cpu( + const Tensor& pooled_embs, + const Tensor& offset_dim_list, + const Tensor& permute_list, + const Tensor& inv_offset_dim_list, + const Tensor& inv_permute_list) { + return PermutePooledEmbsFunction::apply( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list, + true); +} } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { @@ -228,9 +278,15 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "permute_duplicate_pooled_embs", fbgemm_gpu::permute_duplicate_pooled_embs_gpu); + DISPATCH_TO_CPU( + "permute_duplicate_pooled_embs", + fbgemm_gpu::permute_duplicate_pooled_embs_cpu); m.def( "permute_duplicate_pooled_embs_auto_grad(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); DISPATCH_TO_CUDA( "permute_duplicate_pooled_embs_auto_grad", fbgemm_gpu::permute_duplicate_pooled_embs_auto_grad_gpu); + DISPATCH_TO_CPU( + "permute_duplicate_pooled_embs_auto_grad", + fbgemm_gpu::permute_duplicate_pooled_embs_auto_grad_cpu); } diff --git a/fbgemm_gpu/test/permute_pooled_embedding_test.py b/fbgemm_gpu/test/permute_pooled_embedding_test.py index 53fab069a5..67fef04edd 100644 --- a/fbgemm_gpu/test/permute_pooled_embedding_test.py +++ b/fbgemm_gpu/test/permute_pooled_embedding_test.py @@ -214,6 +214,19 @@ def test_duplicate_permutations(self) -> None: expected_result, ) + input = input.to(device="cpu") + result = torch.ops.fbgemm.permute_duplicate_pooled_embs_auto_grad( + input, + _offset_dim_list.to(device=input.device), + _permute.to(device=input.device), + _inv_offset_dim_list.to(device=input.device), + _inv_permute.to(device=input.device), + ) + self.assertEqual( + result.view(16).tolist(), + expected_result, + ) + if __name__ == "__main__": unittest.main() From 6c163ff4da74d3647aeba348eadf36dd94870b5b Mon Sep 17 00:00:00 2001 From: Xinfeng Xie Date: Fri, 8 Sep 2023 10:28:00 -0700 Subject: [PATCH 14/94] Fix invalid CUDA 
configuration error for the empty input (#1993) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1993 Fix invalid CUDA configuration error for the empty input KJT. When the input KJT is an empty Tensor, the total number of indices is zero. In this case, the jagged_unique_indices op launches 0 thread blocks for the `delinearize_unique_index` (div_round_up(total_indices, kMaxThreads) is 0), which results in CUDA errors. Reviewed By: sryap Differential Revision: D48978631 fbshipit-source-id: f21da7005a7b66427a9e8a607be244d8c2ac5583 --- .../jagged_unique_indices.cu | 2 +- fbgemm_gpu/test/jagged_tensor_ops_test.py | 42 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_unique_indices.cu b/fbgemm_gpu/src/jagged_tensor_ops/jagged_unique_indices.cu index e7df266eb8..54934a6cb0 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_unique_indices.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_unique_indices.cu @@ -171,7 +171,7 @@ std::tuple jagged_unique_indices_cuda( const auto delinearize_unique_index_kernel_ = delinearize_unique_index_kernel; delinearize_unique_index_kernel_<<< - div_round_up(total_indices, kMaxThreads), + div_round_up(total_indices + 1, kMaxThreads), kMaxThreads, 0, at::cuda::getCurrentCUDAStream()>>>( diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index f8cd1cd6ff..6ea981c12a 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -2750,6 +2750,48 @@ def test_jagged_unique_indices_multi_keys( pos = reverse_index_list[i] self.assertTrue(unique_indices_list[pos] == indices_list[i]) + @unittest.skipIf(*gpu_unavailable) + @given( + B=st.integers(min_value=100, max_value=200), + F=st.integers(min_value=50, max_value=100), + ) + @settings(verbosity=Verbosity.verbose, max_examples=2, deadline=None) + def test_jagged_unique_indices_empty( + self, + B: int, # Batch size + F: int, # The number of features + ) -> None: + hash_size_cumsum_list = [0] + list(itertools.accumulate([10] * F)) + hash_size_offsets_list = [0] + list(itertools.accumulate([1] * F)) + offsets_list = [0] * (B * F + 1) + indices_list = [] + + device = torch.device("cuda") + dtype = torch.int64 + hash_size_cumsum = torch.as_tensor( + hash_size_cumsum_list, device=device, dtype=dtype + ) + hash_size_offsets = torch.as_tensor( + hash_size_offsets_list, device=device, dtype=dtype + ) + offsets = torch.as_tensor(offsets_list, device=device, dtype=dtype) + indices = torch.as_tensor(indices_list, device=device, dtype=dtype) + + ( + output_lengths, + output_offsets, + unique_indices, + reverse_index, + ) = torch.ops.fbgemm.jagged_unique_indices( + hash_size_cumsum, hash_size_offsets, offsets, indices + ) + + # The output should be empty since there are no input indices + self.assertEqual(unique_indices.numel(), 0) + self.assertEqual(reverse_index.numel(), 0) + self.assertEqual(torch.sum(output_lengths).item(), 0) + self.assertEqual(torch.sum(output_offsets).item(), 0) + if __name__ == "__main__": unittest.main() From 6822fcae7b78db250ffb199c941c3392dd31a187 Mon Sep 17 00:00:00 2001 From: Kimble Houck Date: Fri, 8 Sep 2023 15:20:46 -0700 Subject: [PATCH 15/94] Modified TBE testbench to use FBGEMM generate_rquests function togenerate indices and offsets (#1882) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1882 Reviewed By: archishman Differential Revision: D47448915 fbshipit-source-id: 
2e98a3ebb4e3a42c3e41be53142fcbac3734dada --- .../fbgemm_gpu/split_embedding_utils.py | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_embedding_utils.py b/fbgemm_gpu/fbgemm_gpu/split_embedding_utils.py index dad47615e5..782740ff3b 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_embedding_utils.py +++ b/fbgemm_gpu/fbgemm_gpu/split_embedding_utils.py @@ -112,7 +112,11 @@ def generate_requests( # noqa C901 sigma_L: Optional[int] = None, emulate_pruning: bool = False, use_cpu: bool = False, + deterministic_output: bool = False, # generate_requests uses numpy.random.default_rng without a set random seed be default, causing the indices tensor to vary with each call to generate_requests - set generate_repeatable_output to use a fixed random seed instead for repeatable outputs + length_dist: str = "normal", # distribution of embedding sequence lengths ) -> List[Tuple[torch.IntTensor, torch.IntTensor, Optional[torch.Tensor]]]: + # TODO: refactor and split into helper functions to separate load from file, + # generate from distribution, and other future methods of generating data if requests_data_file is not None: indices_tensor, offsets_tensor, lengths_tensor = torch.load(requests_data_file) @@ -181,7 +185,24 @@ def generate_requests( # noqa C901 # Generate L from stats if sigma_L is not None: use_variable_L = True - Ls = np.random.normal(loc=L, scale=sigma_L, size=T * B).astype(int) + if length_dist == "uniform": + # TODO: either make these separate parameters or make a separate version of + # generate_requests to handle the uniform dist case once whole + # generate_requests function is refactored to split into helper functions + # for each use case. + # L represents the lower bound when the uniform distribution is used + lower_bound = L + # sigma_L represetns the upper bound when the uniform distribution is used + upper_bound = sigma_L + 1 + Ls = np.random.randint( + lower_bound, + upper_bound, + (T, B), + dtype=np.int32, + ) + else: # normal dist + Ls = np.random.normal(loc=L, scale=sigma_L, size=T * B).astype(int) + # Make sure that Ls are positive Ls[Ls < 0] = 0 # Use the same L distribution across iters @@ -240,7 +261,10 @@ def generate_requests( # noqa C901 all_indices = torch.ops.fbgemm.bottom_k_per_row( all_indices, torch.tensor([0, L], dtype=torch.long), True ) - rng = default_rng() + if deterministic_output: + rng = default_rng(12345) + else: + rng = default_rng() permutation = torch.as_tensor( rng.choice(E, size=all_indices.max().item() + 1, replace=False) ) From 4decf7b316842a697908a2791decc6c5ca4bd1e2 Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Fri, 8 Sep 2023 16:01:00 -0700 Subject: [PATCH 16/94] Fix nightly publishing logic (#2007) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2007 Fix nightly publishing logic. Currently it attempts to publish all versions when for the scheduled run. https://github.com/pytorch/FBGEMM/actions/runs/6121960197/job/16617350408 Reviewed By: sryap Differential Revision: D49106347 fbshipit-source-id: 29ecd25f1f2a1835941c59123cf328e48ef806ba --- .github/workflows/fbgemm_gpu_cuda_nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml index c3aeef8ba8..30a87c49b7 100644 --- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -184,7 +184,7 @@ jobs: run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV - name: Push FBGEMM_GPU Nightly Binary to PYPI - if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }} + if: ${{ github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly-*.whl "$PYPI_TOKEN" From b5bc22922d27ad65df434cd0bebf08b2795fbf3b Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Fri, 8 Sep 2023 16:48:51 -0700 Subject: [PATCH 17/94] Add CUDA artifact selection on publish (#2005) Summary: - Add CUDA version selection on artifact publishing - Add new workflow file for testing installations of FBGEMM_GPU through pip Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2005 Reviewed By: shintaro-iwasaki Differential Revision: D49084679 Pulled By: q10 fbshipit-source-id: 2406734aebef9d998e06c733bdb7377ea30bcd61 --- .github/scripts/fbgemm_gpu_build.bash | 32 --- .github/scripts/fbgemm_gpu_install.bash | 134 ++++++++++++ .github/scripts/fbgemm_gpu_test.bash | 4 +- .github/scripts/setup_env.bash | 2 + .github/scripts/utils_pytorch.bash | 8 +- .github/workflows/fbgemm_gpu_ci.yml | 8 +- .github/workflows/fbgemm_gpu_cpu_nightly.yml | 2 +- .github/workflows/fbgemm_gpu_cpu_release.yml | 2 +- .github/workflows/fbgemm_gpu_cuda_nightly.yml | 2 +- .github/workflows/fbgemm_gpu_cuda_release.yml | 12 +- .github/workflows/fbgemm_gpu_pip.yml | 194 ++++++++++++++++++ 11 files changed, 350 insertions(+), 50 deletions(-) create mode 100644 .github/scripts/fbgemm_gpu_install.bash create mode 100644 .github/workflows/fbgemm_gpu_pip.yml diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 9c2de49d6e..1a0978bdb9 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -402,35 +402,3 @@ build_fbgemm_gpu_develop () { echo "[BUILD] FBGEMM-GPU build + develop completed" } - -install_fbgemm_gpu_package () { - local env_name="$1" - local package_name="$2" - if [ "$package_name" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME WHEEL_NAME" - echo "Example(s):" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu.whl # Install the package (wheel)" - return 1 - else - echo "################################################################################" - echo "# Install FBGEMM-GPU Package (Wheel)" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - fi - - echo "[INSTALL] Printing out FBGEMM-GPU wheel SHA: ${package_name}" - print_exec sha1sum "${package_name}" - print_exec sha256sum "${package_name}" - print_exec md5sum "${package_name}" - - echo "[INSTALL] Installing FBGEMM-GPU wheel: ${package_name} ..." - (exec_with_retries conda run -n "${env_name}" python -m pip install "${package_name}") || return 1 - - echo "[INSTALL] Checking imports ..." - (test_python_import "${env_name}" fbgemm_gpu) || return 1 - (test_python_import "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 - - echo "[INSTALL] Wheel installation completed ..." 
-} diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash new file mode 100644 index 0000000000..9793bcf34c --- /dev/null +++ b/.github/scripts/fbgemm_gpu_install.bash @@ -0,0 +1,134 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +# shellcheck disable=SC1091,SC2128 +. "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash" + +################################################################################ +# FBGEMM_GPU Install Functions +################################################################################ + +install_fbgemm_gpu_wheel () { + local env_name="$1" + local wheel_path="$2" + if [ "$wheel_path" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME WHEEL_NAME" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu.whl # Install the package (wheel)" + return 1 + else + echo "################################################################################" + echo "# Install FBGEMM-GPU from Wheel" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + echo "[INSTALL] Printing out FBGEMM-GPU wheel SHA: ${wheel_path}" + print_exec sha1sum "${wheel_path}" + print_exec sha256sum "${wheel_path}" + print_exec md5sum "${wheel_path}" + + echo "[INSTALL] Installing FBGEMM-GPU wheel: ${wheel_path} ..." + (exec_with_retries conda run -n "${env_name}" python -m pip install "${wheel_path}") || return 1 + + echo "[INSTALL] Checking imports ..." + (test_python_import "${env_name}" fbgemm_gpu) || return 1 + (test_python_import "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 + + echo "[INSTALL] FBGEMM-GPU installation through wheel completed ..." +} + + + +install_fbgemm_gpu_pip () { + local env_name="$1" + local fbgemm_gpu_version="$2" + local fbgemm_gpu_variant_type="$3" + local fbgemm_gpu_variant_version="$4" + if [ "$fbgemm_gpu_variant_type" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME FBGEMM_GPU_VERSION FBGEMM_GPU_VARIANT_TYPE [FBGEMM_GPU_VARIANT_VERSION]" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env 0.5.0rc2 cuda 12.1.1 # Install a specific version of the package (PyPI)" + return 1 + else + echo "################################################################################" + echo "# Install FBGEMM-GPU Package from PIP" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + # Set the package variant + if [ "$fbgemm_gpu_variant_type" == "cuda" ]; then + # Extract the CUDA version or default to 11.8.0 + local cuda_version="${fbgemm_gpu_variant_version:-11.8.0}" + # shellcheck disable=SC2206 + local cuda_version_arr=(${cuda_version//./ }) + # Convert, i.e. cuda 11.7.1 => cu117 + local fbgemm_gpu_variant="cu${cuda_version_arr[0]}${cuda_version_arr[1]}" + elif [ "$fbgemm_gpu_variant_type" == "rocm" ]; then + # Extract the ROCM version or default to 5.5.1 + local rocm_version="${fbgemm_gpu_variant_version:-5.5.1}" + # shellcheck disable=SC2206 + local rocm_version_arr=(${rocm_version//./ }) + # Convert, i.e. 
rocm 5.5.1 => rocm5.5 + local fbgemm_gpu_variant="rocm${rocm_version_arr[0]}.${rocm_version_arr[1]}" + else + local fbgemm_gpu_variant_type="cpu" + local fbgemm_gpu_variant="cpu" + fi + echo "[INSTALL] Extracted FBGEMM-GPU variant: ${fbgemm_gpu_variant}" + + # Set the package name and installation channel +# if [ "$fbgemm_gpu_version" == "nightly" ] || [ "$fbgemm_gpu_version" == "test" ]; then +# local fbgemm_gpu_package="--pre fbgemm-gpu" +# local fbgemm_gpu_channel="https://download.pytorch.org/whl/${fbgemm_gpu_version}/${fbgemm_gpu_variant}/" +# elif [ "$fbgemm_gpu_version" == "latest" ]; then +# local fbgemm_gpu_package="fbgemm-gpu" +# local fbgemm_gpu_channel="https://download.pytorch.org/whl/${fbgemm_gpu_variant}/" +# else +# local fbgemm_gpu_package="fbgemm-gpu==${fbgemm_gpu_version}+${fbgemm_gpu_variant}" +# local fbgemm_gpu_channel="https://download.pytorch.org/whl/${fbgemm_gpu_variant}/" +# fi + + if [ "$fbgemm_gpu_variant_type" == "cuda" ]; then + if [ "$fbgemm_gpu_version" == "nightly" ]; then + local fbgemm_gpu_package="fbgemm-gpu-nightly" + elif [ "$fbgemm_gpu_version" == "latest" ]; then + local fbgemm_gpu_package="fbgemm-gpu" + else + local fbgemm_gpu_package="fbgemm-gpu==${fbgemm_gpu_version}" + fi + + elif [ "$fbgemm_gpu_variant_type" == "rocm" ]; then + echo "ROCm is currently not supported in PyPI!" + return 1 + + else + if [ "$fbgemm_gpu_version" == "nightly" ]; then + local fbgemm_gpu_package="fbgemm-gpu-nightly-cpu" + elif [ "$fbgemm_gpu_version" == "latest" ]; then + local fbgemm_gpu_package="fbgemm-gpu-cpu" + else + local fbgemm_gpu_package="fbgemm-gpu-cpu==${fbgemm_gpu_version}" + fi + fi + + echo "[INSTALL] Attempting to install FBGEMM-GPU ${fbgemm_gpu_version}+${fbgemm_gpu_variant} through PIP ..." + # shellcheck disable=SC2086 + (exec_with_retries conda run -n "${env_name}" pip install ${fbgemm_gpu_package}) || return 1 + + echo "[INSTALL] Checking imports ..." + (test_python_import "${env_name}" fbgemm_gpu) || return 1 + (test_python_import "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 + + echo "[INSTALL] FBGEMM-GPU installation through PIP completed ..." +} diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 4e37a1a17f..809c1c6be1 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -170,8 +170,6 @@ test_setup_conda_environment () { else install_pytorch_pip "${env_name}" "${pytorch_version}" "${pytorch_variant_type}" "${pytorch_variant_version}" || return 1 fi - - return "${env_name}" } test_fbgemm_gpu_build_and_install () { @@ -184,7 +182,7 @@ test_fbgemm_gpu_build_and_install () { build_fbgemm_gpu_package "${env_name}" release "${pytorch_variant_type}" || return 1 # shellcheck disable=SC2164 cd - - install_fbgemm_gpu_package "${env_name}" fbgemm_gpu/dist/*.whl || return 1 + install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1 cd fbgemm_gpu/test || return 1 run_fbgemm_gpu_tests "${env_name}" || return 1 diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index ca9e25ae0a..0bc8449c5f 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -22,6 +22,8 @@ # shellcheck disable=SC1091,SC2128 . "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_docs.bash" # shellcheck disable=SC1091,SC2128 +. "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_install.bash" +# shellcheck disable=SC1091,SC2128 . "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_lint.bash" # shellcheck disable=SC1091,SC2128 . 
"$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_test.bash" diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash index 846eb17ea8..5c2f97faba 100644 --- a/.github/scripts/utils_pytorch.bash +++ b/.github/scripts/utils_pytorch.bash @@ -118,15 +118,15 @@ install_pytorch_pip () { # Set the package variant if [ "$pytorch_variant_type" == "cuda" ]; then - # Extract the CUDA version or default to 11.7.1 - local cuda_version="${pytorch_variant_version:-11.7.1}" + # Extract the CUDA version or default to 11.8.0 + local cuda_version="${pytorch_variant_version:-11.8.0}" # shellcheck disable=SC2206 local cuda_version_arr=(${cuda_version//./ }) # Convert, i.e. cuda 11.7.1 => cu117 local pytorch_variant="cu${cuda_version_arr[0]}${cuda_version_arr[1]}" elif [ "$pytorch_variant_type" == "rocm" ]; then - # Extract the ROCM version or default to 5.3 - local rocm_version="${pytorch_variant_version:-5.3}" + # Extract the ROCM version or default to 5.5.1 + local rocm_version="${pytorch_variant_version:-5.5.1}" # shellcheck disable=SC2206 local rocm_version_arr=(${rocm_version//./ }) # Convert, i.e. rocm 5.5.1 => rocm5.5 diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index ee3e46ccfa..a1817a3761 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -88,10 +88,10 @@ jobs: - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Build FBGEMM_GPU-ROCM Nightly + - name: Build FBGEMM_GPU-ROCm Nightly run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm gfx90a - - name: Test FBGEMM_GPU-ROCM Nightly Installation + - name: Test FBGEMM_GPU-ROCm Nightly Installation timeout-minutes: 10 run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm @@ -154,10 +154,10 @@ jobs: - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Build FBGEMM_GPU-ROCM Nightly + - name: Build FBGEMM_GPU-ROCm Nightly run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm - - name: Test FBGEMM_GPU-ROCM Nightly Installation + - name: Test FBGEMM_GPU-ROCm Nightly Installation timeout-minutes: 15 run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm diff --git a/.github/workflows/fbgemm_gpu_cpu_nightly.yml b/.github/workflows/fbgemm_gpu_cpu_nightly.yml index 1f6547848c..2b279bc5ea 100644 --- a/.github/workflows/fbgemm_gpu_cpu_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cpu_nightly.yml @@ -171,7 +171,7 @@ jobs: run: | . $PRELUDE pwd; ls -la . - install_fbgemm_gpu_package $BUILD_ENV *.whl + install_fbgemm_gpu_wheel $BUILD_ENV *.whl - name: Test with PyTest timeout-minutes: 10 diff --git a/.github/workflows/fbgemm_gpu_cpu_release.yml b/.github/workflows/fbgemm_gpu_cpu_release.yml index 42193182a4..6e11f7d022 100644 --- a/.github/workflows/fbgemm_gpu_cpu_release.yml +++ b/.github/workflows/fbgemm_gpu_cpu_release.yml @@ -158,7 +158,7 @@ jobs: run: | . $PRELUDE pwd; ls -la . - install_fbgemm_gpu_package $BUILD_ENV *.whl + install_fbgemm_gpu_wheel $BUILD_ENV *.whl - name: Test with PyTest timeout-minutes: 10 diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml index 30a87c49b7..7dd98f208a 100644 --- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -177,7 +177,7 @@ jobs: run: . 
$PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Install FBGEMM_GPU Nightly - run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl + run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl - name: Test with PyTest timeout-minutes: 10 diff --git a/.github/workflows/fbgemm_gpu_cuda_release.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml index bb4ad8fa67..20c9b188f3 100644 --- a/.github/workflows/fbgemm_gpu_cuda_release.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -27,6 +27,12 @@ on: type: boolean required: false default: false + cuda_version: + description: CUDA Version to Use for PyPI Publishing + type: choice + required: false + options: [ "11.8.0", "12.1.1" ] + default: "11.8.0" concurrency: # Cancel previous runs in the PR if a new commit is pushed @@ -124,8 +130,6 @@ jobs: ] python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.8.0", "12.1.1" ] - # Specify exactly ONE CUDA version for artifact publish - cuda-version-publish: [ "11.8.0" ] needs: build_artifact steps: @@ -164,14 +168,14 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Install FBGEMM_GPU - run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl + run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl - name: Test with PyTest timeout-minutes: 10 run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV - name: Push FBGEMM_GPU Binary to PYPI - if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish }} + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == github.event.inputs.cuda_version }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu-*.whl "$PYPI_TOKEN" diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml new file mode 100644 index 0000000000..d2903e6b99 --- /dev/null +++ b/.github/workflows/fbgemm_gpu_pip.yml @@ -0,0 +1,194 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +name: FBGEMM_GPU PIP Install + Test + +on: + # Manual Trigger + # + workflow_dispatch: + inputs: + fbgemm_gpu_version: + description: FBGEMM-GPU Version (e.g. '0.5.0rc1') + type: string + required: true + fbgemm_gpu_variant_type: + description: FBGEMM-GPU Variant + type: choice + required: true + options: [ "cpu", "cuda", "rocm" ] + default: "cpu" + fbgemm_gpu_variant_version: + description: FBGEMM-GPU Variant Version (e.g. 
'CUDA 12.1.1' --> 12.1.1) + type: string + required: false + + +jobs: + test_pypi_install_cpu: + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cpu' }} + runs-on: ${{ matrix.host-machine.instance }} + container: + image: amazonlinux:2023 + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_ENV: test_install + strategy: + fail-fast: false + matrix: + host-machine: [ + { instance: "linux.4xlarge" }, + { instance: "linux.arm64.2xlarge" }, + ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] + + steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which + + - name: Checkout the Repository + uses: actions/checkout@v3 + + - name: Display System Info + run: . $PRELUDE; print_system_info; print_ec2_info + + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + + - name: Setup Miniconda + run: . $PRELUDE; setup_miniconda $HOME/miniconda + + - name: Create Conda Environment + run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + + - name: Install PyTorch-CPU + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV test cpu + + - name: Install FBGEMM_GPU-CPU + run: . $PRELUDE; cd fbgemm_gpu; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version }} cpu + + - name: Test with PyTest + timeout-minutes: 10 + run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu + + + test_pypi_install_cuda: + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cuda' }} + runs-on: ${{ matrix.host-machine.instance }} + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_ENV: test_install + ENFORCE_NVIDIA_GPU: 1 + strategy: + fail-fast: false + matrix: + host-machine: [ + { instance: "linux.g5.4xlarge.nvidia.gpu" }, + ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] + cuda-version: [ "11.8.0", "12.1.1" ] + # Specify exactly ONE CUDA version for artifact publish + cuda-version-publish: [ "11.8.0" ] + + steps: + - name: Checkout the Repository + uses: actions/checkout@v3 + + - name: Install NVIDIA Drivers and NVIDIA-Docker Runtime + uses: pytorch/test-infra/.github/actions/setup-nvidia@main + + - name: Display System Info + run: . $PRELUDE; print_system_info; print_ec2_info + + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + + - name: Setup Miniconda + run: . $PRELUDE; setup_miniconda $HOME/miniconda + + - name: Create Conda Environment + run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + + - name: Install CUDA + run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }} + + - name: Install PyTorch-CUDA + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }} + + - name: Install FBGEMM_GPU-CUDA + run: . $PRELUDE; cd fbgemm_gpu; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version }} cuda ${{ github.event.inputs.fbgemm_gpu_variant_version }} + + - name: Test with PyTest + timeout-minutes: 10 + run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV + + + test_pypi_install_rocm: + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'rocm' }} + runs-on: ${{ matrix.host-machine.instance }} + container: + image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete" + options: --user root --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_ENV: test_install + ENFORCE_AMD_GPU: 1 + strategy: + fail-fast: false + matrix: + host-machine: [ + { instance: "rocm" }, + ] + # ROCm machines are limited, so we only test against Python 3.10 + python-version: [ "3.10" ] + rocm-version: [ "5.5.1", "5.6" ] + + steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y git wget + git config --global --add safe.directory '*' + + - name: Checkout the Repository + uses: actions/checkout@v3 + + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + + - name: Free Disk Space + run: . $PRELUDE; free_disk_space + + - name: Setup Miniconda + run: . $PRELUDE; setup_miniconda $HOME/miniconda + + - name: Create Conda Environment + run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + + - name: Install Build Tools + run: . $PRELUDE; install_build_tools $BUILD_ENV + + - name: Install PyTorch-ROCm + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} + + - name: Install FBGEMM_GPU-ROCm + run: . $PRELUDE; cd fbgemm_gpu; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version }} rocm ${{ github.event.inputs.fbgemm_gpu_variant_version }} + + - name: Test FBGEMM_GPU-ROCm + timeout-minutes: 15 + run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm From 9c8f89e58d134c989019fba1a278b1977e05fa0e Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Sat, 9 Sep 2023 02:42:32 -0700 Subject: [PATCH 18/94] Fix nightly publishing logic (#2008) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2008 Fix nightly publishing logic. Currently it attempts to publish all versions when for the scheduled run. https://github.com/pytorch/FBGEMM/actions/runs/6121960197/job/16617350408 Reviewed By: q10 Differential Revision: D49117790 fbshipit-source-id: 78a10703cba494d2691b43e3223e94ba3ac99989 --- .github/workflows/fbgemm_gpu_cuda_nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml index 7dd98f208a..a65e9e21fa 100644 --- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -184,7 +184,7 @@ jobs: run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV - name: Push FBGEMM_GPU Nightly Binary to PYPI - if: ${{ github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }} + if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly-*.whl "$PYPI_TOKEN" From 09e0030f2b0daff53f08292a8062c9198d5f0232 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 12 Sep 2023 18:28:51 -0700 Subject: [PATCH 19/94] Auto-generate the version file (#2011) Summary: - Auto-generate the version file in OSS, so that the `__version__` symbol is available in the package Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2011 Reviewed By: shintaro-iwasaki Differential Revision: D49174509 Pulled By: q10 fbshipit-source-id: d0dfd1ab0a2912016ad6e003bf88feeab696fa4a --- .github/scripts/fbgemm_gpu_build.bash | 6 ++--- .github/scripts/fbgemm_gpu_install.bash | 19 +++++++++++----- .github/scripts/fbgemm_gpu_lint.bash | 2 +- .github/scripts/fbgemm_gpu_test.bash | 4 ++-- .github/scripts/setup_env.bash | 6 ++--- .github/scripts/utils_base.bash | 25 ++++++++++++++++++--- .github/scripts/utils_conda.bash | 2 +- .github/scripts/utils_pytorch.bash | 4 ++-- fbgemm_gpu/fbgemm_gpu/__init__.py | 4 ++++ fbgemm_gpu/setup.py | 30 ++++++++++++++++++++++++- 10 files changed, 80 insertions(+), 22 deletions(-) diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 1a0978bdb9..13f8d48b9f 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -41,8 +41,8 @@ prepare_fbgemm_gpu_build () { echo "[BUILD] Installing other build dependencies ..." (exec_with_retries conda run -n "${env_name}" python -m pip install -r requirements.txt) || return 1 - (test_python_import "${env_name}" numpy) || return 1 - (test_python_import "${env_name}" skbuild) || return 1 + (test_python_import_package "${env_name}" numpy) || return 1 + (test_python_import_package "${env_name}" skbuild) || return 1 echo "[BUILD] Successfully ran git submodules update" } @@ -360,7 +360,7 @@ build_fbgemm_gpu_install () { # Exit this directory to prevent import clashing, since there is an # fbgemm_gpu/ subdirectory present cd - || return 1 - (test_python_import "${env_name}" fbgemm_gpu) || return 1 + (test_python_import_package "${env_name}" fbgemm_gpu) || return 1 echo "[BUILD] FBGEMM-GPU build + install completed" } diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash index 9793bcf34c..68ec60226a 100644 --- a/.github/scripts/fbgemm_gpu_install.bash +++ b/.github/scripts/fbgemm_gpu_install.bash @@ -13,6 +13,17 @@ # FBGEMM_GPU Install Functions ################################################################################ +__fbgemm_gpu_post_install_checks () { + echo "[INSTALL] Checking imports and symbols ..." 
+ (test_python_import_package "${env_name}" fbgemm_gpu) || return 1 + (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 + (test_python_import_symbol "${env_name}" fbgemm_gpu __version__) || return 1 + + echo "[CHECK] Printing out the FBGEMM-GPU version ..." + installed_fbgemm_gpu_version=$(conda run -n "${env_name}" python -c "import fbgemm_gpu; print(fbgemm_gpu.__version__)") + echo "[CHECK] The installed version is: ${installed_fbgemm_gpu_version}" +} + install_fbgemm_gpu_wheel () { local env_name="$1" local wheel_path="$2" @@ -38,9 +49,7 @@ install_fbgemm_gpu_wheel () { echo "[INSTALL] Installing FBGEMM-GPU wheel: ${wheel_path} ..." (exec_with_retries conda run -n "${env_name}" python -m pip install "${wheel_path}") || return 1 - echo "[INSTALL] Checking imports ..." - (test_python_import "${env_name}" fbgemm_gpu) || return 1 - (test_python_import "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 + __fbgemm_gpu_post_install_checks || return 1 echo "[INSTALL] FBGEMM-GPU installation through wheel completed ..." } @@ -126,9 +135,7 @@ install_fbgemm_gpu_pip () { # shellcheck disable=SC2086 (exec_with_retries conda run -n "${env_name}" pip install ${fbgemm_gpu_package}) || return 1 - echo "[INSTALL] Checking imports ..." - (test_python_import "${env_name}" fbgemm_gpu) || return 1 - (test_python_import "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 + __fbgemm_gpu_post_install_checks || return 1 echo "[INSTALL] FBGEMM-GPU installation through PIP completed ..." } diff --git a/.github/scripts/fbgemm_gpu_lint.bash b/.github/scripts/fbgemm_gpu_lint.bash index 011c9fc06c..dc239ab261 100644 --- a/.github/scripts/fbgemm_gpu_lint.bash +++ b/.github/scripts/fbgemm_gpu_lint.bash @@ -42,7 +42,7 @@ install_lint_tools () { # Check Python packages are importable local import_tests=( click ) for p in "${import_tests[@]}"; do - (test_python_import "${env_name}" "${p}") || return 1 + (test_python_import_package "${env_name}" "${p}") || return 1 done echo "[INSTALL] Successfully installed all the lint tools" diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 809c1c6be1..575cb54646 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -93,8 +93,8 @@ run_fbgemm_gpu_tests () { print_exec conda install -n "${env_name}" -y pytest echo "[TEST] Checking imports ..." - (test_python_import "${env_name}" fbgemm_gpu) || return 1 - (test_python_import "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 + (test_python_import_package "${env_name}" fbgemm_gpu) || return 1 + (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 echo "[TEST] Enumerating test files ..." print_exec ls -lth ./*.py diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 0bc8449c5f..12477ed1a6 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -179,7 +179,7 @@ install_build_tools () { # Check Python packages are importable local import_tests=( click hypothesis jinja2 numpy skbuild wheel ) for p in "${import_tests[@]}"; do - (test_python_import "${env_name}" "${p}") || return 1 + (test_python_import_package "${env_name}" "${p}") || return 1 done echo "[INSTALL] Successfully installed all the build tools" @@ -212,8 +212,8 @@ publish_to_pypi () { echo "[INSTALL] Installing twine ..." 
print_exec conda install -n "${env_name}" -y twine - (test_python_import "${env_name}" twine) || return 1 - (test_python_import "${env_name}" OpenSSL) || return 1 + (test_python_import_package "${env_name}" twine) || return 1 + (test_python_import_package "${env_name}" OpenSSL) || return 1 echo "[PUBLISH] Uploading package(s) to PyPI: ${package_name} ..." conda run -n "${env_name}" \ diff --git a/.github/scripts/utils_base.bash b/.github/scripts/utils_base.bash index d54b506cbc..9ab95c51e1 100644 --- a/.github/scripts/utils_base.bash +++ b/.github/scripts/utils_base.bash @@ -70,7 +70,26 @@ exec_with_retries () { # Assert Functions ################################################################################ -test_python_import () { +test_python_import_symbol () { + local env_name="$1" + local package_name="$2" + local target_symbol="$3" + if [ "$target_symbol" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME SYMBOL" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env numpy __version__" + return 1 + fi + + if conda run -n "${env_name}" python -c "from ${package_name} import ${target_symbol}"; then + echo "[CHECK] Found symbol '${target_symbol}' in Python package '${package_name}'." + else + echo "[CHECK] Could not find symbol '${target_symbol}' in Python package '${package_name}'; the package might be missing or broken." + return 1 + fi +} + +test_python_import_package () { local env_name="$1" local python_import="$2" if [ "$python_import" == "" ]; then @@ -81,9 +100,9 @@ test_python_import () { fi if conda run -n "${env_name}" python -c "import ${python_import}"; then - echo "[CHECK] Python package ${python_import} found." + echo "[CHECK] Python package '${python_import}' found." else - echo "[CHECK] Python package ${python_import} was not found or is broken!" + echo "[CHECK] Python package '${python_import}' was not found, or the package is broken!" return 1 fi } diff --git a/.github/scripts/utils_conda.bash b/.github/scripts/utils_conda.bash index e022c35db0..970c96b983 100644 --- a/.github/scripts/utils_conda.bash +++ b/.github/scripts/utils_conda.bash @@ -116,7 +116,7 @@ create_conda_environment () { # This test fails with load errors if the pyOpenSSL and cryptography package versions don't align echo "[SETUP] Testing pyOpenSSL import ..." 
- (test_python_import "${env_name}" OpenSSL) || return 1 + (test_python_import_package "${env_name}" OpenSSL) || return 1 echo "[SETUP] Installed Python version: $(conda run -n "${env_name}" python --version)" echo "[SETUP] Successfully created Conda environment: ${env_name}" diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash index 5c2f97faba..3a21167df3 100644 --- a/.github/scripts/utils_pytorch.bash +++ b/.github/scripts/utils_pytorch.bash @@ -65,7 +65,7 @@ install_pytorch_conda () { (exec_with_retries conda install --force-reinstall -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 # Check that PyTorch is importable - (test_python_import "${env_name}" torch.distributed) || return 1 + (test_python_import_package "${env_name}" torch.distributed) || return 1 # Print out the actual installed PyTorch version installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") @@ -154,7 +154,7 @@ install_pytorch_pip () { (exec_with_retries conda run -n "${env_name}" pip install ${pytorch_package} --extra-index-url ${pytorch_channel}) || return 1 # Check that PyTorch is importable - (test_python_import "${env_name}" torch.distributed) || return 1 + (test_python_import_package "${env_name}" torch.distributed) || return 1 # Print out the actual installed PyTorch version installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") diff --git a/fbgemm_gpu/fbgemm_gpu/__init__.py b/fbgemm_gpu/fbgemm_gpu/__init__.py index d96b17dbcb..0ff9b00b79 100644 --- a/fbgemm_gpu/fbgemm_gpu/__init__.py +++ b/fbgemm_gpu/fbgemm_gpu/__init__.py @@ -18,4 +18,8 @@ # Use existence to check if fbgemm_gpu_py.so has already been loaded open_source: bool = True +# Re-export docs from . import _fbgemm_gpu_docs # noqa: F401, E402 + +# Re-export the version string from the auto-generated version file +from ._fbgemm_gpu_version import __version__ # noqa: F401, E402 diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index e545255c05..3fcff3c72c 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# @licenselint-loose-mode import argparse import os @@ -11,6 +12,7 @@ import re import subprocess import sys +import textwrap from datetime import date from typing import List, Optional @@ -179,6 +181,26 @@ def _get_cxx11_abi(): class FbgemmGpuInstaller(PipInstall): """FBGEMM_GPU PIP Installer""" + @classmethod + def generate_version_file(cls, package_version: str) -> None: + with open("fbgemm_gpu/_fbgemm_gpu_version.py", "w") as file: + print( + f"[SETUP.PY] Generating version file at: {os.path.realpath(file.name)}" + ) + text = textwrap.dedent( + f""" + #!/usr/bin/env python3 + # Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under the BSD-style license found in the + # LICENSE file in the root directory of this source tree. + + __version__: str = "{package_version}" + """ + ) + file.write(text) + @classmethod def description(cls) -> str: # Get the long description from the relevant file @@ -250,9 +272,15 @@ def main(argv: List[str]) -> None: # Repair command line args for setup. 
sys.argv = [sys.argv[0]] + unknown + # Determine the package version + package_version = generate_package_version(args.package_name) + + # Generate the version file + FbgemmGpuInstaller.generate_version_file(package_version) + setup( name=args.package_name, - version=generate_package_version(args.package_name), + version=package_version, author="FBGEMM Team", author_email="packages@pytorch.org", long_description=FbgemmGpuInstaller.description(), From 49058dc806e012bddb6bffe71c0b636382409b5c Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 13 Sep 2023 18:55:06 -0700 Subject: [PATCH 20/94] Add network connectivity checks (#2014) Summary: - Add network connectivity checks to tell the user to re-run the command with `with-proxy` if needed Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2014 Reviewed By: sryap Differential Revision: D49256971 Pulled By: q10 fbshipit-source-id: 6d14d348d389d9a244b50de66edc39030fbb7beb --- .github/scripts/fbgemm_gpu_build.bash | 2 ++ .github/scripts/fbgemm_gpu_docs.bash | 2 ++ .github/scripts/fbgemm_gpu_install.bash | 4 ++-- .github/scripts/fbgemm_gpu_lint.bash | 2 ++ .github/scripts/setup_env.bash | 8 ++++++++ .github/scripts/utils_base.bash | 14 ++++++++++++++ .github/scripts/utils_conda.bash | 4 ++++ .github/scripts/utils_cuda.bash | 4 ++++ .github/scripts/utils_pytorch.bash | 4 ++++ .github/scripts/utils_rocm.bash | 2 ++ .github/scripts/utils_system.bash | 2 ++ 11 files changed, 46 insertions(+), 2 deletions(-) diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 13f8d48b9f..a9033abd66 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -29,6 +29,8 @@ prepare_fbgemm_gpu_build () { echo "" fi + test_network_connection || return 1 + if [[ "${GITHUB_WORKSPACE}" ]]; then # https://github.com/actions/checkout/issues/841 git config --global --add safe.directory "${GITHUB_WORKSPACE}" diff --git a/.github/scripts/fbgemm_gpu_docs.bash b/.github/scripts/fbgemm_gpu_docs.bash index 0ac06fcd51..98e90a4163 100644 --- a/.github/scripts/fbgemm_gpu_docs.bash +++ b/.github/scripts/fbgemm_gpu_docs.bash @@ -29,6 +29,8 @@ install_docs_tools () { echo "" fi + test_network_connection || return 1 + echo "[INSTALL] Installing docs tools ..." (exec_with_retries conda install -n "${env_name}" -c conda-forge -y \ doxygen) || return 1 diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash index 68ec60226a..38cf3280ea 100644 --- a/.github/scripts/fbgemm_gpu_install.bash +++ b/.github/scripts/fbgemm_gpu_install.bash @@ -54,8 +54,6 @@ install_fbgemm_gpu_wheel () { echo "[INSTALL] FBGEMM-GPU installation through wheel completed ..." } - - install_fbgemm_gpu_pip () { local env_name="$1" local fbgemm_gpu_version="$2" @@ -75,6 +73,8 @@ install_fbgemm_gpu_pip () { echo "" fi + test_network_connection || return 1 + # Set the package variant if [ "$fbgemm_gpu_variant_type" == "cuda" ]; then # Extract the CUDA version or default to 11.8.0 diff --git a/.github/scripts/fbgemm_gpu_lint.bash b/.github/scripts/fbgemm_gpu_lint.bash index dc239ab261..c129ecc943 100644 --- a/.github/scripts/fbgemm_gpu_lint.bash +++ b/.github/scripts/fbgemm_gpu_lint.bash @@ -29,6 +29,8 @@ install_lint_tools () { echo "" fi + test_network_connection || return 1 + echo "[INSTALL] Installing lint tools ..." 
(exec_with_retries conda install -n "${env_name}" -c conda-forge -y \ click \ diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 12477ed1a6..ee823d5afd 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -41,6 +41,8 @@ setup_bazel () { echo "################################################################################" echo "" + test_network_connection || return 1 + local bazel_variant="$PLATFORM_NAME_LC" echo "[SETUP] Downloading installer Bazel ${bazel_version} (${bazel_variant}) ..." print_exec wget -q "https://github.com/bazelbuild/bazel/releases/download/${bazel_version}/bazel-${bazel_version}-installer-${bazel_variant}.sh" -O install-bazel.sh @@ -76,6 +78,8 @@ install_cxx_compiler () { echo "" fi + test_network_connection || return 1 + if [ "$use_system_package_manager" != "" ]; then echo "[INSTALL] Installing C/C++ compilers through the system package manager ..." install_system_packages gcc gcc-c++ @@ -161,6 +165,8 @@ install_build_tools () { echo "" fi + test_network_connection || return 1 + echo "[INSTALL] Installing build tools ..." (exec_with_retries conda install -n "${env_name}" -y \ click \ @@ -210,6 +216,8 @@ publish_to_pypi () { echo "" fi + test_network_connection || return 1 + echo "[INSTALL] Installing twine ..." print_exec conda install -n "${env_name}" -y twine (test_python_import_package "${env_name}" twine) || return 1 diff --git a/.github/scripts/utils_base.bash b/.github/scripts/utils_base.bash index 9ab95c51e1..5cfafc5f9e 100644 --- a/.github/scripts/utils_base.bash +++ b/.github/scripts/utils_base.bash @@ -70,6 +70,20 @@ exec_with_retries () { # Assert Functions ################################################################################ +test_network_connection () { + wget --timeout 1 pypi.org -O /dev/null + local exit_status=$? + + # https://man7.org/linux/man-pages/man1/wget.1.html + if [ $exit_status == 0 ]; then + echo "[CHECK] Network does not appear to be blocked." + else + echo "[CHECK] Network check exit status: ${exit_status}" + echo "[CHECK] Network appears to be blocked; please proxy the network connetions, i.e. re-run the command prefixed with 'with-proxy'." + return 1 + fi +} + test_python_import_symbol () { local env_name="$1" local package_name="$2" diff --git a/.github/scripts/utils_conda.bash b/.github/scripts/utils_conda.bash index 970c96b983..5251dfc6f0 100644 --- a/.github/scripts/utils_conda.bash +++ b/.github/scripts/utils_conda.bash @@ -35,6 +35,8 @@ setup_miniconda () { echo "" fi + test_network_connection || return 1 + # Download and install Miniconda if doesn't exist if [ ! -f "${miniconda_prefix}/bin/conda" ]; then print_exec mkdir -p "$miniconda_prefix" @@ -91,6 +93,8 @@ create_conda_environment () { echo "" fi + test_network_connection || return 1 + echo "[SETUP] Listing existing Conda environments ..." 
print_exec conda info --envs diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash index e500a127e8..0263eb641d 100644 --- a/.github/scripts/utils_cuda.bash +++ b/.github/scripts/utils_cuda.bash @@ -30,6 +30,8 @@ install_cuda () { echo "" fi + test_network_connection || return 1 + # Check CUDA version formatting # shellcheck disable=SC2206 local cuda_version_arr=(${cuda_version//./ }) @@ -89,6 +91,8 @@ install_cudnn () { echo "" fi + test_network_connection || return 1 + # Install cuDNN manually # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh local cudnn_packages=( diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash index 3a21167df3..0cf7916dc0 100644 --- a/.github/scripts/utils_pytorch.bash +++ b/.github/scripts/utils_pytorch.bash @@ -36,6 +36,8 @@ install_pytorch_conda () { echo "" fi + test_network_connection || return 1 + # Install the cpuonly package if needed if [ "$pytorch_variant_type" == "cpu" ]; then local pytorch_package="cpuonly pytorch" @@ -116,6 +118,8 @@ install_pytorch_pip () { echo "" fi + test_network_connection || return 1 + # Set the package variant if [ "$pytorch_variant_type" == "cuda" ]; then # Extract the CUDA version or default to 11.8.0 diff --git a/.github/scripts/utils_rocm.bash b/.github/scripts/utils_rocm.bash index 821e7a83db..9802fb80fa 100644 --- a/.github/scripts/utils_rocm.bash +++ b/.github/scripts/utils_rocm.bash @@ -33,6 +33,8 @@ install_rocm_ubuntu () { echo "" fi + test_network_connection || return 1 + # Based on instructions found in https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.3/page/How_to_Install_ROCm.html # Disable CLI prompts during package installation diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash index bfa6170c2d..3d4cbe36aa 100644 --- a/.github/scripts/utils_system.bash +++ b/.github/scripts/utils_system.bash @@ -21,6 +21,8 @@ install_system_packages () { return 1 fi + test_network_connection || return 1 + if which sudo; then local update_cmd=(sudo) local install_cmd=(sudo) From 45ec8262cd40421a09088711f953ce1d33dea894 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Wed, 13 Sep 2023 21:38:23 -0700 Subject: [PATCH 21/94] Allow 1 manitissa bit diff in TestFused8BitRowwiseQuantizationConversion (#2015) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2015 The reference implementation of FP8 quantization is in Python, but the actual implementation is in C++/CUDA. Upon summerdengfb's investigation, Python has a known floating point representation issue (https://www.geeksforgeeks.org/floating-point-error-in-python/). This could cause quantization result discrepancy. To workaround this issue, we allow 1 bit difference in the FP8 quantization result (LSB of mantissa) in `TestFused8BitRowwiseQuantizationConversion`. 
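As a minimal illustration (the byte values below are made up, not taken from the actual test), the relaxed comparison amounts to:

    import numpy as np

    # Two hypothetical rows of quantized output: the Python reference and the
    # C++/CUDA kernel agree everywhere except for an off-by-one in the low bit
    # of one stored byte.
    reference = np.array([[0x3C, 0x41, 0x7A, 0x05]], dtype=np.uint8)
    quantized = np.array([[0x3C, 0x42, 0x7A, 0x05]], dtype=np.uint8)

    # atol=1 tolerates exactly this 1-bit (LSB) discrepancy; a difference of
    # 2 or more would still fail the check.
    np.testing.assert_allclose(quantized, reference, atol=1)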
Reviewed By: q10, shintaro-iwasaki Differential Revision: D49255499 fbshipit-source-id: b28294f8076bda61589e10699119375f03b091a8 --- fbgemm_gpu/test/quantize_ops_test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/test/quantize_ops_test.py b/fbgemm_gpu/test/quantize_ops_test.py index f0b82b29bc..5f24f16325 100644 --- a/fbgemm_gpu/test/quantize_ops_test.py +++ b/fbgemm_gpu/test/quantize_ops_test.py @@ -118,7 +118,10 @@ def test_quantize_op( ncols_aligned = (ncols + 4 - 1) // 4 * 4 # compare quantized data np.testing.assert_allclose( - quantized_data_numpy[:, :ncols], reference[:, :ncols] + quantized_data_numpy[:, :ncols], + reference[:, :ncols], + # Allow 1 mantissa bit difference (LSB) + atol=1, ) # compare scales np.testing.assert_array_almost_equal( From 66a53ccd2dfba84da3daa9385e3f6004fe441c37 Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Thu, 14 Sep 2023 04:11:05 -0700 Subject: [PATCH 22/94] Use Nova workflow to host all published wheel files at PyTorch site (#2016) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2016 To alleviate CUDA version mismatch issues, we aim to publish fbgemm-gpu-nightly with different CUDA versions. This diff uses Nova workflow and will host the published wheels at PyTorch site instead. https://github.com/pytorch/FBGEMM/issues/1947 Reviewed By: q10 Differential Revision: D49258503 fbshipit-source-id: a06d095b0c03df62d8cea8fb8db1b5018c9a9dd7 --- .github/scripts/fbgemm_gpu_build.bash | 29 ++++++--- .github/scripts/fbgemm_gpu_test.bash | 2 +- .github/scripts/nova_dir.bash | 19 ++++++ .github/scripts/nova_postscript.bash | 31 ++++++++++ .github/scripts/nova_prescript.bash | 75 ++++++++++++++++++++++++ .github/scripts/utils_system.bash | 20 ++++--- .github/workflows/build-wheels-linux.yml | 39 ++++++++++++ fbgemm_gpu/setup.py | 15 ++++- 8 files changed, 213 insertions(+), 17 deletions(-) create mode 100644 .github/scripts/nova_dir.bash create mode 100644 .github/scripts/nova_postscript.bash create mode 100644 .github/scripts/nova_prescript.bash create mode 100644 .github/workflows/build-wheels-linux.yml diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index a9033abd66..9d82fb61b7 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -41,7 +41,7 @@ prepare_fbgemm_gpu_build () { git submodule update --init --recursive echo "[BUILD] Installing other build dependencies ..." - (exec_with_retries conda run -n "${env_name}" python -m pip install -r requirements.txt) || return 1 + (exec_with_retries conda run --no-capture-output -n "${env_name}" python -m pip install -r requirements.txt) || return 1 (test_python_import_package "${env_name}" numpy) || return 1 (test_python_import_package "${env_name}" skbuild) || return 1 @@ -117,7 +117,7 @@ __configure_fbgemm_gpu_build_cuda () { # Build only CUDA 7.0 and 8.0 (i.e. V100 and A100) because of 100 MB binary size limits from PyPI. echo "[BUILD] Setting CUDA build args ..." 
# shellcheck disable=SC2155 - local nvml_lib_path=$(conda run -n "${env_name}" printenv NVML_LIB_PATH) + local nvml_lib_path=$(conda run --no-capture-output -n "${env_name}" printenv NVML_LIB_PATH) build_args=( --nvml_lib_path="${nvml_lib_path}" -DTORCH_CUDA_ARCH_LIST="'${arch_list}'" @@ -188,7 +188,7 @@ __build_fbgemm_gpu_common_pre_steps () { # Extract the Python tag # shellcheck disable=SC2207 - python_version=($(conda run -n "${env_name}" python --version)) + python_version=($(conda run --no-capture-output -n "${env_name}" python --version)) # shellcheck disable=SC2206 python_version_arr=(${python_version[1]//./ }) python_tag="py${python_version_arr[0]}${python_version_arr[1]}" @@ -196,7 +196,7 @@ __build_fbgemm_gpu_common_pre_steps () { echo "[BUILD] Running pre-build cleanups ..." print_exec rm -rf dist - print_exec conda run -n "${env_name}" python setup.py clean + print_exec conda run --no-capture-output -n "${env_name}" python setup.py clean echo "[BUILD] Printing git status ..." print_exec git status @@ -300,10 +300,23 @@ build_fbgemm_gpu_package () { # See https://github.com/pypa/manylinux local plat_name="manylinux2014_${MACHINE_NAME}" + echo "[BUILD] Checking arch_list = ${arch_list}" + echo "[BUILD] Checking build_args:" + echo "${build_args[@]}" + + core=$(lscpu | grep "Core(s)" | awk '{print $NF}') && echo "core = ${core}" || echo "core not found" + sockets=$(lscpu | grep "Socket(s)" | awk '{print $NF}') && echo "sockets = ${sockets}" || echo "sockets not found" + re='^[0-9]+$' + run_multicore="" + if [[ $core =~ $re && $sockets =~ $re ]] ; then + n_core=$((core * sockets)) + run_multicore=" -j ${n_core}" + fi + # Distribute Python extensions as wheels on Linux echo "[BUILD] Building FBGEMM-GPU wheel (VARIANT=${fbgemm_variant}) ..." - print_exec conda run -n "${env_name}" \ - python setup.py bdist_wheel \ + print_exec conda run --no-capture-output -n "${env_name}" \ + python setup.py "${run_multicore}" bdist_wheel \ --package_name="${package_name}" \ --python-tag="${python_tag}" \ --plat-name="${plat_name}" \ @@ -352,7 +365,7 @@ build_fbgemm_gpu_install () { # Parallelism may need to be limited to prevent the build from being # canceled for going over ulimits echo "[BUILD] Building + installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." - print_exec conda run -n "${env_name}" \ + print_exec conda run --no-capture-output -n "${env_name}" \ python setup.py install "${build_args[@]}" # Run checks on the built libraries @@ -396,7 +409,7 @@ build_fbgemm_gpu_develop () { # Parallelism may need to be limited to prevent the build from being # canceled for going over ulimits echo "[BUILD] Building (develop) FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." 
- print_exec conda run -n "${env_name}" \ + print_exec conda run --no-capture-output -n "${env_name}" \ python setup.py build develop "${build_args[@]}" # Run checks on the built libraries diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 575cb54646..200b536aab 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -28,7 +28,7 @@ run_python_test () { echo "################################################################################" fi - if print_exec conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then + if print_exec conda run --no-capture-output -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then echo "[TEST] Python test suite PASSED: ${python_test_file}" echo "" else diff --git a/.github/scripts/nova_dir.bash b/.github/scripts/nova_dir.bash new file mode 100644 index 0000000000..ac89e36d7f --- /dev/null +++ b/.github/scripts/nova_dir.bash @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +## Workaround for Nova Workflow to look for setup.py in fbgemm_gpu rather than root repo +FBGEMM_DIR="/__w/FBGEMM/FBGEMM" +export FBGEMM_REPO="${FBGEMM_DIR}/${REPOSITORY}" +working_dir=$(pwd) +if [[ "$working_dir" == "$FBGEMM_REPO" ]]; then cd fbgemm_gpu || echo "Failed to cd fbgemm_gpu from $(pwd)"; fi + +## Build clean/wheel will be done in pre-script. Set flag such that setup.py will skip these steps in Nova workflow +export BUILD_FROM_NOVA=1 + +## Overwrite existing ENV VAR in Nova +if [[ "$CONDA_ENV" != "" ]]; then export CONDA_RUN="conda run --no-capture-output -p ${CONDA_ENV}" && echo "$CONDA_RUN"; fi +if [[ "$CU_VERSION" == "cu118" ]]; then export TORCH_CUDA_ARCH_LIST='7.0;8.0' && echo "$TORCH_CUDA_ARCH_LIST"; fi +if [[ "$CU_VERSION" == "cu121" ]]; then export TORCH_CUDA_ARCH_LIST='7.0;8.0;9.0' && echo "$TORCH_CUDA_ARCH_LIST"; fi diff --git a/.github/scripts/nova_postscript.bash b/.github/scripts/nova_postscript.bash new file mode 100644 index 0000000000..099dbb3c24 --- /dev/null +++ b/.github/scripts/nova_postscript.bash @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +echo "Current working directory: $(pwd)" +cd "${FBGEMM_REPO}" || echo "Failed to cd to ${FBGEMM_REPO}" +PRELUDE="${FBGEMM_REPO}/.github/scripts/setup_env.bash" +BUILD_ENV_NAME=base +GITHUB_ENV=TRUE +export GITHUB_ENV + +# Install FBGEMM_GPU Nightly +echo "Current working directory: $(pwd)" +# shellcheck disable=SC1091 +# shellcheck source=.github/scripts/setup_env.bash +. 
"${PRELUDE}"; + +install_fbgemm_gpu_wheel "${BUILD_ENV_NAME}" fbgemm_gpu/dist/*.whl + +# Test with PyTest +echo "Current working directory: $(pwd)" +CPU_GPU="${CU_VERSION}" +if [ "${CU_VERSION}" != 'cpu' ]; then + CPU_GPU="" +fi +$CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());" +cd "${FBGEMM_REPO}/fbgemm_gpu/test" || { echo "Failed to cd to fbgemm_gpu/test from $(pwd)"; }; +run_fbgemm_gpu_tests "${BUILD_ENV_NAME}" "${CPU_GPU}" diff --git a/.github/scripts/nova_prescript.bash b/.github/scripts/nova_prescript.bash new file mode 100644 index 0000000000..0133b656f1 --- /dev/null +++ b/.github/scripts/nova_prescript.bash @@ -0,0 +1,75 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +echo "Current working directory: $(pwd)" +cd "${FBGEMM_REPO}" || echo "Failed to cd to ${FBGEMM_REPO}" +PRELUDE="${FBGEMM_REPO}/.github/scripts/setup_env.bash" +BUILD_ENV_NAME=base +echo "--------------------------" +echo "----- conda env list -----" +conda env list +echo "--------------------------" +echo "PRELUDE = $PRELUDE" +export PATH="${PATH}:/usr/sbin:/sbin" +echo "CU_VERSION = ${CU_VERSION}" +echo "PYTHON_VERSION = ${PYTHON_VERSION}" +echo "python3 --version = $(python3 --version)" +echo "ARCH = ${ARCH}" +echo "---------------------------" +# shellcheck disable=SC1091 +# shellcheck source=.github/scripts/setup_env.bash +. "${PRELUDE}"; + +## Display System Info +print_system_info + +## Display GPU Info +print_gpu_info + +## Install C/C++ Compilers +install_cxx_compiler "${BUILD_ENV_NAME}" + +## Install Build Tools +install_build_tools "${BUILD_ENV_NAME}" + +## Install cuDNN +CPU_GPU=${CU_VERSION} +if [ "${CU_VERSION}" != 'cpu' ]; then + ## Nova $CU_VERSION is e.g., cu118 + cuda_version_num=$(echo "$CU_VERSION" | cut -c 3-) + install_cudnn "${BUILD_ENV_NAME}" "$(pwd)/build_only/cudnn" "$cuda_version_num" + echo "-------- Finding NVML_LIB_PATH -----------" + echo "NVML_LIB_PATH = ${NVML_LIB_PATH}" + echo "CONDA_ENV = ${CONDA_ENV}, CUDA_HOME = ${CUDA_HOME}" + if [[ ${NVML_LIB_PATH} == "" ]]; then NVML_LIB_PATH=$(find "${CUDA_HOME}" -name libnvidia-ml.so) && export NVML_LIB_PATH && echo "looking in ${CUDA_HOME}" || echo "libnvidia-ml.so not found in ${CUDA_HOME}"; fi + if [[ ${NVML_LIB_PATH} == "" ]]; then NVML_LIB_PATH=$(find "${CONDA_ENV}" -name libnvidia-ml.so) && export NVML_LIB_PATH && echo "looking in ${CONDA_ENV}" || echo "libnvidia-ml.so not found in ${CONDA_ENV}"; fi + echo "NVML_LIB_PATH = ${NVML_LIB_PATH}" + echo "------------------------------------------" + CPU_GPU="cuda" +fi + +cd "${FBGEMM_REPO}/fbgemm_gpu" || { echo "Failed to cd to fbgemm_gpu from $(pwd)"; } +prepare_fbgemm_gpu_build "${BUILD_ENV_NAME}" + +# reset NOVA flag to run setup.py +BUILD_FROM_NOVA=0 +export BUILD_FROM_NOVA + +## Build FBGEMM_GPU Nightly +cd "${FBGEMM_REPO}/fbgemm_gpu" || echo "Failed to cd to ${FBGEMM_REPO}/fbgemm_gpu from $(pwd)" +if [[ ${CHANNEL} == "" ]]; then CHANNEL="nightly"; fi #set nightly by default +echo "----------------------------------------------" +echo "build_fbgemm_gpu_package ${BUILD_ENV_NAME} ${CHANNEL} ${CPU_GPU}" +build_fbgemm_gpu_package "${BUILD_ENV_NAME}" "${CHANNEL}" "${CPU_GPU}" +echo "----------------------------------------------" + +## Temporary workaround - copy dist/ to root repo for smoke test +echo "Copying 
dist folder to root repo.." +(cp -r "${FBGEMM_REPO}/fbgemm_gpu/dist" "${FBGEMM_REPO}") && (echo "dist folder has been copied to ${FBGEMM_REPO}") || echo "Failed to copy dist/ folder to ${FBGEMM_REPO}" +echo "----------------------------------" +ls -al "${FBGEMM_REPO}/dist" +echo "----------------------------------" diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash index 3d4cbe36aa..297559d098 100644 --- a/.github/scripts/utils_system.bash +++ b/.github/scripts/utils_system.bash @@ -79,10 +79,12 @@ free_disk_space () { ################################################################################ print_gpu_info () { - echo "################################################################################" - echo "[INFO] Printing general display info ..." - install_system_packages lshw - print_exec sudo lshw -C display + if [[ "${BUILD_FROM_NOVA}" != '1' ]]; then + echo "################################################################################" + echo "[INFO] Printing general display info ..." + install_system_packages lshw + print_exec sudo lshw -C display + fi echo "################################################################################" echo "[INFO] Printing NVIDIA GPU info ..." @@ -133,11 +135,15 @@ __print_system_info_linux () { echo "################################################################################" echo "[INFO] Print CPU info ..." print_exec nproc + print_exec lscpu print_exec cat /proc/cpuinfo - echo "################################################################################" - echo "[INFO] Print PCI info ..." - print_exec lspci -v + + if [[ "${BUILD_FROM_NOVA}" != '1' ]]; then + echo "################################################################################" + echo "[INFO] Print PCI info ..." + print_exec lspci -v + fi echo "################################################################################" echo "[INFO] Print Linux distribution info ..." 
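(Aside, purely illustrative: the CU_VERSION handling spread across nova_dir.bash and nova_prescript.bash above boils down to a small mapping. The helper below is a hypothetical Python sketch of that logic, not code that exists in the repo.)

    # Sketch of the Nova CU_VERSION handling; the function and key names are
    # illustrative only.
    def nova_cuda_settings(cu_version: str) -> dict:
        arch_lists = {
            "cu118": "7.0;8.0",      # CUDA 11.8 wheels: V100 / A100
            "cu121": "7.0;8.0;9.0",  # CUDA 12.1 wheels: V100 / A100 / H100
        }
        settings = {"variant": "cpu" if cu_version == "cpu" else "cuda"}
        if cu_version.startswith("cu"):
            settings["TORCH_CUDA_ARCH_LIST"] = arch_lists.get(cu_version, "")
            # nova_prescript.bash strips the "cu" prefix (cut -c 3-) before
            # passing the version on to install_cudnn.
            settings["cuda_version_num"] = cu_version[2:]
        return settings

    # nova_cuda_settings("cu118") ->
    # {'variant': 'cuda', 'TORCH_CUDA_ARCH_LIST': '7.0;8.0', 'cuda_version_num': '118'}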
diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml new file mode 100644 index 0000000000..fe2516b808 --- /dev/null +++ b/.github/workflows/build-wheels-linux.yml @@ -0,0 +1,39 @@ +name: Build Linux Wheels + +on: + pull_request: + push: + branches: + - nightly + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: linux + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-cuda: enable + with-rocm: disable + with-cpu: enable + build: + needs: generate-matrix + name: pytorch/FBGEMM + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main + with: + repository: pytorch/FBGEMM + ref: "" + pre-script: ../.github/scripts/nova_prescript.bash + post-script: ../.github/scripts/nova_postscript.bash + smoke-test-script: "" + env-var-script: .github/scripts/nova_dir.bash + package-name: fbgemm_gpu + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + trigger-event: ${{ github.event_name }} + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index 3fcff3c72c..f18f78bfb6 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -266,6 +266,19 @@ def main(argv: List[str]) -> None: if len(unknown) != 0 and (len(unknown) != 1 or unknown[0] != "clean"): print("Unknown Arguments: ", unknown) + # Skip Nova build steps since it will be done in pre-script + if "BUILD_FROM_NOVA" in os.environ: + build_from_nova = os.getenv("BUILD_FROM_NOVA") + print("build_from_nova", build_from_nova) + # Package name is the same for all variants in Nova + package_name = "fbgemm_gpu" + if str(build_from_nova) != "0": + # Skip build clean and build wheel steps in Nova workflow since they are done in pre-script + print("Build from Nova detected... exiting") + sys.exit(0) + else: + package_name = args.package_name + if not args.cpu_only: set_cuda_environment_variables() @@ -279,7 +292,7 @@ def main(argv: List[str]) -> None: FbgemmGpuInstaller.generate_version_file(package_version) setup( - name=args.package_name, + name=package_name, version=package_version, author="FBGEMM Team", author_email="packages@pytorch.org", From 14cf6f283dc071a48a89a52b2c9a2993919fa804 Mon Sep 17 00:00:00 2001 From: Banit Agrawal Date: Thu, 14 Sep 2023 16:53:15 -0700 Subject: [PATCH 23/94] Use PyTorch's p2p access enable function (#2000) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2000 We split the diff after adding a needed lazy cuda init call in enable p2p access function. Diff 1: D48939723 [PyTorch] Add the lazy init call for p2p access function *Prior context* cudaEnablePeerAccess only enables cross device access for memory allocated with cudaMalloc. When using other cuda APIs such cuMemMap, peer access is managed differently. expandable_segments:True in PyTorch uses cuMemMap, so code that just calls cudaEnablePeerAccess is not sufficient to enable cross-device copies. This patch switching the p2p access enabling functions to use PyTorchs `get_p2p_access` which lets its allocator figure out how to correctly enable p2p access for that memory. In the normal case (expandable_segments:False), this code performs exactly the same cuda calls as before. 
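For reference, a minimal side-by-side sketch of the two call patterns. The helper names and the omission of includes here are illustrative only; the actual change is inlined over all GPU pairs inside init_p2p_access in the diff below, and every call used comes from that diff.

    // Before: enable peer access by hand. This only covers memory that was
    // allocated with cudaMalloc.
    static void enable_p2p_legacy(int src_device, int dst_device) {
      at::cuda::CUDAGuard g(src_device);
      const auto err =
          C10_CUDA_ERROR_HANDLED(cudaDeviceEnablePeerAccess(dst_device, 0));
      if (err == cudaErrorPeerAccessAlreadyEnabled) {
        // Ignore and clear the error if access was already enabled
        C10_CUDA_CLEAR_ERROR();
      } else {
        AT_CUDA_CHECK(err);
      }
    }

    // After: defer to PyTorch's allocator, which also knows how to grant
    // peer access for cuMemMap-backed memory (expandable_segments:True).
    static void enable_p2p_via_pytorch(int src_device, int dst_device) {
      AT_ASSERT(at::cuda::get_p2p_access(src_device, dst_device));
    }

The design point is that the allocator, rather than the caller, now decides how to grant cross-device access, so the same call works regardless of how the underlying memory was allocated.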
Reviewed By: zdevito Differential Revision: D49021817 fbshipit-source-id: 7ffb4b477b1d1cddccc891dd9fc8f9a2a986585e --- fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index fdd6ea6cc9..4b90c2bdaa 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -562,15 +563,7 @@ void init_p2p_access() { for (const auto i : c10::irange(at::cuda::getNumGPUs())) { for (const auto j : c10::irange(at::cuda::getNumGPUs())) { if (i != j) { - at::cuda::CUDAGuard g(i); - const auto err = - C10_CUDA_ERROR_HANDLED(cudaDeviceEnablePeerAccess(j, 0)); - if (err == cudaErrorPeerAccessAlreadyEnabled) { - // ignore and clear the error if access was already enabled - C10_CUDA_CLEAR_ERROR(); - } else { - AT_CUDA_CHECK(err); - } + AT_ASSERT(at::cuda::get_p2p_access(i, j)); } } } From a7d2be53ab830c386c5eff887aa4f2b164bb4063 Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Thu, 14 Sep 2023 23:39:35 -0700 Subject: [PATCH 24/94] Update Nova workflow triggers and wheel version (#2020) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2020 Update nova workflow to be triggered on pushes to main and releases branches and tags. Update wheel version name to match pytorch convention (e.g., +cpu, +cu118) Reviewed By: q10 Differential Revision: D49296399 fbshipit-source-id: e4ed56c12dfb2f68d5b131b29c8ce0294e2a4522 --- .github/workflows/build-wheels-linux.yml | 7 +++++++ fbgemm_gpu/setup.py | 15 +++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml index fe2516b808..bca0fd5f10 100644 --- a/.github/workflows/build-wheels-linux.yml +++ b/.github/workflows/build-wheels-linux.yml @@ -5,6 +5,13 @@ on: push: branches: - nightly + - main + # Release candidate branch look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-release+ + tags: + # Release candidate tag look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - v[0-9]+.[0-9]+.[0-9]+ workflow_dispatch: jobs: diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index f18f78bfb6..b0d7b56b32 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -24,7 +24,7 @@ from tabulate import tabulate -def generate_package_version(package_name: str): +def generate_package_version(package_name: str, version_variant: str): print("[SETUP.PY] Generating the package version ...") if "nightly" in package_name: @@ -47,7 +47,7 @@ def generate_package_version(package_name: str): # Remove the local version identifier, if any (e.g. 0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0) # Then remove post0 (keep postN for N > 0) (e.g. 
0.4.0rc0.post0 => 0.4.0rc0) version = re.sub(".post0$", "", gitversion.version_from_git().split("+")[0]) - + version = str(version) + version_variant print(f"[SETUP.PY] Setting the package version: {version}") return version @@ -279,14 +279,21 @@ def main(argv: List[str]) -> None: else: package_name = args.package_name - if not args.cpu_only: + if args.cpu_only: + version_variant = "+cpu" + else: set_cuda_environment_variables() + if torch.version.cuda is not None: + cuda_version = torch.version.cuda.split(".") + version_variant = "+cu" + str(cuda_version[0]) + str(cuda_version[1]) + else: + version_variant = "" # Repair command line args for setup. sys.argv = [sys.argv[0]] + unknown # Determine the package version - package_version = generate_package_version(args.package_name) + package_version = generate_package_version(args.package_name, version_variant) # Generate the version file FbgemmGpuInstaller.generate_version_file(package_version) From aa48aaa724912305dcbf3cf31da50f702fa1804f Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Fri, 15 Sep 2023 11:45:40 -0700 Subject: [PATCH 25/94] Improve all_to_one error message (#2019) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2019 As titled Reviewed By: jianyuh Differential Revision: D49296564 fbshipit-source-id: 442c13567cb7aa8de8c208c2ee1fb2ae550a8969 --- fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index 4b90c2bdaa..0a3ad3b63a 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -149,7 +149,12 @@ void all_to_one( }); auto target_device_index = target_device.index(); - TORCH_CHECK(target_device_index < num_gpus && target_device_index >= 0); + TORCH_CHECK( + target_device_index != -1, + "target_device.index() is -1. 
Please pass target_device with device " + "index, e.g., torch.device(\"cuda:0\")") + + TORCH_CHECK(target_device_index < num_gpus); std::vector two_hop_transfers; two_hop_transfers.reserve(input_tensors.size()); From be1d5cadead271fdf6952d38e55219a50728a194 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Fri, 15 Sep 2023 11:54:38 -0700 Subject: [PATCH 26/94] Improve quantize_comm error message (#2018) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2018 As titled Reviewed By: jianyuh, henrylhtsang, edqwerty10 Differential Revision: D49295738 fbshipit-source-id: 45524d8e220ba6b686a99d201e24c6a3d839aed7 --- fbgemm_gpu/fbgemm_gpu/quantize_comm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/quantize_comm.py b/fbgemm_gpu/fbgemm_gpu/quantize_comm.py index cb0ec28167..a79010cb2b 100644 --- a/fbgemm_gpu/fbgemm_gpu/quantize_comm.py +++ b/fbgemm_gpu/fbgemm_gpu/quantize_comm.py @@ -193,9 +193,10 @@ def calc_quantized_size( self._comm_precision == SparseType.FP8 and self._row_dim > 0 ): ctx = none_throws(ctx) - assert ( - input_len % ctx.row_dim == 0 - ), f"input_len {input_len} is not a multiple of row dim {ctx.row_dim}" + assert input_len % ctx.row_dim == 0, ( + f"input_len {input_len} is not a multiple of row dim {ctx.row_dim} " + "Please check your batch size (power of 2 batch size is recommended)" + ) nrows = input_len // ctx.row_dim ncols = (ctx.row_dim + 3) // 4 * 4 + 2 * 4 return nrows * ncols From f9e7ba74b314cfe2764569312f767139d07ead91 Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Fri, 15 Sep 2023 14:41:59 -0700 Subject: [PATCH 27/94] Change name convention for PyPI uploads (#2023) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2023 PyPI does not accept version to be specified with `+` (e.g., `+cpu` or `+cu118`). Reviewed By: sryap Differential Revision: D49329679 fbshipit-source-id: 1bc0c81ae15fdf157b54b2ca9226d67dacaade8e --- fbgemm_gpu/setup.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index b0d7b56b32..fba4846ca3 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -266,6 +266,17 @@ def main(argv: List[str]) -> None: if len(unknown) != 0 and (len(unknown) != 1 or unknown[0] != "clean"): print("Unknown Arguments: ", unknown) + if args.cpu_only: + version_variant = "+cpu" + else: + set_cuda_environment_variables() + if torch.version.cuda is not None: + cuda_version = torch.version.cuda.split(".") + version_variant = "+cu" + str(cuda_version[0]) + str(cuda_version[1]) + else: + # rocm or other gpus - to be specified if we offcially support them + version_variant = "" + # Skip Nova build steps since it will be done in pre-script if "BUILD_FROM_NOVA" in os.environ: build_from_nova = os.getenv("BUILD_FROM_NOVA") @@ -277,18 +288,10 @@ def main(argv: List[str]) -> None: print("Build from Nova detected... exiting") sys.exit(0) else: + # If not building from Nova, use the fbgemm_gpu- + # PyPi does not accept version+xx in the name convention. + version_variant = "" package_name = args.package_name - - if args.cpu_only: - version_variant = "+cpu" - else: - set_cuda_environment_variables() - if torch.version.cuda is not None: - cuda_version = torch.version.cuda.split(".") - version_variant = "+cu" + str(cuda_version[0]) + str(cuda_version[1]) - else: - version_variant = "" - # Repair command line args for setup. 
sys.argv = [sys.argv[0]] + unknown From bbc676eb6f4f664635914065cda4549bd3962464 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Fri, 15 Sep 2023 15:31:21 -0700 Subject: [PATCH 28/94] Change scripts to accommodate conda prefix to work with Nova (#2022) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2022 Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2021 Nova creates conda environments with `--prefix` where the environments are outside of the default `envs` folder. Current script only works with `--name`. This diff changes the scripts to accommodate conda prefix. https://github.com/pytorch/FBGEMM/actions/runs/6189731547 Reviewed By: spcyppt Differential Revision: D49306286 fbshipit-source-id: 2f2ca00645526639369de3f555dbab30da56739e --- .github/scripts/fbgemm_gpu_build.bash | 40 +++++++++++++++++------ .github/scripts/fbgemm_gpu_docs.bash | 20 +++++++++--- .github/scripts/fbgemm_gpu_install.bash | 14 ++++++-- .github/scripts/fbgemm_gpu_lint.bash | 24 +++++++++++--- .github/scripts/fbgemm_gpu_test.bash | 15 +++++++-- .github/scripts/nova_postscript.bash | 2 +- .github/scripts/nova_prescript.bash | 2 +- .github/scripts/setup_env.bash | 43 +++++++++++++++++-------- .github/scripts/utils_base.bash | 39 ++++++++++++++++++---- .github/scripts/utils_conda.bash | 15 ++++++--- .github/scripts/utils_cuda.bash | 25 ++++++++++---- .github/scripts/utils_pytorch.bash | 22 +++++++++---- 12 files changed, 198 insertions(+), 63 deletions(-) diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 9d82fb61b7..e651f2d727 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -40,10 +40,16 @@ prepare_fbgemm_gpu_build () { git submodule sync git submodule update --init --recursive + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[BUILD] Installing other build dependencies ..." - (exec_with_retries conda run --no-capture-output -n "${env_name}" python -m pip install -r requirements.txt) || return 1 + # shellcheck disable=SC2086 + (exec_with_retries conda run --no-capture-output ${env_prefix} python -m pip install -r requirements.txt) || return 1 + # shellcheck disable=SC2086 (test_python_import_package "${env_name}" numpy) || return 1 + # shellcheck disable=SC2086 (test_python_import_package "${env_name}" skbuild) || return 1 echo "[BUILD] Successfully ran git submodules update" @@ -78,7 +84,8 @@ __configure_fbgemm_gpu_build_rocm () { fi echo "[BUILD] Setting the following ROCm targets: ${arch_list}" - print_exec conda env config vars set -n "${env_name}" PYTORCH_ROCM_ARCH="${arch_list}" + # shellcheck disable=SC2086 + print_exec conda env config vars set ${env_prefix} PYTORCH_ROCM_ARCH="${arch_list}" echo "[BUILD] Setting ROCm build args ..." build_args=() @@ -116,8 +123,8 @@ __configure_fbgemm_gpu_build_cuda () { # Build only CUDA 7.0 and 8.0 (i.e. V100 and A100) because of 100 MB binary size limits from PyPI. echo "[BUILD] Setting CUDA build args ..." 
- # shellcheck disable=SC2155 - local nvml_lib_path=$(conda run --no-capture-output -n "${env_name}" printenv NVML_LIB_PATH) + # shellcheck disable=SC2155,SC2086 + local nvml_lib_path=$(conda run --no-capture-output ${env_prefix} printenv NVML_LIB_PATH) build_args=( --nvml_lib_path="${nvml_lib_path}" -DTORCH_CUDA_ARCH_LIST="'${arch_list}'" @@ -187,8 +194,8 @@ __build_fbgemm_gpu_common_pre_steps () { echo "[BUILD] Determined Python package name to use: ${package_name}" # Extract the Python tag - # shellcheck disable=SC2207 - python_version=($(conda run --no-capture-output -n "${env_name}" python --version)) + # shellcheck disable=SC2207,SC2086 + python_version=($(conda run --no-capture-output ${env_prefix} python --version)) # shellcheck disable=SC2206 python_version_arr=(${python_version[1]//./ }) python_tag="py${python_version_arr[0]}${python_version_arr[1]}" @@ -196,7 +203,8 @@ __build_fbgemm_gpu_common_pre_steps () { echo "[BUILD] Running pre-build cleanups ..." print_exec rm -rf dist - print_exec conda run --no-capture-output -n "${env_name}" python setup.py clean + # shellcheck disable=SC2086 + print_exec conda run --no-capture-output ${env_prefix} python setup.py clean echo "[BUILD] Printing git status ..." print_exec git status @@ -285,6 +293,9 @@ build_fbgemm_gpu_package () { return 1 fi + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + # Set up and configure the build __build_fbgemm_gpu_common_pre_steps || return 1 __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 @@ -315,7 +326,8 @@ build_fbgemm_gpu_package () { # Distribute Python extensions as wheels on Linux echo "[BUILD] Building FBGEMM-GPU wheel (VARIANT=${fbgemm_variant}) ..." - print_exec conda run --no-capture-output -n "${env_name}" \ + # shellcheck disable=SC2086 + print_exec conda run --no-capture-output ${env_prefix} \ python setup.py "${run_multicore}" bdist_wheel \ --package_name="${package_name}" \ --python-tag="${python_tag}" \ @@ -351,6 +363,9 @@ build_fbgemm_gpu_install () { return 1 fi + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + # Set up and configure the build __build_fbgemm_gpu_common_pre_steps || return 1 __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 @@ -365,7 +380,8 @@ build_fbgemm_gpu_install () { # Parallelism may need to be limited to prevent the build from being # canceled for going over ulimits echo "[BUILD] Building + installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." - print_exec conda run --no-capture-output -n "${env_name}" \ + # shellcheck disable=SC2086 + print_exec conda run --no-capture-output ${env_prefix} \ python setup.py install "${build_args[@]}" # Run checks on the built libraries @@ -395,6 +411,9 @@ build_fbgemm_gpu_develop () { return 1 fi + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + # Set up and configure the build __build_fbgemm_gpu_common_pre_steps || return 1 __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 @@ -409,7 +428,8 @@ build_fbgemm_gpu_develop () { # Parallelism may need to be limited to prevent the build from being # canceled for going over ulimits echo "[BUILD] Building (develop) FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." 
- print_exec conda run --no-capture-output -n "${env_name}" \ + # shellcheck disable=SC2086 + print_exec conda run --no-capture-output ${env_prefix} \ python setup.py build develop "${build_args[@]}" # Run checks on the built libraries diff --git a/.github/scripts/fbgemm_gpu_docs.bash b/.github/scripts/fbgemm_gpu_docs.bash index 98e90a4163..d2b21f5649 100644 --- a/.github/scripts/fbgemm_gpu_docs.bash +++ b/.github/scripts/fbgemm_gpu_docs.bash @@ -31,11 +31,15 @@ install_docs_tools () { test_network_connection || return 1 + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[INSTALL] Installing docs tools ..." - (exec_with_retries conda install -n "${env_name}" -c conda-forge -y \ + # shellcheck disable=SC2086 + (exec_with_retries conda install ${env_prefix} -c conda-forge -y \ doxygen) || return 1 - # Check binaries are visible in the PAATH + # Check binaries are visible in the PATH (test_binpath "${env_name}" doxygen) || return 1 echo "[INSTALL] Successfully installed all the docs tools" @@ -62,14 +66,20 @@ build_fbgemm_gpu_docs () { echo "" fi + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[BUILD] Installing docs-build dependencies ..." - (exec_with_retries conda run -n "${env_name}" python -m pip install -r requirements.txt) || return 1 + # shellcheck disable=SC2086 + (exec_with_retries conda run ${env_prefix} python -m pip install -r requirements.txt) || return 1 echo "[BUILD] Running Doxygen build ..." - (exec_with_retries conda run -n "${env_name}" doxygen Doxyfile.in) || return 1 + # shellcheck disable=SC2086 + (exec_with_retries conda run ${env_prefix} doxygen Doxyfile.in) || return 1 echo "[BUILD] Building HTML pages ..." - (exec_with_retries conda run -n "${env_name}" make html) || return 1 + # shellcheck disable=SC2086 + (exec_with_retries conda run ${env_prefix} make html) || return 1 echo "[INSTALL] FBGEMM-GPU documentation build completed" } diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash index 38cf3280ea..50353efa47 100644 --- a/.github/scripts/fbgemm_gpu_install.bash +++ b/.github/scripts/fbgemm_gpu_install.bash @@ -20,7 +20,8 @@ __fbgemm_gpu_post_install_checks () { (test_python_import_symbol "${env_name}" fbgemm_gpu __version__) || return 1 echo "[CHECK] Printing out the FBGEMM-GPU version ..." - installed_fbgemm_gpu_version=$(conda run -n "${env_name}" python -c "import fbgemm_gpu; print(fbgemm_gpu.__version__)") + # shellcheck disable=SC2086 + installed_fbgemm_gpu_version=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(fbgemm_gpu.__version__)") echo "[CHECK] The installed version is: ${installed_fbgemm_gpu_version}" } @@ -46,8 +47,12 @@ install_fbgemm_gpu_wheel () { print_exec sha256sum "${wheel_path}" print_exec md5sum "${wheel_path}" + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[INSTALL] Installing FBGEMM-GPU wheel: ${wheel_path} ..." - (exec_with_retries conda run -n "${env_name}" python -m pip install "${wheel_path}") || return 1 + # shellcheck disable=SC2086 + (exec_with_retries conda run ${env_prefix} python -m pip install "${wheel_path}") || return 1 __fbgemm_gpu_post_install_checks || return 1 @@ -131,9 +136,12 @@ install_fbgemm_gpu_pip () { fi fi + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[INSTALL] Attempting to install FBGEMM-GPU ${fbgemm_gpu_version}+${fbgemm_gpu_variant} through PIP ..." 
# shellcheck disable=SC2086 - (exec_with_retries conda run -n "${env_name}" pip install ${fbgemm_gpu_package}) || return 1 + (exec_with_retries conda run ${env_prefix} pip install ${fbgemm_gpu_package}) || return 1 __fbgemm_gpu_post_install_checks || return 1 diff --git a/.github/scripts/fbgemm_gpu_lint.bash b/.github/scripts/fbgemm_gpu_lint.bash index c129ecc943..122d547862 100644 --- a/.github/scripts/fbgemm_gpu_lint.bash +++ b/.github/scripts/fbgemm_gpu_lint.bash @@ -31,8 +31,12 @@ install_lint_tools () { test_network_connection || return 1 + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[INSTALL] Installing lint tools ..." - (exec_with_retries conda install -n "${env_name}" -c conda-forge -y \ + # shellcheck disable=SC2086 + (exec_with_retries conda install ${env_prefix} -c conda-forge -y \ click \ flake8 \ ufmt) || return 1 @@ -72,10 +76,14 @@ lint_fbgemm_gpu_flake8 () { echo "::add-matcher::fbgemm_gpu/test/lint/flake8_problem_matcher.json" + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + # E501 = line too long # W503 = line break before binary operator (deprecated) # E203 = whitespace before ":" - (print_exec conda run -n "${env_name}" flake8 --ignore=E501,W503,E203 .) || return 1 + # shellcheck disable=SC2086 + (print_exec conda run ${env_prefix} flake8 --ignore=E501,W503,E203 .) || return 1 echo "[TEST] Finished running flake8 lint checks" } @@ -102,8 +110,12 @@ lint_fbgemm_gpu_ufmt () { fbgemm_gpu/bench ) + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + for p in "${lint_paths[@]}"; do - (print_exec conda run -n "${env_name}" ufmt diff "${p}") || return 1 + # shellcheck disable=SC2086 + (print_exec conda run ${env_prefix} ufmt diff "${p}") || return 1 done echo "[TEST] Finished running ufmt lint checks" @@ -131,8 +143,12 @@ lint_fbgemm_gpu_copyright () { fbgemm_gpu/bench ) + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + for p in "${lint_paths[@]}"; do - (print_exec conda run -n "${env_name}" python fbgemm_gpu/test/lint/check_meta_header.py --path="${p}" --fixit=False) || return 1 + # shellcheck disable=SC2086 + (print_exec conda run ${env_prefix} python fbgemm_gpu/test/lint/check_meta_header.py --path="${p}" --fixit=False) || return 1 done echo "[TEST] Finished running Meta Copyright Header checks" diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 200b536aab..b593caa95d 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -28,7 +28,11 @@ run_python_test () { echo "################################################################################" fi - if print_exec conda run --no-capture-output -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + + # shellcheck disable=SC2086 + if print_exec conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then echo "[TEST] Python test suite PASSED: ${python_test_file}" echo "" else @@ -62,10 +66,14 @@ run_fbgemm_gpu_tests () { echo "" fi + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + # Enable ROCM testing if specified if [ "$fbgemm_variant" == "rocm" ]; then echo "[TEST] Set environment variable FBGEMM_TEST_WITH_ROCM to enable ROCm 
tests ..." - print_exec conda env config vars set -n "${env_name}" FBGEMM_TEST_WITH_ROCM=1 + # shellcheck disable=SC2086 + print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1 fi # These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU @@ -90,7 +98,8 @@ run_fbgemm_gpu_tests () { fi echo "[TEST] Installing pytest ..." - print_exec conda install -n "${env_name}" -y pytest + # shellcheck disable=SC2086 + print_exec conda install ${env_prefix} -y pytest echo "[TEST] Checking imports ..." (test_python_import_package "${env_name}" fbgemm_gpu) || return 1 diff --git a/.github/scripts/nova_postscript.bash b/.github/scripts/nova_postscript.bash index 099dbb3c24..c1e5d5bfd5 100644 --- a/.github/scripts/nova_postscript.bash +++ b/.github/scripts/nova_postscript.bash @@ -8,7 +8,7 @@ echo "Current working directory: $(pwd)" cd "${FBGEMM_REPO}" || echo "Failed to cd to ${FBGEMM_REPO}" PRELUDE="${FBGEMM_REPO}/.github/scripts/setup_env.bash" -BUILD_ENV_NAME=base +BUILD_ENV_NAME=${CONDA_ENV} GITHUB_ENV=TRUE export GITHUB_ENV diff --git a/.github/scripts/nova_prescript.bash b/.github/scripts/nova_prescript.bash index 0133b656f1..89106525fe 100644 --- a/.github/scripts/nova_prescript.bash +++ b/.github/scripts/nova_prescript.bash @@ -8,7 +8,7 @@ echo "Current working directory: $(pwd)" cd "${FBGEMM_REPO}" || echo "Failed to cd to ${FBGEMM_REPO}" PRELUDE="${FBGEMM_REPO}/.github/scripts/setup_env.bash" -BUILD_ENV_NAME=base +BUILD_ENV_NAME=${CONDA_ENV} echo "--------------------------" echo "----- conda env list -----" conda env list diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index ee823d5afd..e26b95c722 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -104,16 +104,21 @@ install_cxx_compiler () { else archname="$MACHINE_NAME_LC" fi + + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[INSTALL] Installing C/C++ compilers through Conda (architecture = ${archname}) ..." - (exec_with_retries conda install -n "${env_name}" -y "gxx_linux-${archname}"=10.4.0 "sysroot_linux-${archname}"=2.17 -c conda-forge) || return 1 + # shellcheck disable=SC2086 + (exec_with_retries conda install ${env_prefix} -y "gxx_linux-${archname}"=10.4.0 "sysroot_linux-${archname}"=2.17 -c conda-forge) || return 1 # The compilers are visible in the PATH as `x86_64-conda-linux-gnu-cc` and # `x86_64-conda-linux-gnu-c++`, so symlinks will need to be created echo "[INSTALL] Setting the C/C++ compiler symlinks ..." - # shellcheck disable=SC2155 - local cc_path=$(conda run -n "${env_name}" printenv CC) - # shellcheck disable=SC2155 - local cxx_path=$(conda run -n "${env_name}" printenv CXX) + # shellcheck disable=SC2155,SC2086 + local cc_path=$(conda run ${env_prefix} printenv CC) + # shellcheck disable=SC2155,SC2086 + local cxx_path=$(conda run ${env_prefix} printenv CXX) print_exec ln -s "${cc_path}" "$(dirname "$cc_path")/cc" print_exec ln -s "${cc_path}" "$(dirname "$cc_path")/gcc" @@ -129,22 +134,25 @@ install_cxx_compiler () { # https://stackoverflow.com/questions/2224334/gcc-dump-preprocessor-defines echo "[INFO] Printing out all preprocessor defines in the C compiler ..." - print_exec conda run -n "${env_name}" cc -dM -E - + # shellcheck disable=SC2086 + print_exec conda run ${env_prefix} cc -dM -E - # https://stackoverflow.com/questions/2224334/gcc-dump-preprocessor-defines echo "[INFO] Printing out all preprocessor defines in the C++ compiler ..." 
- print_exec conda run -n "${env_name}" c++ -dM -E -x c++ - + # shellcheck disable=SC2086 + print_exec conda run ${env_prefix} c++ -dM -E -x c++ - # Print out the C++ version - print_exec conda run -n "${env_name}" c++ --version + # shellcheck disable=SC2086 + print_exec conda run ${env_prefix} c++ --version # https://stackoverflow.com/questions/4991707/how-to-find-my-current-compilers-standard-like-if-it-is-c90-etc echo "[INFO] Printing the default version of the C standard used by the compiler ..." - print_exec "conda run -n ${env_name} cc -dM -E - | grep __STDC_VERSION__" + print_exec "conda run ${env_prefix} cc -dM -E - | grep __STDC_VERSION__" # https://stackoverflow.com/questions/2324658/how-to-determine-the-version-of-the-c-standard-used-by-the-compiler echo "[INFO] Printing the default version of the C++ standard used by the compiler ..." - print_exec "conda run -n ${env_name} c++ -dM -E -x c++ - | grep __cplusplus" + print_exec "conda run ${env_prefix} c++ -dM -E -x c++ - | grep __cplusplus" echo "[INSTALL] Successfully installed C/C++ compilers" } @@ -167,8 +175,12 @@ install_build_tools () { test_network_connection || return 1 + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[INSTALL] Installing build tools ..." - (exec_with_retries conda install -n "${env_name}" -y \ + # shellcheck disable=SC2086 + (exec_with_retries conda install ${env_prefix} -y \ click \ cmake \ hypothesis \ @@ -218,13 +230,18 @@ publish_to_pypi () { test_network_connection || return 1 + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[INSTALL] Installing twine ..." - print_exec conda install -n "${env_name}" -y twine + # shellcheck disable=SC2086 + print_exec conda install ${env_prefix} -y twine (test_python_import_package "${env_name}" twine) || return 1 (test_python_import_package "${env_name}" OpenSSL) || return 1 echo "[PUBLISH] Uploading package(s) to PyPI: ${package_name} ..." - conda run -n "${env_name}" \ + # shellcheck disable=SC2086 + conda run ${env_prefix} \ python -m twine upload \ --username __token__ \ --password "${pypi_token}" \ diff --git a/.github/scripts/utils_base.bash b/.github/scripts/utils_base.bash index 5cfafc5f9e..8be352ad3e 100644 --- a/.github/scripts/utils_base.bash +++ b/.github/scripts/utils_base.bash @@ -70,6 +70,18 @@ exec_with_retries () { # Assert Functions ################################################################################ +env_name_or_prefix () { + local env=$1 + if [[ ${env} == /* ]]; then + # If the input string is a PATH (i.e. starts with '/'), then determine the + # Conda environment by directory prefix + echo "-p ${env}"; + else + # Else, determine the Conda environment by name + echo "-n ${env}"; + fi +} + test_network_connection () { wget --timeout 1 pypi.org -O /dev/null local exit_status=$? @@ -95,7 +107,10 @@ test_python_import_symbol () { return 1 fi - if conda run -n "${env_name}" python -c "from ${package_name} import ${target_symbol}"; then + local env_prefix=$(env_name_or_prefix "${env_name}") + + # shellcheck disable=SC2086 + if conda run ${env_prefix} python -c "from ${package_name} import ${target_symbol}"; then echo "[CHECK] Found symbol '${target_symbol}' in Python package '${package_name}'." else echo "[CHECK] Could not find symbol '${target_symbol}' in Python package '${package_name}'; the package might be missing or broken." 
@@ -113,7 +128,10 @@ test_python_import_package () { return 1 fi - if conda run -n "${env_name}" python -c "import ${python_import}"; then + local env_prefix=$(env_name_or_prefix "${env_name}") + + # shellcheck disable=SC2086 + if conda run ${env_prefix} python -c "import ${python_import}"; then echo "[CHECK] Python package '${python_import}' found." else echo "[CHECK] Python package '${python_import}' was not found, or the package is broken!" @@ -131,7 +149,10 @@ test_binpath () { return 1 fi - if conda run -n "${env_name}" which "${bin_name}"; then + local env_prefix=$(env_name_or_prefix "${env_name}") + + # shellcheck disable=SC2086 + if conda run ${env_prefix} which "${bin_name}"; then echo "[CHECK] Binary ${bin_name} found in PATH" else echo "[CHECK] Binary ${bin_name} not found in PATH!" @@ -149,12 +170,15 @@ test_filepath () { return 1 fi - # shellcheck disable=SC2155 - local conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX) + local env_prefix=$(env_name_or_prefix "${env_name}") + + # shellcheck disable=SC2155,SC2086 + local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX) # shellcheck disable=SC2155 local file_path=$(find "${conda_prefix}" -type f -name "${file_name}") # shellcheck disable=SC2155 local link_path=$(find "${conda_prefix}" -type l -name "${file_name}") + if [ "${file_path}" != "" ]; then echo "[CHECK] ${file_name} found in CONDA_PREFIX PATH (file): ${file_path}" elif [ "${link_path}" != "" ]; then @@ -175,7 +199,10 @@ test_env_var () { return 1 fi - if conda run -n "${env_name}" printenv "${env_key}"; then + local env_prefix=$(env_name_or_prefix "${env_name}") + + # shellcheck disable=SC2086 + if conda run ${env_prefix} printenv "${env_key}"; then echo "[CHECK] Environment variable ${env_key} is defined in the Conda environment" else echo "[CHECK] Environment variable ${env_key} is not defined in the Conda environment!" diff --git a/.github/scripts/utils_conda.bash b/.github/scripts/utils_conda.bash index 5251dfc6f0..96aae5dfe3 100644 --- a/.github/scripts/utils_conda.bash +++ b/.github/scripts/utils_conda.bash @@ -106,22 +106,29 @@ create_conda_environment () { local conda_prefix=$(conda run -n base printenv CONDA_PREFIX) print_exec rm -rf "${conda_prefix}/envs/${env_name}" + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + # The `-y` flag removes any existing Conda environment with the same name echo "[SETUP] Creating new Conda environment (Python ${python_version}) ..." - (exec_with_retries conda create -y --name "${env_name}" python="${python_version}") || return 1 + # shellcheck disable=SC2086 + (exec_with_retries conda create -y ${env_prefix} python="${python_version}") || return 1 echo "[SETUP] Upgrading PIP to latest ..." - (exec_with_retries conda run -n "${env_name}" pip install --upgrade pip) || return 1 + # shellcheck disable=SC2086 + (exec_with_retries conda run ${env_prefix} pip install --upgrade pip) || return 1 # The pyOpenSSL and cryptography packages versions need to line up for PyPI publishing to work # https://stackoverflow.com/questions/74981558/error-updating-python3-pip-attributeerror-module-lib-has-no-attribute-openss echo "[SETUP] Upgrading pyOpenSSL ..." 
- (exec_with_retries conda run -n "${env_name}" python -m pip install "pyOpenSSL>22.1.0") || return 1 + # shellcheck disable=SC2086 + (exec_with_retries conda run ${env_prefix} python -m pip install "pyOpenSSL>22.1.0") || return 1 # This test fails with load errors if the pyOpenSSL and cryptography package versions don't align echo "[SETUP] Testing pyOpenSSL import ..." (test_python_import_package "${env_name}" OpenSSL) || return 1 - echo "[SETUP] Installed Python version: $(conda run -n "${env_name}" python --version)" + # shellcheck disable=SC2086 + echo "[SETUP] Installed Python version: $(conda run ${env_prefix} python --version)" echo "[SETUP] Successfully created Conda environment: ${env_name}" } diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash index 0263eb641d..705ef8dc25 100644 --- a/.github/scripts/utils_cuda.bash +++ b/.github/scripts/utils_cuda.bash @@ -43,9 +43,13 @@ install_cuda () { # Clean up packages before installation conda_cleanup + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + # Install CUDA packages echo "[INSTALL] Installing CUDA ${cuda_version} ..." - (exec_with_retries conda install --force-reinstall -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 + # shellcheck disable=SC2086 + (exec_with_retries conda install --force-reinstall ${env_prefix} -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 # Ensure that nvcc is properly installed (test_binpath "${env_name}" nvcc) || return 1 @@ -58,18 +62,21 @@ install_cuda () { (test_filepath "${env_name}" libnvidia-ml.so) || return 1 echo "[INSTALL] Set environment variable NVML_LIB_PATH ..." - # shellcheck disable=SC2155 - local conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX) + # shellcheck disable=SC2155,SC2086 + local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX) # shellcheck disable=SC2155 local nvml_lib_path=$(find "${conda_prefix}" -name libnvidia-ml.so) - print_exec conda env config vars set -n "${env_name}" NVML_LIB_PATH="${nvml_lib_path}" + # shellcheck disable=SC2086 + print_exec conda env config vars set ${env_prefix} NVML_LIB_PATH="${nvml_lib_path}" # https://stackoverflow.com/questions/27686382/how-can-i-dump-all-nvcc-preprocessor-defines echo "[INFO] Printing out all preprocessor defines in nvcc ..." - print_exec conda run -n "${env_name}" nvcc --compiler-options -dM -E -x cu - < /dev/null + # shellcheck disable=SC2086 + print_exec conda run ${env_prefix} nvcc --compiler-options -dM -E -x cu - < /dev/null # Print nvcc version - print_exec conda run -n "${env_name}" nvcc --version + # shellcheck disable=SC2086 + print_exec conda run ${env_prefix} nvcc --version echo "[INSTALL] Successfully installed CUDA ${cuda_version}" } @@ -145,9 +152,13 @@ install_cudnn () { cd - || return 1 rm -rf "$tmp_dir" + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + # Export the environment variables to the Conda environment echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." 
- print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" + # shellcheck disable=SC2086 + print_exec conda env config vars set ${env_prefix} CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" } diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash index 0cf7916dc0..c586bc4ddd 100644 --- a/.github/scripts/utils_pytorch.bash +++ b/.github/scripts/utils_pytorch.bash @@ -59,25 +59,30 @@ install_pytorch_conda () { # Clean up packages before installation conda_cleanup + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + # Install PyTorch packages # NOTE: Installation of large package might fail due to corrupt package download # Use --force-reinstall to address this on retries - https://datascience.stackexchange.com/questions/41732/conda-verification-failed echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, variant = ${pytorch_variant_type}) through Conda using channel '${pytorch_channel}' ..." # shellcheck disable=SC2086 - (exec_with_retries conda install --force-reinstall -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 + (exec_with_retries conda install --force-reinstall ${env_prefix} -y ${pytorch_package} -c "${pytorch_channel}") || return 1 # Check that PyTorch is importable (test_python_import_package "${env_name}" torch.distributed) || return 1 # Print out the actual installed PyTorch version - installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") + # shellcheck disable=SC2086 + installed_pytorch_version=$(conda run ${env_prefix} python -c "import torch; print(torch.__version__)") echo "[CHECK] NOTE: The installed version is: ${installed_pytorch_version}" # Run check for GPU variant if [ "$pytorch_variant_type" == "cuda" ]; then # Ensure that the PyTorch build is the GPU variant (i.e. contains cuDNN reference) # This test usually applies to the PyTorch nightly builds - if conda list -n "${env_name}" pytorch | grep cudnn; then + # shellcheck disable=SC2086 + if conda list ${env_prefix} pytorch | grep cudnn; then echo "[CHECK] The installed PyTorch ${pytorch_version} contains references to cuDNN" else echo "[CHECK] The installed PyTorch ${pytorch_version} appears to be the CPU-only version as it is missing references to cuDNN!" @@ -153,21 +158,26 @@ install_pytorch_pip () { local pytorch_channel="https://download.pytorch.org/whl/${pytorch_variant}/" fi + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[INSTALL] Attempting to install PyTorch ${pytorch_version}+${pytorch_variant} through PIP using channel ${pytorch_channel} ..." 
# shellcheck disable=SC2086 - (exec_with_retries conda run -n "${env_name}" pip install ${pytorch_package} --extra-index-url ${pytorch_channel}) || return 1 + (exec_with_retries conda run ${env_prefix} pip install ${pytorch_package} --extra-index-url ${pytorch_channel}) || return 1 # Check that PyTorch is importable (test_python_import_package "${env_name}" torch.distributed) || return 1 # Print out the actual installed PyTorch version - installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") + # shellcheck disable=SC2086 + installed_pytorch_version=$(conda run ${env_prefix} python -c "import torch; print(torch.__version__)") echo "[CHECK] NOTE: The installed version is: ${installed_pytorch_version}" if [ "$pytorch_variant_type" != "cpu" ]; then # Ensure that the PyTorch build is of the correct variant # This test usually applies to the PyTorch nightly builds - if conda run -n "${env_name}" pip list torch | grep torch | grep "${pytorch_variant}"; then + # shellcheck disable=SC2086 + if conda run ${env_prefix} pip list torch | grep torch | grep "${pytorch_variant}"; then echo "[CHECK] The installed PyTorch ${pytorch_version} is the correct variant (${pytorch_variant})" else echo "[CHECK] The installed PyTorch ${pytorch_version} appears to be an incorrect variant as it is missing references to ${pytorch_variant}!" From bc0b58ddd5634b0817ea442b6e6e6476f30c7664 Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Fri, 15 Sep 2023 15:48:50 -0700 Subject: [PATCH 29/94] Add aarch64 builds on Nova (#2024) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2024 Add arm builds through Nova to be hosted on pytorch.org Reviewed By: osalpekar Differential Revision: D49333436 fbshipit-source-id: 5d8c192afe10db5ed9b5c6bb35f823d66794e81a --- .../workflows/build_wheels_aarch64_linux.yml | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 .github/workflows/build_wheels_aarch64_linux.yml diff --git a/.github/workflows/build_wheels_aarch64_linux.yml b/.github/workflows/build_wheels_aarch64_linux.yml new file mode 100644 index 0000000000..35bb2f42f9 --- /dev/null +++ b/.github/workflows/build_wheels_aarch64_linux.yml @@ -0,0 +1,60 @@ +name: Build Aarch64 Linux Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + # Release candidate branch look like: v1.11.0-release + - v[0-9]+.[0-9]+.[0-9]+-release+ + tags: + # Release candidate tag look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - v[0-9]+.[0-9]+.[0-9]+ + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: linux-aarch64 + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-cuda: disable + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/FBGEMM + smoke-test-script: "" + pre-script: ../.github/scripts/nova_prescript.bash + post-script: ../.github/scripts/nova_postscript.bash + env-var-script: .github/scripts/nova_dir.bash + package-name: fbgemm_gpu + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main + with: + repository: ${{ matrix.repository }} + ref: "" + 
test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + env-var-script: ${{ matrix.env-var-script }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + architecture: aarch64 + setup-miniconda: false + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} From 8e588e6158131814e75faa98259e8be66c9f9faa Mon Sep 17 00:00:00 2001 From: Sungmin Cho Date: Fri, 15 Sep 2023 19:32:47 -0700 Subject: [PATCH 30/94] Fix bug in fp8_split_embedding_codegen_forward_kernel dispatch (#2009) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2009 For the case of loading FP8 tables with 128B * 8 < dimension <= 128B * 16, the numbers for (MinNum128BRows, MaxNum128BRows) should be (8, 16) and not (4, 8 ). Because of this bug, FP8 tables with dimension in that range don't get properly loaded upon codegen_forward. I wrote a little test to show this deterministically. I will remove the test before land if it is too specific. Reviewed By: jianyuh Differential Revision: D49139649 fbshipit-source-id: 300e844885e7a0f8e6e8193f03f135ceb76def28 --- .../embedding_backward_code_generator.py | 2 +- ...ward_quantized_split_nbit_host_template.cu | 2 +- .../split_table_batched_embeddings_test.py | 22 +++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_backward_code_generator.py b/fbgemm_gpu/codegen/embedding_backward_code_generator.py index 4256a6e02d..a86c4642b2 100644 --- a/fbgemm_gpu/codegen/embedding_backward_code_generator.py +++ b/fbgemm_gpu/codegen/embedding_backward_code_generator.py @@ -366,7 +366,7 @@ class elem_type: template_instance_params(*map(str, (2, 4, 1, 2))), template_instance_params(*map(str, (2, 4, 2, 4))), template_instance_params(*map(str, (2, 4, 4, 8))), - template_instance_params(*map(str, (2, 2, 4, 8))), + template_instance_params(*map(str, (2, 2, 8, 16))), ], ), "INT8": elem_type( diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_split_nbit_host_template.cu b/fbgemm_gpu/codegen/embedding_forward_quantized_split_nbit_host_template.cu index 4cc314f384..2d70981fe7 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_split_nbit_host_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_split_nbit_host_template.cu @@ -410,7 +410,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ Y(2, 4, 4, 8); } if (max_fp8_128b_rows > 8) { - Y(2, 2, 4, 8); + Y(2, 2, 8, 16); } } })); diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index ddce4c0ea5..e022d1ae26 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -4643,6 +4643,28 @@ def test_nbit_forward_cpu_bf16_out( output_dtype, ) + @unittest.skipIf(*gpu_unavailable) + def test_nbit_forward_gpu_no_cache_fp8_2048(self) -> None: + # Test the case of FB8 table with 128B*8 < D <= 128B*16 + self.execute_nbit_forward_( + T=1, + D=2048, # 128B*8 < D <= 128B*16 + B=128, + log_E=2, + L=4, + weighted=False, + mixed=False, + pooling_mode=PoolingMode.SUM, + weights_ty=SparseType.FP8, # FP8 table + use_cache=False, + 
cache_algorithm=CacheAlgorithm.LRU, + use_cpu=False, + use_array_for_index_remapping=True, + do_pruning=False, + mixed_weights_ty=False, + output_dtype=SparseType.FP16, + ) + @unittest.skipIf(*gpu_unavailable) @given( nbit_weights_ty=get_nbit_weights_ty(), From eb1103b7aca186f4bc6b9fd1c36a9a69cf49e2c2 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Sat, 16 Sep 2023 08:09:08 -0700 Subject: [PATCH 31/94] Use PyTorch PIP for workflows (#2025) Summary: - Re-organize bash scripts for easier readability - Use PyTorch PIP to install fbgemm_gpu for PIP-install workflows Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2025 Reviewed By: spcyppt Differential Revision: D49342654 Pulled By: q10 fbshipit-source-id: fe9f85f568dcba6b2992f51df6a4a094009364d7 --- .github/scripts/fbgemm_gpu_install.bash | 68 +------ .github/scripts/setup_env.bash | 229 +----------------------- .github/scripts/utils_build.bash | 185 +++++++++++++++++++ .github/scripts/utils_pip.bash | 144 +++++++++++++++ .github/scripts/utils_pytorch.bash | 65 +------ 5 files changed, 348 insertions(+), 343 deletions(-) create mode 100644 .github/scripts/utils_build.bash create mode 100644 .github/scripts/utils_pip.bash diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash index 50353efa47..84c105083a 100644 --- a/.github/scripts/fbgemm_gpu_install.bash +++ b/.github/scripts/fbgemm_gpu_install.bash @@ -8,6 +8,8 @@ # shellcheck disable=SC1091,SC2128 . "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash" +# shellcheck disable=SC1091,SC2128 +. "$( dirname -- "$BASH_SOURCE"; )/utils_pip.bash" ################################################################################ # FBGEMM_GPU Install Functions @@ -78,72 +80,14 @@ install_fbgemm_gpu_pip () { echo "" fi - test_network_connection || return 1 - - # Set the package variant - if [ "$fbgemm_gpu_variant_type" == "cuda" ]; then - # Extract the CUDA version or default to 11.8.0 - local cuda_version="${fbgemm_gpu_variant_version:-11.8.0}" - # shellcheck disable=SC2206 - local cuda_version_arr=(${cuda_version//./ }) - # Convert, i.e. cuda 11.7.1 => cu117 - local fbgemm_gpu_variant="cu${cuda_version_arr[0]}${cuda_version_arr[1]}" - elif [ "$fbgemm_gpu_variant_type" == "rocm" ]; then - # Extract the ROCM version or default to 5.5.1 - local rocm_version="${fbgemm_gpu_variant_version:-5.5.1}" - # shellcheck disable=SC2206 - local rocm_version_arr=(${rocm_version//./ }) - # Convert, i.e. 
rocm 5.5.1 => rocm5.5 - local fbgemm_gpu_variant="rocm${rocm_version_arr[0]}.${rocm_version_arr[1]}" - else - local fbgemm_gpu_variant_type="cpu" - local fbgemm_gpu_variant="cpu" - fi - echo "[INSTALL] Extracted FBGEMM-GPU variant: ${fbgemm_gpu_variant}" - - # Set the package name and installation channel -# if [ "$fbgemm_gpu_version" == "nightly" ] || [ "$fbgemm_gpu_version" == "test" ]; then -# local fbgemm_gpu_package="--pre fbgemm-gpu" -# local fbgemm_gpu_channel="https://download.pytorch.org/whl/${fbgemm_gpu_version}/${fbgemm_gpu_variant}/" -# elif [ "$fbgemm_gpu_version" == "latest" ]; then -# local fbgemm_gpu_package="fbgemm-gpu" -# local fbgemm_gpu_channel="https://download.pytorch.org/whl/${fbgemm_gpu_variant}/" -# else -# local fbgemm_gpu_package="fbgemm-gpu==${fbgemm_gpu_version}+${fbgemm_gpu_variant}" -# local fbgemm_gpu_channel="https://download.pytorch.org/whl/${fbgemm_gpu_variant}/" -# fi - - if [ "$fbgemm_gpu_variant_type" == "cuda" ]; then - if [ "$fbgemm_gpu_version" == "nightly" ]; then - local fbgemm_gpu_package="fbgemm-gpu-nightly" - elif [ "$fbgemm_gpu_version" == "latest" ]; then - local fbgemm_gpu_package="fbgemm-gpu" - else - local fbgemm_gpu_package="fbgemm-gpu==${fbgemm_gpu_version}" - fi - - elif [ "$fbgemm_gpu_variant_type" == "rocm" ]; then - echo "ROCm is currently not supported in PyPI!" - return 1 - - else - if [ "$fbgemm_gpu_version" == "nightly" ]; then - local fbgemm_gpu_package="fbgemm-gpu-nightly-cpu" - elif [ "$fbgemm_gpu_version" == "latest" ]; then - local fbgemm_gpu_package="fbgemm-gpu-cpu" - else - local fbgemm_gpu_package="fbgemm-gpu-cpu==${fbgemm_gpu_version}" - fi - fi - # shellcheck disable=SC2155 local env_prefix=$(env_name_or_prefix "${env_name}") - echo "[INSTALL] Attempting to install FBGEMM-GPU ${fbgemm_gpu_version}+${fbgemm_gpu_variant} through PIP ..." - # shellcheck disable=SC2086 - (exec_with_retries conda run ${env_prefix} pip install ${fbgemm_gpu_package}) || return 1 + # Install the package from PyTorch PIP (not PyPI) + install_from_pytorch_pip "${env_name}" fbgemm_gpu "${fbgemm_gpu_version}" "${fbgemm_gpu_variant_type}" "${fbgemm_gpu_variant_version}" || return 1 + # Run post-installation checks __fbgemm_gpu_post_install_checks || return 1 - echo "[INSTALL] FBGEMM-GPU installation through PIP completed ..." + echo "[INSTALL] Successfully installed FBGEMM-GPU through PyTorch PIP" } diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index e26b95c722..e37fdd0a15 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -10,10 +10,14 @@ # shellcheck disable=SC1091,SC2128 . "$( dirname -- "$BASH_SOURCE"; )/utils_system.bash" # shellcheck disable=SC1091,SC2128 +. "$( dirname -- "$BASH_SOURCE"; )/utils_build.bash" +# shellcheck disable=SC1091,SC2128 . "$( dirname -- "$BASH_SOURCE"; )/utils_conda.bash" # shellcheck disable=SC1091,SC2128 . "$( dirname -- "$BASH_SOURCE"; )/utils_cuda.bash" # shellcheck disable=SC1091,SC2128 +. "$( dirname -- "$BASH_SOURCE"; )/utils_pip.bash" +# shellcheck disable=SC1091,SC2128 . "$( dirname -- "$BASH_SOURCE"; )/utils_rocm.bash" # shellcheck disable=SC1091,SC2128 . "$( dirname -- "$BASH_SOURCE"; )/utils_pytorch.bash" @@ -27,228 +31,3 @@ . "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_lint.bash" # shellcheck disable=SC1091,SC2128 . 
"$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_test.bash" - -################################################################################ -# Bazel Setup Functions -################################################################################ - -setup_bazel () { - local bazel_version="${1:-6.1.1}" - echo "################################################################################" - echo "# Setup Bazel" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - - test_network_connection || return 1 - - local bazel_variant="$PLATFORM_NAME_LC" - echo "[SETUP] Downloading installer Bazel ${bazel_version} (${bazel_variant}) ..." - print_exec wget -q "https://github.com/bazelbuild/bazel/releases/download/${bazel_version}/bazel-${bazel_version}-installer-${bazel_variant}.sh" -O install-bazel.sh - - echo "[SETUP] Installing Bazel ..." - print_exec bash install-bazel.sh - print_exec rm -f install-bazel.sh - - print_exec bazel --version - echo "[SETUP] Successfully set up Bazel" -} - - -################################################################################ -# Build Tools Setup Functions -################################################################################ - -install_cxx_compiler () { - local env_name="$1" - local use_system_package_manager="$2" - if [ "$env_name" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME [USE_YUM]" - echo "Example(s):" - echo " ${FUNCNAME[0]} build_env # Install C/C++ compilers through Conda" - echo " ${FUNCNAME[0]} build_env 1 # Install C/C++ compilers through the system package manager" - return 1 - else - echo "################################################################################" - echo "# Install C/C++ Compilers" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - fi - - test_network_connection || return 1 - - if [ "$use_system_package_manager" != "" ]; then - echo "[INSTALL] Installing C/C++ compilers through the system package manager ..." - install_system_packages gcc gcc-c++ - - else - # Install gxx_linux- from conda-forge instead of from anaconda channel. - # sysroot_linux- needs to be installed alongside this: - # - # https://root-forum.cern.ch/t/error-timespec-get-has-not-been-declared-with-conda-root-package/45712/6 - # https://github.com/conda-forge/conda-forge.github.io/issues/1625 - # https://conda-forge.org/docs/maintainer/knowledge_base.html#using-centos-7 - # https://github.com/conda/conda-build/issues/4371 - # - # NOTE: We install g++ 10.x instead of 11.x becaue 11.x builds binaries that - # reference GLIBCXX_3.4.29, which may not be available on systems with older - # versions of libstdc++.so.6 such as CentOS Stream 8 and Ubuntu 20.04 - local archname="" - if [ "$MACHINE_NAME_LC" = "x86_64" ]; then - archname="64" - elif [ "$MACHINE_NAME_LC" = "aarch64" ] || [ "$MACHINE_NAME_LC" = "arm64" ]; then - archname="aarch64" - else - archname="$MACHINE_NAME_LC" - fi - - # shellcheck disable=SC2155 - local env_prefix=$(env_name_or_prefix "${env_name}") - - echo "[INSTALL] Installing C/C++ compilers through Conda (architecture = ${archname}) ..." 
- # shellcheck disable=SC2086 - (exec_with_retries conda install ${env_prefix} -y "gxx_linux-${archname}"=10.4.0 "sysroot_linux-${archname}"=2.17 -c conda-forge) || return 1 - - # The compilers are visible in the PATH as `x86_64-conda-linux-gnu-cc` and - # `x86_64-conda-linux-gnu-c++`, so symlinks will need to be created - echo "[INSTALL] Setting the C/C++ compiler symlinks ..." - # shellcheck disable=SC2155,SC2086 - local cc_path=$(conda run ${env_prefix} printenv CC) - # shellcheck disable=SC2155,SC2086 - local cxx_path=$(conda run ${env_prefix} printenv CXX) - - print_exec ln -s "${cc_path}" "$(dirname "$cc_path")/cc" - print_exec ln -s "${cc_path}" "$(dirname "$cc_path")/gcc" - print_exec ln -s "${cxx_path}" "$(dirname "$cxx_path")/c++" - print_exec ln -s "${cxx_path}" "$(dirname "$cxx_path")/g++" - fi - - # Check C/C++ compilers are visible - (test_binpath "${env_name}" cc) || return 1 - (test_binpath "${env_name}" gcc) || return 1 - (test_binpath "${env_name}" c++) || return 1 - (test_binpath "${env_name}" g++) || return 1 - - # https://stackoverflow.com/questions/2224334/gcc-dump-preprocessor-defines - echo "[INFO] Printing out all preprocessor defines in the C compiler ..." - # shellcheck disable=SC2086 - print_exec conda run ${env_prefix} cc -dM -E - - - # https://stackoverflow.com/questions/2224334/gcc-dump-preprocessor-defines - echo "[INFO] Printing out all preprocessor defines in the C++ compiler ..." - # shellcheck disable=SC2086 - print_exec conda run ${env_prefix} c++ -dM -E -x c++ - - - # Print out the C++ version - # shellcheck disable=SC2086 - print_exec conda run ${env_prefix} c++ --version - - # https://stackoverflow.com/questions/4991707/how-to-find-my-current-compilers-standard-like-if-it-is-c90-etc - echo "[INFO] Printing the default version of the C standard used by the compiler ..." - print_exec "conda run ${env_prefix} cc -dM -E - | grep __STDC_VERSION__" - - # https://stackoverflow.com/questions/2324658/how-to-determine-the-version-of-the-c-standard-used-by-the-compiler - echo "[INFO] Printing the default version of the C++ standard used by the compiler ..." - print_exec "conda run ${env_prefix} c++ -dM -E -x c++ - | grep __cplusplus" - - echo "[INSTALL] Successfully installed C/C++ compilers" -} - -install_build_tools () { - local env_name="$1" - if [ "$env_name" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME" - echo "Example(s):" - echo " ${FUNCNAME[0]} build_env" - return 1 - else - echo "################################################################################" - echo "# Install Build Tools" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - fi - - test_network_connection || return 1 - - # shellcheck disable=SC2155 - local env_prefix=$(env_name_or_prefix "${env_name}") - - echo "[INSTALL] Installing build tools ..." 
- # shellcheck disable=SC2086 - (exec_with_retries conda install ${env_prefix} -y \ - click \ - cmake \ - hypothesis \ - jinja2 \ - ninja \ - numpy \ - scikit-build \ - wheel) || return 1 - - # Check binaries are visible in the PAATH - (test_binpath "${env_name}" cmake) || return 1 - (test_binpath "${env_name}" ninja) || return 1 - - # Check Python packages are importable - local import_tests=( click hypothesis jinja2 numpy skbuild wheel ) - for p in "${import_tests[@]}"; do - (test_python_import_package "${env_name}" "${p}") || return 1 - done - - echo "[INSTALL] Successfully installed all the build tools" -} - - -################################################################################ -# PyPI Publish Functions -################################################################################ - -publish_to_pypi () { - local env_name="$1" - local package_name="$2" - local pypi_token="$3" - if [ "$pypi_token" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME PYPI_TOKEN" - echo "Example(s):" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly-*.whl MY_TOKEN" - echo "" - echo "PYPI_TOKEN is missing!" - return 1 - else - echo "################################################################################" - echo "# Publish to PyPI" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - fi - - test_network_connection || return 1 - - # shellcheck disable=SC2155 - local env_prefix=$(env_name_or_prefix "${env_name}") - - echo "[INSTALL] Installing twine ..." - # shellcheck disable=SC2086 - print_exec conda install ${env_prefix} -y twine - (test_python_import_package "${env_name}" twine) || return 1 - (test_python_import_package "${env_name}" OpenSSL) || return 1 - - echo "[PUBLISH] Uploading package(s) to PyPI: ${package_name} ..." - # shellcheck disable=SC2086 - conda run ${env_prefix} \ - python -m twine upload \ - --username __token__ \ - --password "${pypi_token}" \ - --skip-existing \ - --verbose \ - "${package_name}" - - echo "[PUBLISH] Successfully published package(s) to PyPI: ${package_name}" - echo "[PUBLISH] NOTE: The publish command is a successful no-op if the wheel version already existed in PyPI; please double check!" -} diff --git a/.github/scripts/utils_build.bash b/.github/scripts/utils_build.bash new file mode 100644 index 0000000000..12febee996 --- /dev/null +++ b/.github/scripts/utils_build.bash @@ -0,0 +1,185 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +# shellcheck disable=SC1091,SC2128 +. "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash" + +################################################################################ +# Bazel Setup Functions +################################################################################ + +setup_bazel () { + local bazel_version="${1:-6.1.1}" + echo "################################################################################" + echo "# Setup Bazel" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + test_network_connection || return 1 + + local bazel_variant="$PLATFORM_NAME_LC" + echo "[SETUP] Downloading installer Bazel ${bazel_version} (${bazel_variant}) ..." 
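+  # (illustrative, not part of the original script) e.g. for bazel_version=6.1.1 on a
+  # Linux x86_64 runner, the release asset fetched below is typically
+  # bazel-6.1.1-installer-linux-x86_64.sh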
+ print_exec wget -q "https://github.com/bazelbuild/bazel/releases/download/${bazel_version}/bazel-${bazel_version}-installer-${bazel_variant}.sh" -O install-bazel.sh + + echo "[SETUP] Installing Bazel ..." + print_exec bash install-bazel.sh + print_exec rm -f install-bazel.sh + + print_exec bazel --version + echo "[SETUP] Successfully set up Bazel" +} + + +################################################################################ +# Build Tools Setup Functions +################################################################################ + +install_cxx_compiler () { + local env_name="$1" + local use_system_package_manager="$2" + if [ "$env_name" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME [USE_YUM]" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env # Install C/C++ compilers through Conda" + echo " ${FUNCNAME[0]} build_env 1 # Install C/C++ compilers through the system package manager" + return 1 + else + echo "################################################################################" + echo "# Install C/C++ Compilers" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + test_network_connection || return 1 + + if [ "$use_system_package_manager" != "" ]; then + echo "[INSTALL] Installing C/C++ compilers through the system package manager ..." + install_system_packages gcc gcc-c++ + + else + # Install gxx_linux- from conda-forge instead of from anaconda channel. + # sysroot_linux- needs to be installed alongside this: + # + # https://root-forum.cern.ch/t/error-timespec-get-has-not-been-declared-with-conda-root-package/45712/6 + # https://github.com/conda-forge/conda-forge.github.io/issues/1625 + # https://conda-forge.org/docs/maintainer/knowledge_base.html#using-centos-7 + # https://github.com/conda/conda-build/issues/4371 + # + # NOTE: We install g++ 10.x instead of 11.x becaue 11.x builds binaries that + # reference GLIBCXX_3.4.29, which may not be available on systems with older + # versions of libstdc++.so.6 such as CentOS Stream 8 and Ubuntu 20.04 + local archname="" + if [ "$MACHINE_NAME_LC" = "x86_64" ]; then + archname="64" + elif [ "$MACHINE_NAME_LC" = "aarch64" ] || [ "$MACHINE_NAME_LC" = "arm64" ]; then + archname="aarch64" + else + archname="$MACHINE_NAME_LC" + fi + + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + + echo "[INSTALL] Installing C/C++ compilers through Conda (architecture = ${archname}) ..." + # shellcheck disable=SC2086 + (exec_with_retries conda install ${env_prefix} -y "gxx_linux-${archname}"=10.4.0 "sysroot_linux-${archname}"=2.17 -c conda-forge) || return 1 + + # The compilers are visible in the PATH as `x86_64-conda-linux-gnu-cc` and + # `x86_64-conda-linux-gnu-c++`, so symlinks will need to be created + echo "[INSTALL] Setting the C/C++ compiler symlinks ..." 
+ # shellcheck disable=SC2155,SC2086 + local cc_path=$(conda run ${env_prefix} printenv CC) + # shellcheck disable=SC2155,SC2086 + local cxx_path=$(conda run ${env_prefix} printenv CXX) + + print_exec ln -s "${cc_path}" "$(dirname "$cc_path")/cc" + print_exec ln -s "${cc_path}" "$(dirname "$cc_path")/gcc" + print_exec ln -s "${cxx_path}" "$(dirname "$cxx_path")/c++" + print_exec ln -s "${cxx_path}" "$(dirname "$cxx_path")/g++" + fi + + # Check C/C++ compilers are visible + (test_binpath "${env_name}" cc) || return 1 + (test_binpath "${env_name}" gcc) || return 1 + (test_binpath "${env_name}" c++) || return 1 + (test_binpath "${env_name}" g++) || return 1 + + # https://stackoverflow.com/questions/2224334/gcc-dump-preprocessor-defines + echo "[INFO] Printing out all preprocessor defines in the C compiler ..." + # shellcheck disable=SC2086 + print_exec conda run ${env_prefix} cc -dM -E - + + # https://stackoverflow.com/questions/2224334/gcc-dump-preprocessor-defines + echo "[INFO] Printing out all preprocessor defines in the C++ compiler ..." + # shellcheck disable=SC2086 + print_exec conda run ${env_prefix} c++ -dM -E -x c++ - + + # Print out the C++ version + # shellcheck disable=SC2086 + print_exec conda run ${env_prefix} c++ --version + + # https://stackoverflow.com/questions/4991707/how-to-find-my-current-compilers-standard-like-if-it-is-c90-etc + echo "[INFO] Printing the default version of the C standard used by the compiler ..." + print_exec "conda run ${env_prefix} cc -dM -E - | grep __STDC_VERSION__" + + # https://stackoverflow.com/questions/2324658/how-to-determine-the-version-of-the-c-standard-used-by-the-compiler + echo "[INFO] Printing the default version of the C++ standard used by the compiler ..." + print_exec "conda run ${env_prefix} c++ -dM -E -x c++ - | grep __cplusplus" + + echo "[INSTALL] Successfully installed C/C++ compilers" +} + +install_build_tools () { + local env_name="$1" + if [ "$env_name" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env" + return 1 + else + echo "################################################################################" + echo "# Install Build Tools" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + test_network_connection || return 1 + + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + + echo "[INSTALL] Installing build tools ..." + # shellcheck disable=SC2086 + (exec_with_retries conda install ${env_prefix} -y \ + click \ + cmake \ + hypothesis \ + jinja2 \ + ninja \ + numpy \ + scikit-build \ + wheel) || return 1 + + # Check binaries are visible in the PAATH + (test_binpath "${env_name}" cmake) || return 1 + (test_binpath "${env_name}" ninja) || return 1 + + # Check Python packages are importable + local import_tests=( click hypothesis jinja2 numpy skbuild wheel ) + for p in "${import_tests[@]}"; do + (test_python_import_package "${env_name}" "${p}") || return 1 + done + + echo "[INSTALL] Successfully installed all the build tools" +} diff --git a/.github/scripts/utils_pip.bash b/.github/scripts/utils_pip.bash new file mode 100644 index 0000000000..f649be68eb --- /dev/null +++ b/.github/scripts/utils_pip.bash @@ -0,0 +1,144 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +# shellcheck disable=SC1091,SC2128 +. "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash" + +################################################################################ +# PyTorch PIP Install Functions +################################################################################ + +install_from_pytorch_pip () { + local env_name="$1" + local package_name="$2" + local package_version="$3" + local package_variant_type="$4" + local package_variant_version="$5" + if [ "$package_variant_type" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME PACKAGE_VERSION PACKAGE_VARIANT_TYPE [PACKAGE_VARIANT_VERSION]" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Install the CPU variant a specific version" + echo " ${FUNCNAME[0]} build_env torch latest cpu # Install the CPU variant of the latest stable version" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu test cuda 11.7.1 # Install the variant for CUDA 11.7" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm 5.3 # Install the variant for ROCM 5.3" + return 1 + else + echo "################################################################################" + echo "# Install ${package_name} (PyTorch PIP)" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + test_network_connection || return 1 + + # Set the package variant + if [ "$package_variant_type" == "cuda" ]; then + # Extract the CUDA version or default to 11.8.0 + local cuda_version="${package_variant_version:-11.8.0}" + # shellcheck disable=SC2206 + local cuda_version_arr=(${cuda_version//./ }) + # Convert, i.e. cuda 11.7.1 => cu117 + local package_variant="cu${cuda_version_arr[0]}${cuda_version_arr[1]}" + elif [ "$package_variant_type" == "rocm" ]; then + # Extract the ROCM version or default to 5.5.1 + local rocm_version="${package_variant_version:-5.5.1}" + # shellcheck disable=SC2206 + local rocm_version_arr=(${rocm_version//./ }) + # Convert, i.e. rocm 5.5.1 => rocm5.5 + local package_variant="rocm${rocm_version_arr[0]}.${rocm_version_arr[1]}" + else + local package_variant_type="cpu" + local package_variant="cpu" + fi + echo "[INSTALL] Extracted package variant: ${package_variant}" + + # Set the package name and installation channel + if [ "$package_version" == "nightly" ] || [ "$package_version" == "test" ]; then + local package_package="--pre ${package_name}" + local package_channel="https://download.pytorch.org/whl/${package_version}/${package_variant}/" + elif [ "$package_version" == "latest" ]; then + local package_package="${package_name}" + local package_channel="https://download.pytorch.org/whl/${package_variant}/" + else + local package_package="${package_name}==${package_version}+${package_variant}" + local package_channel="https://download.pytorch.org/whl/${package_variant}/" + fi + + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + + echo "[INSTALL] Attempting to install [${package_name}, ${package_version}+${package_variant}] through PIP using channel ${package_channel} ..." 
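+  # (illustrative) e.g. for (torch, nightly, cuda 11.8.0), the command below
+  # expands to roughly:
+  #   pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu118/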
+ # shellcheck disable=SC2086 + (exec_with_retries conda run ${env_prefix} pip install ${package_package} --extra-index-url ${package_channel}) || return 1 + + # Check only applies to non-CPU variants + if [ "$package_variant_type" != "cpu" ]; then + # Ensure that the package build is of the correct variant + # This test usually applies to the nightly builds + # shellcheck disable=SC2086 + if conda run ${env_prefix} pip list "${package_name}" | grep "${package_name}" | grep "${package_variant}"; then + echo "[CHECK] The installed package [${package_name}, ${package_version}] is the correct variant (${package_variant})" + else + echo "[CHECK] The installed package [${package_name}, ${package_version}] appears to be an incorrect variant as it is missing references to ${package_variant}!" + echo "[CHECK] This can happen if the variant of the package (e.g. GPU, nightly) for the MAJOR.MINOR version of CUDA or ROCm presently installed on the system is not available." + return 1 + fi + fi +} + + +################################################################################ +# PyPI Publish Functions +################################################################################ + +publish_to_pypi () { + local env_name="$1" + local package_name="$2" + local pypi_token="$3" + if [ "$pypi_token" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME PYPI_TOKEN" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly-*.whl MY_TOKEN" + echo "" + echo "PYPI_TOKEN is missing!" + return 1 + else + echo "################################################################################" + echo "# Publish to PyPI" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + test_network_connection || return 1 + + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + + echo "[INSTALL] Installing twine ..." + # shellcheck disable=SC2086 + print_exec conda install ${env_prefix} -y twine + (test_python_import_package "${env_name}" twine) || return 1 + (test_python_import_package "${env_name}" OpenSSL) || return 1 + + echo "[PUBLISH] Uploading package(s) to PyPI: ${package_name} ..." + # shellcheck disable=SC2086 + conda run ${env_prefix} \ + python -m twine upload \ + --username __token__ \ + --password "${pypi_token}" \ + --skip-existing \ + --verbose \ + "${package_name}" + + echo "[PUBLISH] Successfully published package(s) to PyPI: ${package_name}" + echo "[PUBLISH] NOTE: The publish command is a successful no-op if the wheel version already existed in PyPI; please double check!" +} diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash index c586bc4ddd..8aaea9f4fd 100644 --- a/.github/scripts/utils_pytorch.bash +++ b/.github/scripts/utils_pytorch.bash @@ -10,6 +10,8 @@ . "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash" # shellcheck disable=SC1091,SC2128 . "$( dirname -- "$BASH_SOURCE"; )/utils_conda.bash" +# shellcheck disable=SC1091,SC2128 +. 
"$( dirname -- "$BASH_SOURCE"; )/utils_pip.bash" ################################################################################ # PyTorch Setup Functions @@ -73,8 +75,8 @@ install_pytorch_conda () { (test_python_import_package "${env_name}" torch.distributed) || return 1 # Print out the actual installed PyTorch version - # shellcheck disable=SC2086 - installed_pytorch_version=$(conda run ${env_prefix} python -c "import torch; print(torch.__version__)") + # shellcheck disable=SC2086,SC2155 + local installed_pytorch_version=$(conda run ${env_prefix} python -c "import torch; print(torch.__version__)") echo "[CHECK] NOTE: The installed version is: ${installed_pytorch_version}" # Run check for GPU variant @@ -123,73 +125,24 @@ install_pytorch_pip () { echo "" fi - test_network_connection || return 1 - - # Set the package variant - if [ "$pytorch_variant_type" == "cuda" ]; then - # Extract the CUDA version or default to 11.8.0 - local cuda_version="${pytorch_variant_version:-11.8.0}" - # shellcheck disable=SC2206 - local cuda_version_arr=(${cuda_version//./ }) - # Convert, i.e. cuda 11.7.1 => cu117 - local pytorch_variant="cu${cuda_version_arr[0]}${cuda_version_arr[1]}" - elif [ "$pytorch_variant_type" == "rocm" ]; then - # Extract the ROCM version or default to 5.5.1 - local rocm_version="${pytorch_variant_version:-5.5.1}" - # shellcheck disable=SC2206 - local rocm_version_arr=(${rocm_version//./ }) - # Convert, i.e. rocm 5.5.1 => rocm5.5 - local pytorch_variant="rocm${rocm_version_arr[0]}.${rocm_version_arr[1]}" - else - local pytorch_variant_type="cpu" - local pytorch_variant="cpu" - fi - echo "[INSTALL] Extracted PyTorch variant: ${pytorch_variant}" - - # Set the package name and installation channel - if [ "$pytorch_version" == "nightly" ] || [ "$pytorch_version" == "test" ]; then - local pytorch_package="--pre torch" - local pytorch_channel="https://download.pytorch.org/whl/${pytorch_version}/${pytorch_variant}/" - elif [ "$pytorch_version" == "latest" ]; then - local pytorch_package="torch" - local pytorch_channel="https://download.pytorch.org/whl/${pytorch_variant}/" - else - local pytorch_package="torch==${pytorch_version}+${pytorch_variant}" - local pytorch_channel="https://download.pytorch.org/whl/${pytorch_variant}/" - fi - # shellcheck disable=SC2155 local env_prefix=$(env_name_or_prefix "${env_name}") - echo "[INSTALL] Attempting to install PyTorch ${pytorch_version}+${pytorch_variant} through PIP using channel ${pytorch_channel} ..." 
- # shellcheck disable=SC2086 - (exec_with_retries conda run ${env_prefix} pip install ${pytorch_package} --extra-index-url ${pytorch_channel}) || return 1 + # Install the package from PyTorch PIP (not PyPI) + install_from_pytorch_pip "${env_name}" torch "${pytorch_version}" "${pytorch_variant_type}" "${pytorch_variant_version}" || return 1 # Check that PyTorch is importable (test_python_import_package "${env_name}" torch.distributed) || return 1 # Print out the actual installed PyTorch version - # shellcheck disable=SC2086 - installed_pytorch_version=$(conda run ${env_prefix} python -c "import torch; print(torch.__version__)") + # shellcheck disable=SC2086,SC2155 + local installed_pytorch_version=$(conda run ${env_prefix} python -c "import torch; print(torch.__version__)") echo "[CHECK] NOTE: The installed version is: ${installed_pytorch_version}" - if [ "$pytorch_variant_type" != "cpu" ]; then - # Ensure that the PyTorch build is of the correct variant - # This test usually applies to the PyTorch nightly builds - # shellcheck disable=SC2086 - if conda run ${env_prefix} pip list torch | grep torch | grep "${pytorch_variant}"; then - echo "[CHECK] The installed PyTorch ${pytorch_version} is the correct variant (${pytorch_variant})" - else - echo "[CHECK] The installed PyTorch ${pytorch_version} appears to be an incorrect variant as it is missing references to ${pytorch_variant}!" - echo "[CHECK] This can happen if the variant of PyTorch (e.g. GPU, nightly) for the MAJOR.MINOR version of CUDA or ROCm presently installed on the system is not available." - return 1 - fi - fi - if [ "$pytorch_variant_type" == "cuda" ]; then # Ensure that the PyTorch-CUDA headers are properly installed (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 fi - echo "[INSTALL] Successfully installed PyTorch through PIP" + echo "[INSTALL] Successfully installed PyTorch through PyTorch PIP" } From f030bbc181da2fae07cba2fccc26130a52e696f4 Mon Sep 17 00:00:00 2001 From: Abdul Zainul-Abedin Date: Mon, 18 Sep 2023 13:03:57 -0700 Subject: [PATCH 32/94] Add support for duplicate in permutations for permute_pooled_embs_split (#1940) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1940 This diff builds ontop of the pervious diffs and adds support for duplicates to the permute_pooled_embs_split op. Background Currently permute_pooled_embs_split does not support duplicates in a permutation, this poses a problem with passing the same embeddings to multiple modules. This doc proposes a solution to allow duplicate subsets in the resultant permutation. Details The required implementation of permute_pooled_embs_split should support a subset being repeated. This is represented by having duplicates in the permute list. This also results in the output list size being greater than the input list. 
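As a rough illustration of these semantics (a plain Python sketch, not part of this diff; the helper name is made up), each entry of the permute list selects the feature group delimited by the offset list, and a duplicated entry simply emits that group again:

# Illustrative sketch only (not part of this patch): duplicate-aware permutation
# of a flat pooled-embedding vector.
def permute_pooled(values, offset_dims, permute):
    # offset_dims[i]:offset_dims[i + 1] delimits feature group i
    groups = [values[offset_dims[i]:offset_dims[i + 1]]
              for i in range(len(offset_dims) - 1)]
    out = []
    for p in permute:  # a duplicated index re-emits the same group
        out.extend(groups[p])
    return out

# permute_pooled(list(range(10)), [0, 2, 5, 6, 10], [3, 0, 2, 1, 3])
# -> [6, 7, 8, 9, 0, 1, 5, 2, 3, 4, 6, 7, 8, 9]

The sketch uses a zero-based input (range(10)), which matches the output values in the worked example below and the unit test added in this diff.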
Input: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] Offset_dims: [0, 2, 5, 6, 10] Permute: [3, 0, 2, 1, 3] Output: [6, 7, 8, 9, 0, 1, 5, 2, 3, 4, 6, 7, 8, 9] Reviewed By: sryap Differential Revision: D48305847 fbshipit-source-id: 4c82683b725592cad458e83596617a14f4c6e988 --- .../permute_pooled_embedding_ops_split.h | 32 +++++++ .../permute_pooled_embedding_ops.cu | 1 + .../permute_pooled_embedding_ops_split.cu | 48 +++++++++- ...permute_pooled_embedding_ops_split_cpu.cpp | 66 +++++++++++++- ...permute_pooled_embedding_ops_split_gpu.cpp | 21 +++++ .../permute_pooled_embedding_split_test.py | 88 +++++++++++++++++++ 6 files changed, 249 insertions(+), 7 deletions(-) create mode 100644 fbgemm_gpu/test/permute_pooled_embedding_split_test.py diff --git a/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops_split.h b/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops_split.h index b35297a2ba..5a5908d53e 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops_split.h +++ b/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops_split.h @@ -22,6 +22,28 @@ at::Tensor permute_pooled_embs_split_cpu( const at::Tensor& inv_offset_dim_list, const at::Tensor& inv_permute_list); +// Implementation of permute_pooled_embs_split for GPU. This supports both the +// duplicate and non-duplicate cases with the allow_duplicates flag. +///@ingroup permute-pooled-embs-gpu-impl +at::Tensor permute_pooled_embs_split_gpu_impl( + const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const at::Tensor& offset_dim_list, + const at::Tensor& permute_list, + const at::Tensor& inv_offset_dim_list, + const at::Tensor& inv_permute_list, + const bool& allow_duplicates); + +// Implementation of permute_pooled_embs_split for GPU for the duplicate +// permutations use case. This calls the permute_pooled_embs_split_gpu_impl +// function. +///@ingroup permute-duplicate-pooled-embs-gpu +at::Tensor permute_duplicate_pooled_embs_split_gpu( + const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const at::Tensor& offset_dim_list, + const at::Tensor& permute_list, + const at::Tensor& inv_offset_dim_list, + const at::Tensor& inv_permute_list); + ///@ingroup permute-pooled-embs-gpu at::Tensor permute_pooled_embs_split_gpu( const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] @@ -38,6 +60,16 @@ at::Tensor permute_pooled_embs_auto_grad_split_cpu( const at::Tensor& inv_offset_dim_list, const at::Tensor& inv_permute_list); +// Implementation of permute_pooled_embs_auto_grad_split for GPU for the +// duplicate permutations use case. 
+///@ingroup permute-duplicate-pooled-embs-gpu +at::Tensor permute_duplicate_pooled_embs_auto_grad_split_gpu( + const at::Tensor& pooled_embs, + const at::Tensor& offset_dim_list, + const at::Tensor& permute_list, + const at::Tensor& inv_offset_dim_list, + const at::Tensor& inv_permute_list); + ///@ingroup permute-pooled-embs-gpu at::Tensor permute_pooled_embs_auto_grad_split_gpu( const at::Tensor& pooled_embs, diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu index 56f8315a94..0f4a219f6a 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu @@ -57,6 +57,7 @@ Tensor permute_pooled_embs_gpu( inv_permute_list, false); } + Tensor permute_pooled_embs_gpu_impl( const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] const Tensor& offset_dim_list, diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu index a0788581a5..6221ea63a7 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu @@ -28,6 +28,43 @@ Tensor permute_pooled_embs_split_gpu( const Tensor& permute_list, const Tensor& inv_offset_dim_list, const Tensor& inv_permute_list) { + TORCH_CHECK(offset_dim_list.numel() == permute_list.numel() + 1); + TORCH_CHECK(offset_dim_list.numel() == inv_offset_dim_list.numel()); + + return permute_pooled_embs_split_gpu_impl( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list, + false); +} + +Tensor permute_duplicate_pooled_embs_split_gpu( + const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const Tensor& offset_dim_list, + const Tensor& permute_list, + const Tensor& inv_offset_dim_list, + const Tensor& inv_permute_list) { + TORCH_CHECK(offset_dim_list.numel() > 0); + TORCH_CHECK(inv_offset_dim_list.numel() > 0); + + return permute_pooled_embs_split_gpu_impl( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list, + true); +} + +Tensor permute_pooled_embs_split_gpu_impl( + const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const Tensor& offset_dim_list, + const Tensor& permute_list, + const Tensor& inv_offset_dim_list, + const Tensor& inv_permute_list, + const bool& allow_duplicates) { // inv_permute_list is not being used so it's not checked here. TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( pooled_embs, offset_dim_list, permute_list, inv_offset_dim_list); @@ -45,9 +82,14 @@ Tensor permute_pooled_embs_split_gpu( TENSORS_ON_SAME_DEVICE(pooled_embs_contiguous, offset_dim_list); TENSORS_ON_SAME_DEVICE(pooled_embs_contiguous, permute_list); TENSORS_ON_SAME_DEVICE(pooled_embs_contiguous, inv_offset_dim_list); - TORCH_CHECK(offset_dim_list.numel() == permute_list.numel() + 1); - TORCH_CHECK(offset_dim_list.numel() == inv_offset_dim_list.numel()); - Tensor permuted_pooled_embs = at::empty_like(pooled_embs_contiguous); + + // Last index in inv_offset_dim_list contains the size of output. + // This will cause a D->H sync. + const int64_t permuted_embs_dim_sum = + allow_duplicates ? 
inv_offset_dim_list[-1].item() : dim_sum; + Tensor permuted_pooled_embs = at::empty( + {pooled_embs_contiguous.size(0), permuted_embs_dim_sum}, + pooled_embs_contiguous.options()); // This kernel is moving D elements per warp. // We are launching ( div_round_up(T, warp_per_block), B ) blocks. diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp index 866873d044..1935f2a83a 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp @@ -24,12 +24,13 @@ using torch::autograd::AutogradContext; using torch::autograd::Variable; using torch::autograd::variable_list; -Tensor permute_pooled_embs_split_cpu( +Tensor permute_pooled_embs_split_cpu_impl( const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] const Tensor& offset_dim_list, const Tensor& permute_list, const Tensor& inv_offset_dim_list, - const Tensor& inv_permute_list) { + const Tensor& inv_permute_list, + const bool& allow_duplicates) { TORCH_CHECK( offset_dim_list.scalar_type() == at::ScalarType::Long, "offset_dim_list needs to have long/int64 type") @@ -38,9 +39,10 @@ Tensor permute_pooled_embs_split_cpu( "permute_list needs to have long/int64 type") auto permute = permute_list.data_ptr(); const auto n = permute_list.numel(); + const auto dims_size = allow_duplicates ? offset_dim_list.numel() : n; std::vector dims; - dims.reserve(n - 1); - for (const auto i : c10::irange(1, n)) { + dims.reserve(dims_size - 1); + for (const auto i : c10::irange(1, dims_size)) { dims.push_back(offset_dim_list[i].item()); } auto ts = pooled_embs.tensor_split(dims, 1); @@ -52,6 +54,36 @@ Tensor permute_pooled_embs_split_cpu( return at::cat(permuted_ts, 1); } +Tensor permute_pooled_embs_split_cpu( + const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const Tensor& offset_dim_list, + const Tensor& permute_list, + const Tensor& inv_offset_dim_list, + const Tensor& inv_permute_list) { + return permute_pooled_embs_split_cpu_impl( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list, + false); +} + +Tensor permute_duplicate_pooled_embs_split_cpu( + const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const Tensor& offset_dim_list, + const Tensor& permute_list, + const Tensor& inv_offset_dim_list, + const Tensor& inv_permute_list) { + return permute_pooled_embs_split_cpu_impl( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list, + true); +} + Tensor permute_pooled_embs_auto_grad_split_cpu( const Tensor& pooled_embs, const Tensor& offset_dim_list, @@ -65,6 +97,22 @@ Tensor permute_pooled_embs_auto_grad_split_cpu( inv_offset_dim_list, inv_permute_list); } + +Tensor permute_duplicate_pooled_embs_auto_grad_split_cpu( + const Tensor& pooled_embs, + const Tensor& offset_dim_list, + const Tensor& permute_list, + const Tensor& inv_offset_dim_list, + const Tensor& inv_permute_list) { + return PermutePooledEmbsFunctionSplit< + permute_duplicate_pooled_embs_split_cpu>:: + apply( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list); +} } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { @@ -72,9 +120,19 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "permute_pooled_embs_split(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor 
inv_permute_list) -> Tensor"); DISPATCH_TO_CPU( "permute_pooled_embs_split", fbgemm_gpu::permute_pooled_embs_split_cpu); + m.def( + "permute_duplicate_pooled_embs_split(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); + DISPATCH_TO_CPU( + "permute_duplicate_pooled_embs_split", + fbgemm_gpu::permute_duplicate_pooled_embs_split_cpu); m.def( "permute_pooled_embs_auto_grad_split(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); DISPATCH_TO_CPU( "permute_pooled_embs_auto_grad_split", fbgemm_gpu::permute_pooled_embs_auto_grad_split_cpu); + m.def( + "permute_duplicate_pooled_embs_auto_grad_split(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); + DISPATCH_TO_CPU( + "permute_duplicate_pooled_embs_auto_grad_split", + fbgemm_gpu::permute_duplicate_pooled_embs_auto_grad_split_cpu); } diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp index 95b0cf014a..2831a22fbb 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp @@ -34,12 +34,33 @@ Tensor permute_pooled_embs_auto_grad_split_gpu( inv_permute_list); } +Tensor permute_duplicate_pooled_embs_auto_grad_split_gpu( + const Tensor& pooled_embs, + const Tensor& offset_dim_list, + const Tensor& permute_list, + const Tensor& inv_offset_dim_list, + const Tensor& inv_permute_list) { + return PermutePooledEmbsFunctionSplit< + permute_duplicate_pooled_embs_split_gpu>:: + apply( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list); +} } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "permute_pooled_embs_split", fbgemm_gpu::permute_pooled_embs_split_gpu); + DISPATCH_TO_CUDA( + "permute_duplicate_pooled_embs_split", + fbgemm_gpu::permute_duplicate_pooled_embs_split_gpu); DISPATCH_TO_CUDA( "permute_pooled_embs_auto_grad_split", fbgemm_gpu::permute_pooled_embs_auto_grad_split_gpu); + DISPATCH_TO_CUDA( + "permute_duplicate_pooled_embs_auto_grad_split", + fbgemm_gpu::permute_duplicate_pooled_embs_auto_grad_split_gpu); } diff --git a/fbgemm_gpu/test/permute_pooled_embedding_split_test.py b/fbgemm_gpu/test/permute_pooled_embedding_split_test.py new file mode 100644 index 0000000000..a5e537912e --- /dev/null +++ b/fbgemm_gpu/test/permute_pooled_embedding_split_test.py @@ -0,0 +1,88 @@ +#!/usr/bin/env fbpython +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
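+
+# Unit tests for the duplicate-aware permute_pooled_embs_split ops
+# (permute_duplicate_pooled_embs_split and its auto_grad variant).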
+ +import unittest +from itertools import accumulate +from typing import List, Tuple + +import torch +import torch._dynamo + +try: + # pyre-ignore[21] + from fbgemm_gpu import open_source # noqa: F401 + + # pyre-ignore[21] + from test_utils import gpu_unavailable +except Exception: + torch.ops.load_library( + "//deeplearning/fbgemm/fbgemm_gpu:permute_pooled_embedding_ops_split_gpu" + ) + torch.ops.load_library( + "//deeplearning/fbgemm/fbgemm_gpu:permute_pooled_embedding_ops_split_cpu" + ) + from fbgemm_gpu.test.test_utils import gpu_unavailable + +typed_gpu_unavailable: Tuple[bool, str] = gpu_unavailable + + +class PermutePooledEmbeddingSplitTest(unittest.TestCase): + def setUp(self) -> None: + super().setUp() + self.device = "cuda" + + @unittest.skipIf(*typed_gpu_unavailable) + def test_duplicate_permutations(self) -> None: + # self.device = "cuda" + embs_dims = [2, 3, 1, 4] + permute = [3, 0, 2, 0, 1, 3] + expected_result = [6, 7, 8, 9, 0, 1, 5, 0, 1, 2, 3, 4, 6, 7, 8, 9] + input = torch.Tensor([range(10)]).to(device="cuda") + + _permute = torch.tensor(permute, device=self.device, dtype=torch.int64) + _offset_dim_list = torch.tensor( + [0] + list(accumulate(embs_dims)), device=self.device, dtype=torch.int64 + ) + inv_permute: List[int] = [0] * len(permute) + for i, p in enumerate(permute): + inv_permute[p] = i + _inv_permute = torch.tensor(inv_permute, device=self.device, dtype=torch.int64) + inv_embs_dims = [embs_dims[i] for i in permute] + _inv_offset_dim_list = torch.tensor( + [0] + list(accumulate(inv_embs_dims)), + device=self.device, + dtype=torch.int64, + ) + + result = torch.ops.fbgemm.permute_duplicate_pooled_embs_auto_grad_split( + input, + _offset_dim_list.to(device=input.device), + _permute.to(device=input.device), + _inv_offset_dim_list.to(device=input.device), + _inv_permute.to(device=input.device), + ) + self.assertEqual( + result.view(16).tolist(), + expected_result, + ) + + input = input.to(device="cpu") + result = torch.ops.fbgemm.permute_duplicate_pooled_embs_auto_grad_split( + input, + _offset_dim_list.to(device=input.device), + _permute.to(device=input.device), + _inv_offset_dim_list.to(device=input.device), + _inv_permute.to(device=input.device), + ) + self.assertEqual( + result.view(16).tolist(), + expected_result, + ) + + +if __name__ == "__main__": + unittest.main() From aae88d6022c6bc017303793217788dcecabcd9c1 Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Tue, 19 Sep 2023 12:02:36 -0700 Subject: [PATCH 33/94] Enable sm 9.0 for cuda 12.1 (#2002) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2002 Enable sm 9.0 (H100) for cuda 12.1 Reviewed By: q10 Differential Revision: D49032365 fbshipit-source-id: 9dc5abe284190aa4d3accaeb178a8b5cc370b5b6 --- .github/scripts/fbgemm_gpu_build.bash | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index e651f2d727..12b3c49ca4 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -112,7 +112,14 @@ __configure_fbgemm_gpu_build_cuda () { else echo "[BUILD] Using the default CUDA targets ..." 
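+    # (illustrative) `nvcc --version` output ends with a line like
+    #   "Cuda compilation tools, release 12.1, V12.1.105"
+    # so the check below keys off the "V12.1" substring.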
- local arch_list="7.0;8.0" + # For cuda version 12.1, enable sm 9.0 + cuda_version_nvcc=$(conda run -n "${env_name}" nvcc --version) + echo "$cuda_version_nvcc" + if [[ $cuda_version_nvcc == *"V12.1"* ]]; then + local arch_list="7.0;8.0;9.0" + else + local arch_list="7.0;8.0" + fi fi # Unset the environment-supplied TORCH_CUDA_ARCH_LIST because it will take From e41a2c52cee987140d444c3e16e787519a30e024 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 19 Sep 2023 16:12:24 -0700 Subject: [PATCH 34/94] Fix PIP install workflows (#2026) Summary: - Fix package name from fbgemm_gpu to fbgemm-gpu to make post-install checks pass - Add numpy as an install dependency for the fbgemm-gpu package - Rename workflow files for consistency Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2026 Reviewed By: spcyppt Differential Revision: D49408816 Pulled By: q10 fbshipit-source-id: aa7f90625c48a0cf0732c788050a189c28418479 --- .github/scripts/fbgemm_gpu_install.bash | 12 +++--- .github/scripts/utils_base.bash | 2 +- .github/scripts/utils_pip.bash | 8 +++- ...nux.yml => build_wheels_linux_aarch64.yml} | 0 ...s-linux.yml => build_wheels_linux_x86.yml} | 2 +- .github/workflows/fbgemm_gpu_cpu_nightly.yml | 2 +- .github/workflows/fbgemm_gpu_cpu_release.yml | 2 +- .github/workflows/fbgemm_gpu_cuda_nightly.yml | 2 +- .github/workflows/fbgemm_gpu_cuda_release.yml | 2 +- .github/workflows/fbgemm_gpu_pip.yml | 38 ++++++++++++------- fbgemm_gpu/setup.py | 6 +++ 11 files changed, 50 insertions(+), 26 deletions(-) rename .github/workflows/{build_wheels_aarch64_linux.yml => build_wheels_linux_aarch64.yml} (100%) rename .github/workflows/{build-wheels-linux.yml => build_wheels_linux_x86.yml} (98%) diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash index 84c105083a..e78ca79bd6 100644 --- a/.github/scripts/fbgemm_gpu_install.bash +++ b/.github/scripts/fbgemm_gpu_install.bash @@ -16,6 +16,10 @@ ################################################################################ __fbgemm_gpu_post_install_checks () { + local env_name="$1" + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[INSTALL] Checking imports and symbols ..." (test_python_import_package "${env_name}" fbgemm_gpu) || return 1 (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 @@ -56,7 +60,7 @@ install_fbgemm_gpu_wheel () { # shellcheck disable=SC2086 (exec_with_retries conda run ${env_prefix} python -m pip install "${wheel_path}") || return 1 - __fbgemm_gpu_post_install_checks || return 1 + __fbgemm_gpu_post_install_checks "${env_name}" || return 1 echo "[INSTALL] FBGEMM-GPU installation through wheel completed ..." 
} @@ -80,14 +84,12 @@ install_fbgemm_gpu_pip () { echo "" fi - # shellcheck disable=SC2155 - local env_prefix=$(env_name_or_prefix "${env_name}") - # Install the package from PyTorch PIP (not PyPI) + # The package's canonical name is 'fbgemm-gpu' (hyphen, not underscore) install_from_pytorch_pip "${env_name}" fbgemm_gpu "${fbgemm_gpu_version}" "${fbgemm_gpu_variant_type}" "${fbgemm_gpu_variant_version}" || return 1 # Run post-installation checks - __fbgemm_gpu_post_install_checks || return 1 + __fbgemm_gpu_post_install_checks "${env_name}" || return 1 echo "[INSTALL] Successfully installed FBGEMM-GPU through PyTorch PIP" } diff --git a/.github/scripts/utils_base.bash b/.github/scripts/utils_base.bash index 8be352ad3e..05ff368900 100644 --- a/.github/scripts/utils_base.bash +++ b/.github/scripts/utils_base.bash @@ -83,7 +83,7 @@ env_name_or_prefix () { } test_network_connection () { - wget --timeout 1 pypi.org -O /dev/null + wget -q --timeout 1 pypi.org -O /dev/null local exit_status=$? # https://man7.org/linux/man-pages/man1/wget.1.html diff --git a/.github/scripts/utils_pip.bash b/.github/scripts/utils_pip.bash index f649be68eb..4782632a3e 100644 --- a/.github/scripts/utils_pip.bash +++ b/.github/scripts/utils_pip.bash @@ -15,7 +15,7 @@ install_from_pytorch_pip () { local env_name="$1" - local package_name="$2" + local package_name_raw="$2" local package_version="$3" local package_variant_type="$4" local package_variant_version="$5" @@ -29,7 +29,7 @@ install_from_pytorch_pip () { return 1 else echo "################################################################################" - echo "# Install ${package_name} (PyTorch PIP)" + echo "# Install ${package_name_raw} (PyTorch PIP)" echo "#" echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" echo "################################################################################" @@ -38,6 +38,10 @@ install_from_pytorch_pip () { test_network_connection || return 1 + # Replace underscores with hyphens to materialize the canonical name of the package + # shellcheck disable=SC2155 + local package_name=$(echo "${package_name_raw}" | tr '_' '-') + # Set the package variant if [ "$package_variant_type" == "cuda" ]; then # Extract the CUDA version or default to 11.8.0 diff --git a/.github/workflows/build_wheels_aarch64_linux.yml b/.github/workflows/build_wheels_linux_aarch64.yml similarity index 100% rename from .github/workflows/build_wheels_aarch64_linux.yml rename to .github/workflows/build_wheels_linux_aarch64.yml diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build_wheels_linux_x86.yml similarity index 98% rename from .github/workflows/build-wheels-linux.yml rename to .github/workflows/build_wheels_linux_x86.yml index bca0fd5f10..dfb141d6f6 100644 --- a/.github/workflows/build-wheels-linux.yml +++ b/.github/workflows/build_wheels_linux_x86.yml @@ -1,4 +1,4 @@ -name: Build Linux Wheels +name: Build x86 Linux Wheels on: pull_request: diff --git a/.github/workflows/fbgemm_gpu_cpu_nightly.yml b/.github/workflows/fbgemm_gpu_cpu_nightly.yml index 2b279bc5ea..e273c4e64c 100644 --- a/.github/workflows/fbgemm_gpu_cpu_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cpu_nightly.yml @@ -6,7 +6,7 @@ name: FBGEMM_GPU-CPU Nightly Build on: - # PR Trigger (enabled only for debugging) + # PR Trigger (enabled for regression checks and debugging) # pull_request: branches: diff --git a/.github/workflows/fbgemm_gpu_cpu_release.yml b/.github/workflows/fbgemm_gpu_cpu_release.yml index 6e11f7d022..87aa54ac75 100644 --- 
a/.github/workflows/fbgemm_gpu_cpu_release.yml +++ b/.github/workflows/fbgemm_gpu_cpu_release.yml @@ -6,7 +6,7 @@ name: FBGEMM_GPU-CPU Release Build on: - # PR Trigger (enabled only for debugging) + # PR Trigger (enabled for regression checks and debugging) # pull_request: branches: diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml index a65e9e21fa..420e879ea8 100644 --- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -6,7 +6,7 @@ name: FBGEMM_GPU-CUDA Nightly Build on: - # PR Trigger (enabled only for debugging) + # PR Trigger (enabled for regression checks and debugging) # pull_request: branches: diff --git a/.github/workflows/fbgemm_gpu_cuda_release.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml index 20c9b188f3..571ac76b0d 100644 --- a/.github/workflows/fbgemm_gpu_cuda_release.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -6,7 +6,7 @@ name: FBGEMM_GPU-CUDA Release Build on: - # PR Trigger (enabled only for debugging) + # PR Trigger (enabled for regression checks and debugging) # pull_request: branches: diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml index d2903e6b99..fafab2949b 100644 --- a/.github/workflows/fbgemm_gpu_pip.yml +++ b/.github/workflows/fbgemm_gpu_pip.yml @@ -6,12 +6,22 @@ name: FBGEMM_GPU PIP Install + Test on: + # PR Trigger (enabled for regression checks and debugging) + # + pull_request: + branches: + - main + # Manual Trigger # workflow_dispatch: inputs: + pytorch_version: + description: PyTorch Version (e.g. '2.1.0', 'nightly', 'test') + type: string + required: true fbgemm_gpu_version: - description: FBGEMM-GPU Version (e.g. '0.5.0rc1') + description: FBGEMM-GPU Version (e.g. '0.5.0rc1', 'nightly', 'test') type: string required: true fbgemm_gpu_variant_type: @@ -20,15 +30,11 @@ on: required: true options: [ "cpu", "cuda", "rocm" ] default: "cpu" - fbgemm_gpu_variant_version: - description: FBGEMM-GPU Variant Version (e.g. 'CUDA 12.1.1' --> 12.1.1) - type: string - required: false jobs: test_pypi_install_cpu: - if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cpu' }} + if: ${{ github.event_name == 'pull_request' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cpu') }} runs-on: ${{ matrix.host-machine.instance }} container: image: amazonlinux:2023 @@ -67,11 +73,14 @@ jobs: - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + - name: Install Build Tools + run: . $PRELUDE; install_build_tools $BUILD_ENV + - name: Install PyTorch-CPU - run: . $PRELUDE; install_pytorch_pip $BUILD_ENV test cpu + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_version || 'nightly' }} cpu - name: Install FBGEMM_GPU-CPU - run: . $PRELUDE; cd fbgemm_gpu; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version }} cpu + run: . 
$PRELUDE; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version || 'nightly' }} cpu - name: Test with PyTest timeout-minutes: 10 @@ -79,7 +88,7 @@ jobs: test_pypi_install_cuda: - if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cuda' }} + if: ${{ github.event_name == 'pull_request' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cuda') }} runs-on: ${{ matrix.host-machine.instance }} defaults: run: @@ -118,14 +127,17 @@ jobs: - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + - name: Install Build Tools + run: . $PRELUDE; install_build_tools $BUILD_ENV + - name: Install CUDA run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }} - name: Install PyTorch-CUDA - run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }} + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_version || 'nightly' }} cuda ${{ matrix.cuda-version }} - name: Install FBGEMM_GPU-CUDA - run: . $PRELUDE; cd fbgemm_gpu; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version }} cuda ${{ github.event.inputs.fbgemm_gpu_variant_version }} + run: . $PRELUDE; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version || 'nightly' }} cuda ${{ matrix.cuda-version }} - name: Test with PyTest timeout-minutes: 10 @@ -184,10 +196,10 @@ jobs: run: . $PRELUDE; install_build_tools $BUILD_ENV - name: Install PyTorch-ROCm - run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_version || 'nightly' }} rocm ${{ matrix.rocm-version }} - name: Install FBGEMM_GPU-ROCm - run: . $PRELUDE; cd fbgemm_gpu; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version }} rocm ${{ github.event.inputs.fbgemm_gpu_variant_version }} + run: . $PRELUDE; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version || 'nightly' }} rocm ${{ matrix.rocm-version }} - name: Test FBGEMM_GPU-ROCm timeout-minutes: 15 diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index fba4846ca3..dfe98d384f 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -318,6 +318,12 @@ def main(argv: List[str]) -> None: "CUDA", ], packages=["fbgemm_gpu"], + install_requires=[ + # Only specify numpy, as specifying torch will auto-install the + # release version of torch, which is not what we want for the + # nightly and test packages + "numpy", + ], cmake_args=cmake_environment_variables(args), cmdclass={ "install": FbgemmGpuInstaller, From 39507643d7a5b6e1f8756969932c5b254f89add5 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Wed, 20 Sep 2023 14:02:34 -0700 Subject: [PATCH 35/94] Add BF16 in padded FP8 quantize ops (#2010) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2010 - Add BF16 support in `FloatToPaddedFP8RowwiseQuantized` and `PaddedFP8RowwiseQuantizedToFloat`. 
- Refactor `src/quantize_ops/quantize_fp8_rowwise.cu` - Move unit test from `hpc` to `fbgemm_gpu` Reviewed By: jianyuh, summerdengfb, qchip Differential Revision: D49166595 fbshipit-source-id: 21f65fd3b3f8d237697cc39dcc10142b751dd4c5 --- .../include/fbgemm_gpu/fbgemm_cuda_utils.cuh | 32 +++++ fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h | 3 +- .../src/quantize_ops/quantize_fp8_rowwise.cu | 93 +++----------- .../src/quantize_ops/quantize_ops_cpu.cpp | 2 +- .../quantize_padded_fp8_rowwise.cu | 51 ++++---- fbgemm_gpu/test/quantize_ops_test.py | 118 ++++++++++++++++-- 6 files changed, 189 insertions(+), 110 deletions(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh index 167f3109a2..1f33253964 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh @@ -1696,6 +1696,19 @@ static DEVICE_INLINE float __bfloat162float(const at::BFloat16 input) { #endif } +// Helper functions for converting data to float +static DEVICE_INLINE float to_float(const float input) { + return input; +} + +static DEVICE_INLINE float to_float(const at::Half input) { + return __half2float(input); +} + +static DEVICE_INLINE float to_float(const at::BFloat16 input) { + return __bfloat162float(input); +} + #ifdef __HIP_PLATFORM_HCC__ // the descriptions of __float2bfloat16 and __float2bfloat16_rn are identical // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____BFLOAT16__MISC.html#group__CUDA__MATH____BFLOAT16__MISC @@ -1710,6 +1723,25 @@ static __host__ __device__ __nv_bfloat16 __float2bfloat16_rn(float f) { } #endif +// Helper functions for storing float in quantized storage +static DEVICE_INLINE void quantize_float_store( + at::BFloat16* output, + const float input) { + *reinterpret_cast<__nv_bfloat16*>(output) = __float2bfloat16(input); +} + +static DEVICE_INLINE void quantize_float_store( + at::Half* output, + const float input) { + *output = __float2half(input); +} + +static DEVICE_INLINE void quantize_float_store( + float* output, + const float input) { + *output = input; +} + #if !( \ ((defined(CUDA_VERSION) && CUDA_VERSION < 11000) || \ (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))) diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h index 5b993373e1..dbeccc28f8 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h @@ -226,7 +226,8 @@ at::Tensor _paddedFP8rowwise_to_float_gpu( const at::Tensor& input, const bool forward = true, const int64_t row_dim = 256, - const int64_t output_last_dim = -1); + const int64_t output_last_dim = -1, + const int64_t output_dtype = 0); at::Tensor _fused8bitrowwise_to_half_gpu(const at::Tensor& input); at::Tensor _fused8bitrowwise_to_float_or_half_gpu( const at::Tensor& input, diff --git a/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu b/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu index 1ca26fbf6e..cde36c3d55 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu +++ b/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu @@ -48,16 +48,8 @@ __global__ inline void _float_to_FP8rowwise_cuda_kernel( max_pos / (kEpsilon + fmaxf(maximum_element, -minimum_element)); output_row_scale_bias[0] = scale; for (int64_t col = 0; col < ncols; ++col) { - if constexpr (std::is_same::value) { - output_row[col] = float_to_hfp8( - __bfloat162float(input_row[col]) * scale, ebit, bias, max_pos); - } else if constexpr 
(std::is_same::value) { - output_row[col] = float_to_hfp8( - __half2float(input_row[col]) * scale, ebit, bias, max_pos); - } else { - output_row[col] = - float_to_hfp8(input_row[col] * scale, ebit, bias, max_pos); - } + output_row[col] = + float_to_hfp8(to_float(input_row[col]) * scale, ebit, bias, max_pos); } } } @@ -95,15 +87,7 @@ __global__ inline void _get_FP8_qparam_cuda_kernel( for (int64_t col = threadIdx.x; col < ncols; col += lane_width) { // Get thread-local minmax. These are the smallest min and max ever seen // by this thread. - if constexpr (std::is_same::value) { - maximum_element = - fmaxf(maximum_element, fabs(__bfloat162float(input_row[col]))); - } else if constexpr (std::is_same::value) { - maximum_element = - fmaxf(maximum_element, fabs(__half2float(input_row[col]))); - } else { - maximum_element = fmaxf(maximum_element, fabs(input_row[col])); - } + maximum_element = fmaxf(maximum_element, fabs(to_float(input_row[col]))); } } @@ -164,16 +148,8 @@ __global__ inline void _compute_FP8_quantize_cuda_kernel( // TODO: lift range_list into shared memory. However, when nrows is large, // it might exceed the size of shared memory. // output_addr[0] = lrintf((input[input_idx] - bias) * inverse_scale); - if constexpr (std::is_same::value) { - output_addr[0] = float_to_hfp8( - __bfloat162float(input[input_idx]) * scale, ebit, bias, max_pos); - } else if constexpr (std::is_same::value) { - output_addr[0] = float_to_hfp8( - __half2float(input[input_idx]) * scale, ebit, bias, max_pos); - } else { - output_addr[0] = - float_to_hfp8(input[input_idx] * scale, ebit, bias, max_pos); - } + output_addr[0] = float_to_hfp8( + to_float(input[input_idx]) * scale, ebit, bias, max_pos); } } } @@ -201,15 +177,7 @@ __global__ inline void _FP8rowwise_to_float_cuda_kernel( const float output_ = hfp8_to_float(input_row[col], ebit, bias) / input_row_scale_bias[0]; - - if constexpr (std::is_same::value) { - *reinterpret_cast<__nv_bfloat16*>(&output_row[col]) = - __float2bfloat16(output_); - } else if constexpr (std::is_same::value) { - output_row[col] = __half2float(output_); - } else { - output_row[col] = output_; - } + quantize_float_store(&output_row[col], output_); } } } @@ -348,8 +316,10 @@ _float_to_FP8rowwise_gpu(const Tensor& input, const bool forward) { } } -template -Tensor _FP8rowwise_to_float_gpu_t(const Tensor& input, bool forward) { +Tensor _FP8rowwise_to_float_gpu_t( + const Tensor& input, + bool forward, + const int64_t output_dtype) { TENSOR_ON_CUDA_GPU(input); TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); @@ -371,24 +341,14 @@ Tensor _FP8rowwise_to_float_gpu_t(const Tensor& input, bool forward) { // that size). 
auto output_dims = input_sizes.vec(); output_dims[last_dim] = output_columns; - Tensor output; - if constexpr (std::is_same_v) { - output = at::empty( - output_dims, // 4 = sizeof(float) - input.options().dtype(at::kFloat)); - } else if constexpr (std::is_same_v) { // T = at::Half - output = at::empty( - output_dims, // 4 = sizeof(float) - input.options().dtype(at::kHalf)); - } else if constexpr (std::is_same_v< - output_t, - __nv_bfloat16>) { // T = at::BFloat16 - output = at::empty( - output_dims, // 4 = sizeof(float) - input.options().dtype(at::kBFloat16)); - } else { - TORCH_CHECK(false); - } + const auto output_sdtype = static_cast(output_dtype); + TORCH_CHECK( + output_sdtype == SparseType::FP32 || output_sdtype == SparseType::FP16 || + output_sdtype == SparseType::BF16); + + Tensor output = at::empty( + output_dims, // 4 = sizeof(float) + input.options().dtype(getScalarType(output_sdtype))); if (nrows == 0 || output_columns == 0) { return output; @@ -422,22 +382,7 @@ DLL_PUBLIC at::Tensor _FP8rowwise_to_float_gpu( const at::Tensor& input, bool forward, const int64_t output_dtype) { - SparseType output_sparse_dtype = static_cast(output_dtype); - Tensor output; - switch (output_sparse_dtype) { - case SparseType::FP32: - output = _FP8rowwise_to_float_gpu_t(input, forward); - break; - case SparseType::FP16: - output = _FP8rowwise_to_float_gpu_t(input, forward); - break; - case SparseType::BF16: - output = _FP8rowwise_to_float_gpu_t<__nv_bfloat16>(input, forward); - break; - default: - TORCH_CHECK(false); - } - return output; + return _FP8rowwise_to_float_gpu_t(input, forward, output_dtype); } } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp b/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp index b30934804b..9521f5a4c4 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp +++ b/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp @@ -446,7 +446,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "MSFPQuantizedToFloat(Tensor input, int ebits, int mbits, int bias) -> Tensor"); m.def( - "PaddedFP8RowwiseQuantizedToFloat(Tensor input, bool forward, int row_dim, int output_last_dim=-1) -> Tensor"); + "PaddedFP8RowwiseQuantizedToFloat(Tensor input, bool forward, int row_dim, int output_last_dim=-1, int output_dtype=0) -> Tensor"); } TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { diff --git a/fbgemm_gpu/src/quantize_ops/quantize_padded_fp8_rowwise.cu b/fbgemm_gpu/src/quantize_ops/quantize_padded_fp8_rowwise.cu index 4ca15d12a0..cfffc1746f 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_padded_fp8_rowwise.cu +++ b/fbgemm_gpu/src/quantize_ops/quantize_padded_fp8_rowwise.cu @@ -61,7 +61,7 @@ __global__ inline void _float_to_paddedFP8rowwise_cuda_kernel( *reinterpret_cast((row == threads - 1) ? &pad : &last_buc_idx); for (int col = 0; col < range; col += 1) { output_row[col] = - float_to_hfp8(input_row[col] * scale, ebit, bias, max_pos); + float_to_hfp8(to_float(input_row[col]) * scale, ebit, bias, max_pos); } return; } @@ -88,8 +88,8 @@ __global__ inline void _float_to_paddedFP8rowwise_cuda_kernel( output_row_scale[1] = *reinterpret_cast( (ncols - col > row_dim) ? 
&last_buc_idx : &pad); for (int bi = 0; bi < std::min(row_dim, (int)(ncols - col)); ++bi) { - output_row[col + bi + col_offset] = - float_to_hfp8(input_row[col + bi] * scale, ebit, bias, max_pos); + output_row[col + bi + col_offset] = float_to_hfp8( + to_float(input_row[col + bi]) * scale, ebit, bias, max_pos); } } } @@ -160,7 +160,8 @@ __global__ inline void _PaddedFP8rowwise_to_float_1d_cuda_kernel( const auto pad_offset = offsets[row]; output_t* output_row = output + row * row_dim - pad_offset; for (int col = threadIdx.x; col < row_dim - pad; col += blockDim.x) { - output_row[col] = hfp8_to_float(input_row[col], ebit, bias) / scale; + const auto output_ = hfp8_to_float(input_row[col], ebit, bias) / scale; + quantize_float_store(&output_row[col], output_); } } @@ -193,8 +194,9 @@ __global__ inline void _PaddedFP8rowwise_to_float_2d_cuda_kernel( // bucket pad = (pad > 0) ? pad : 0; for (int bi = 0; bi < row_dim - pad; ++bi) { - output_row[col + bi - col_offset] = + const auto output_ = hfp8_to_float(input_row[col + bi], ebit, bias) / input_row_scale[0]; + quantize_float_store(&output_row[col + bi - col_offset], output_); } col_offset = col_offset + 8 + pad; } @@ -203,7 +205,6 @@ __global__ inline void _PaddedFP8rowwise_to_float_2d_cuda_kernel( } // namespace // revising INT8 rowwise template for FP8 rowwise quantization -template Tensor _float_to_paddedFP8rowwise_gpu_t( const Tensor& input, const bool forward, @@ -241,7 +242,7 @@ Tensor _float_to_paddedFP8rowwise_gpu_t( const auto num_blocks = cuda_calc_xblock_count( nrows == 1 ? (ncols + row_dim - 1) / row_dim : nrows, threads_per_block); - FBGEMM_DISPATCH_FLOAT_AND_HALF( + FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16( input.scalar_type(), "_float_to_FP8rowwise_cuda_kernel", [&] { _float_to_paddedFP8rowwise_cuda_kernel << Tensor _paddedFP8rowwise_to_float_gpu_t( const Tensor& input, const bool forward, const int64_t row_dim, - const int64_t output_last_dim) { + const int64_t output_last_dim, + const int64_t output_dtype) { TENSOR_ON_CUDA_GPU(input); TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); @@ -328,16 +329,15 @@ Tensor _paddedFP8rowwise_to_float_gpu_t( } output_dims[last_dim] = output_columns; - Tensor output; - if constexpr (std::is_same_v) { - output = at::empty( - output_dims, // 4 = sizeof(float) - input.options().dtype(at::kFloat)); - } else { // T = at::Half - output = at::empty( - output_dims, // 4 = sizeof(float) - input.options().dtype(at::kHalf)); - } + + const auto output_sdtype = static_cast(output_dtype); + TORCH_CHECK( + output_sdtype == SparseType::FP32 || output_sdtype == SparseType::FP16 || + output_sdtype == SparseType::BF16); + + Tensor output = at::empty( + output_dims, // 4 = sizeof(float) + input.options().dtype(getScalarType(output_sdtype))); if (nrows == 0 || output_columns == 0) { return output; @@ -357,7 +357,7 @@ Tensor _paddedFP8rowwise_to_float_gpu_t( constexpr int kMaxThreads = 1024; const auto threads_per_block = kMaxThreads < row_dim ? 
kMaxThreads : row_dim; - FBGEMM_DISPATCH_FLOAT_AND_HALF( + FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16( output.scalar_type(), "PaddedFP8rowwise_to_float_1d_cuda_kernel", [&] { _PaddedFP8rowwise_to_float_1d_cuda_kernel << <<(input, forward, row_dim); + return _float_to_paddedFP8rowwise_gpu_t(input, forward, row_dim); } DLL_PUBLIC at::Tensor _paddedFP8rowwise_to_float_gpu( const at::Tensor& input, const bool forward, const int64_t row_dim, - const int64_t output_last_dim) { - return _paddedFP8rowwise_to_float_gpu_t( - input, forward, row_dim, output_last_dim); + const int64_t output_last_dim, + const int64_t output_dtype) { + return _paddedFP8rowwise_to_float_gpu_t( + input, forward, row_dim, output_last_dim, output_dtype); } } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/test/quantize_ops_test.py b/fbgemm_gpu/test/quantize_ops_test.py index 5f24f16325..25bd83ceb0 100644 --- a/fbgemm_gpu/test/quantize_ops_test.py +++ b/fbgemm_gpu/test/quantize_ops_test.py @@ -977,12 +977,18 @@ def test_quantize_and_dequantize_op_cuda_large_nrows_bf16( class TestFP8RowwiseQuantizationConversion(unittest.TestCase): enable_logging: bool = False + max_examples: int = 40 def setUp(self) -> None: self.enable_logging = bool(os.getenv("FBGEMM_GPU_ENABLE_LOGGING", 0)) if self.enable_logging: logging.info("Enabled logging for TestFP8RowwiseQuantizationConversion") + torch._dynamo.config.cache_size_limit = self.max_examples + logging.info( + f"Setting torch._dynamo.config.cache_size_limit = {self.max_examples}" + ) + @unittest.skipIf(*gpu_unavailable) # pyre-fixme[56]: @given( @@ -1002,7 +1008,7 @@ def setUp(self) -> None: # if before PT 2.1, we don't support symint_vector, so turn it off test_compile=st.booleans() if symint_vector_unsupported() else st.just(False), ) - @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None) + @settings(verbosity=Verbosity.verbose, max_examples=max_examples, deadline=None) def test_quantize_and_dequantize_op_fp8_rowwise( self, batched: bool, @@ -1039,16 +1045,16 @@ def test_quantize_and_dequantize_op_fp8_rowwise( torch._dynamo.mark_dynamic(quantized_data_gpu, 0) torch._dynamo.mark_dynamic(quantized_data_gpu, 1) + output_dtype = { + torch.float: SparseType.FP32, + torch.half: SparseType.FP16, + torch.bfloat16: SparseType.BF16, + }[dtype].as_int() + dequantized_data_gpu = quantize_func( quantized_data_gpu, forward=forward, - output_dtype=SparseType.FP32.as_int() - if dtype == torch.float - else ( - SparseType.FP16.as_int() - if dtype == torch.half - else SparseType.BF16.as_int() - ), + output_dtype=output_dtype, ) if m == 0 or n == 0: @@ -1057,10 +1063,13 @@ def test_quantize_and_dequantize_op_fp8_rowwise( assert ( dequantized_data_gpu.dtype == dtype - ), "result is {dequantized_data_gpu.dtype} type, but expected {dtype}" + ), "Result is {dequantized_data_gpu.dtype} type, but expected {dtype}" + qref = input_data_gpu.float() dq = dequantized_data_gpu.float() + assert not torch.isnan(dq).any(), "Results contain nan" + if self.enable_logging: # Logging quantization errors errors = (qref - dq) / (qref + 1e-5) @@ -1073,6 +1082,97 @@ def test_quantize_and_dequantize_op_fp8_rowwise( torch.testing.assert_close(qref.cpu(), dq.cpu(), rtol=0.1, atol=0.05) + @unittest.skipIf(*gpu_unavailable) + # pyre-fixme[56]: + @given( + m=st.integers(min_value=1, max_value=1000), + n1=st.integers(min_value=1, max_value=1000), + n2=st.integers(min_value=1, max_value=1000), + n3=st.integers(min_value=1, max_value=1000), + row_dim=st.integers(min_value=1, max_value=2048), + forward=st.booleans(), + 
given_last_dim=st.booleans(), + dtype=st.sampled_from( + [ + torch.float, + torch.half, + torch.bfloat16, + ], + ), + ) + @settings(verbosity=Verbosity.verbose, max_examples=max_examples, deadline=None) + def test_quantize_and_dequantize_op_padded_fp8_rowwise( + self, + m: int, + n1: int, + n2: int, + n3: int, + row_dim: int, + forward: bool, + given_last_dim: bool, + dtype: torch.dtype, + ) -> None: + row_dim = row_dim * 4 + device = "cuda" + input1 = torch.rand(m, n1, device=device, dtype=dtype) + input2 = torch.rand(m, n2, device=device, dtype=dtype) + input3 = torch.rand(m, n3, device=device, dtype=dtype) + output_dtype = { + torch.float: SparseType.FP32, + torch.half: SparseType.FP16, + torch.bfloat16: SparseType.BF16, + }[dtype].as_int() + + q1 = torch.ops.fbgemm.FloatToPaddedFP8RowwiseQuantized( + input1, forward=forward, row_dim=row_dim + ) + q2 = torch.ops.fbgemm.FloatToPaddedFP8RowwiseQuantized( + input2, forward=forward, row_dim=row_dim + ) + q3 = torch.ops.fbgemm.FloatToPaddedFP8RowwiseQuantized( + input3, forward=forward, row_dim=row_dim + ) + qcat = torch.cat([q1, q3, q2], dim=-1) + if given_last_dim: + d_qcat = torch.ops.fbgemm.PaddedFP8RowwiseQuantizedToFloat( + qcat, + forward=forward, + row_dim=row_dim, + output_last_dim=n1 + n2 + n3, + output_dtype=output_dtype, + ) + else: + d_qcat = torch.ops.fbgemm.PaddedFP8RowwiseQuantizedToFloat( + qcat, + forward=forward, + row_dim=row_dim, + output_dtype=output_dtype, + ) + + assert ( + d_qcat.dtype == dtype + ), "Result is {d_qcat.dtype} type, but expected {dtype}" + qref = torch.cat([input1, input3, input2], dim=-1).cpu().float() + dqcat = d_qcat.cpu().float() + + assert not torch.isnan(dqcat).any(), "Results contain nan" + + if self.enable_logging: + # Logging quantization errors + errors = (dqcat - qref) / (qref + 1e-5) + assert not torch.isnan(errors).any() + val, idx = torch.topk(errors.abs(), k=min(10, errors.shape[-1])) + logging.info(f"top-10 errors {val}") + logging.info(f"qref {torch.gather(qref, dim=1, index=idx)}") + logging.info(f"dqcat {torch.gather(dqcat, dim=1, index=idx)}") + logging.info( + f"relative error: max: {errors.abs().max()*100:.1f}%, " + f"median: {errors.abs().median()*100:.1f}%, " + f"mean: {errors.abs().mean()*100:.1f}%" + ) + + torch.testing.assert_allclose(dqcat, qref, rtol=0.1, atol=0.05) + if __name__ == "__main__": unittest.main() From a511f9d053c0ffcd8794401b4bc49a75ea8ea13f Mon Sep 17 00:00:00 2001 From: Flavio Sales Truzzi Date: Wed, 20 Sep 2023 15:45:19 -0700 Subject: [PATCH 36/94] - SymInts for pack_segments meta function (#2017) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2017 Add SymInts for pack_segments Reviewed By: ezyang, xw285cornell, bdhirsh Differential Revision: D48712041 fbshipit-source-id: b1d69ea4980cf1c8cc26ff976eec55623355e227 --- fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h | 7 ++- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 51 +++------------- fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp | 60 +++++++++++++++---- fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp | 25 ++++++-- fbgemm_gpu/test/sparse_ops_test.py | 56 ++++++++++++++++- 5 files changed, 132 insertions(+), 67 deletions(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h index dbeccc28f8..add79aaa30 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h @@ -9,6 +9,7 @@ #pragma once #include +#include namespace fbgemm_gpu { @@ -722,18 +723,18 @@ std::tuple permute_sequence_embeddings_cuda( 
at::Tensor pack_segments_cpu( const at::Tensor& t_in, const at::Tensor& lengths, - const int64_t max_length); + int64_t max_length); ///@ingroup sparse-data-cuda at::Tensor pack_segments_cuda( const at::Tensor& t_in, const at::Tensor& lengths, - const int64_t max_length); + int64_t max_length); at::Tensor pack_segments_forward_cuda( const at::Tensor& t_in, const at::Tensor& lengths, - const int64_t max_length); + int64_t max_length); at::Tensor pack_segments_backward_cuda( const at::Tensor& data, diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 4ea54e18ee..eab2829497 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2508,54 +2508,12 @@ Tensor pack_segments_backward_cpu( return unpacked_tensor; } - -class PackSegmentsFunction - : public torch::autograd::Function { - public: - static torch::autograd::variable_list forward( - torch::autograd::AutogradContext* ctx, - const Tensor& t_in, - const Tensor& lengths, - const int64_t max_length) { - int64_t total_length = t_in.expect_contiguous()->sizes()[0]; - ctx->saved_data["max_length"] = max_length; - ctx->saved_data["total_length"] = total_length; - ctx->save_for_backward({lengths}); - - // Run the forward pass. - const auto& res = pack_segments_forward_cpu(t_in, lengths, max_length); - torch::autograd::variable_list outputs(1); - outputs[0] = res; - return outputs; - } - - static torch::autograd::variable_list backward( - torch::autograd::AutogradContext* ctx, - torch::autograd::variable_list grad_output) { - TORCH_CHECK(grad_output.size() == 1); - const Tensor& grad = grad_output[0]; - const auto& max_length = ctx->saved_data["max_length"].toInt(); - const auto& total_length = ctx->saved_data["total_length"].toInt(); - - // Retrieve saved variables for backward. 
- const auto& saved_variables = ctx->get_saved_variables(); - const auto& lengths = saved_variables[0]; - - torch::autograd::variable_list grad_inputs(5); - grad_inputs[0] = - pack_segments_backward_cpu(grad, lengths, total_length, max_length); - return grad_inputs; - } -}; - Tensor pack_segments_cpu( const Tensor& t_in, const Tensor& lengths, const int64_t max_length) { - const auto& res = PackSegmentsFunction::apply(t_in, lengths, max_length); - return res[0]; + return pack_segments_forward_cpu(t_in, lengths, max_length); } - namespace { Tensor index_select_dim0( const Tensor& input, @@ -2692,7 +2650,10 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "permute102_baddbmm_permute102(Tensor bias, Tensor A, Tensor B) -> Tensor"); m.def( "permute_sequence_embeddings(Tensor permute, Tensor lengths, Tensor embeddings) -> (Tensor, Tensor)"); - m.def("pack_segments(Tensor t_in, Tensor lengths, int max_length) -> Tensor"); + m.def( + "pack_segments(Tensor t_in, Tensor lengths, SymInt max_length) -> Tensor"); + m.def( + "pack_segments_backward(Tensor data, Tensor lengths, SymInt total_length, SymInt max_length) -> Tensor"); // A specialization of at::index_select for selecting dim 0 // // The consecutive_range_start and consecutive_range_length arguments are for @@ -2789,6 +2750,8 @@ TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { "permute_sequence_embeddings", fbgemm_gpu::permute_sequence_embeddings_cpu); DISPATCH_TO_CPU("pack_segments", fbgemm_gpu::pack_segments_cpu); + DISPATCH_TO_CPU( + "pack_segments_backward", fbgemm_gpu::pack_segments_backward_cpu); DISPATCH_TO_CPU("index_select_dim0", fbgemm_gpu::index_select_dim0); DISPATCH_TO_CPU( "group_index_select_dim0", fbgemm_gpu::group_index_select_dim0); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp index 9c7a2f3fd1..8937708f5b 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp @@ -6,6 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +#include "ATen/ops/tensor.h" +#include "c10/core/SymInt.h" #include "fbgemm_gpu/sparse_ops.h" #include "fbgemm_gpu/sparse_ops_utils.h" @@ -14,6 +16,7 @@ #include #include #include +#include #include // for logic_error using Tensor = at::Tensor; @@ -64,18 +67,24 @@ class PackSegments : public torch::autograd::Function { torch::autograd::AutogradContext* ctx, const Tensor& t_in, const Tensor& lengths, - const int64_t max_length) { - const int64_t total_length = t_in.contiguous().size(0); + const at::SymInt& max_length) { + const at::SymInt total_length = t_in.sym_size(0); + + at::AutoDispatchBelowADInplaceOrView guard; + + static auto custom_pack_segments_op = + torch::Dispatcher::singleton() + .findSchemaOrThrow("fbgemm::pack_segments", "") + .typed(); + + Tensor res = custom_pack_segments_op.call(t_in, lengths, max_length); + ctx->saved_data["max_length"] = max_length; ctx->saved_data["total_length"] = total_length; ctx->save_for_backward({lengths}); - // Run the forward pass. 
- const auto& res = pack_segments_forward_cuda(t_in, lengths, max_length); - - torch::autograd::variable_list outputs(1); - outputs[0] = res; - return outputs; + return {res}; } static torch::autograd::variable_list backward( @@ -83,20 +92,39 @@ class PackSegments : public torch::autograd::Function { torch::autograd::variable_list grad_output) { TORCH_CHECK(grad_output.size() == 2 or grad_output.size() == 1); const Tensor& grad = grad_output[0]; - const auto& max_length = ctx->saved_data["max_length"].toInt(); - const auto& total_length = ctx->saved_data["total_length"].toInt(); + const auto& max_length = ctx->saved_data["max_length"].toSymInt(); + const auto& total_length = ctx->saved_data["total_length"].toSymInt(); // Retrieve saved variables for backward. const auto& saved_variables = ctx->get_saved_variables(); const auto& lengths = saved_variables[0]; torch::autograd::variable_list grad_inputs(5); - grad_inputs[0] = - pack_segments_backward_cuda(grad, lengths, total_length, max_length); + + static auto custom_pack_segments_backward_op = + torch::Dispatcher::singleton() + .findSchemaOrThrow("fbgemm::pack_segments_backward", "") + .typed(); + + grad_inputs[0] = custom_pack_segments_backward_op.call( + grad, lengths, total_length, max_length); return grad_inputs; } }; +torch::Tensor pack_segments_autograd( + const Tensor& t_in, + const Tensor& lengths, + const at::SymInt max_length + +) { + return PackSegments::apply(t_in, lengths, max_length)[0]; +} + class LookupFunctionBatchedUnaryEmbeddingOp : public torch::autograd::Function { public: @@ -648,8 +676,14 @@ TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) { DISPATCH_TO_CUDA( "generic_histogram_binning_calibration_by_feature", fbgemm_gpu::generic_histogram_binning_calibration_by_feature_cuda); - DISPATCH_TO_CUDA("pack_segments", fbgemm_gpu::pack_segments_cuda); + DISPATCH_TO_CUDA("pack_segments", fbgemm_gpu::pack_segments_forward_cuda); + DISPATCH_TO_CUDA( + "pack_segments_backward", fbgemm_gpu::pack_segments_backward_cuda); DISPATCH_TO_CUDA("index_select_dim0", fbgemm_gpu::index_select_dim0_gpu); DISPATCH_TO_CUDA( "group_index_select_dim0", fbgemm_gpu::group_index_select_dim0_gpu); } + +TORCH_LIBRARY_IMPL(fbgemm, Autograd, m) { + m.impl("pack_segments", &fbgemm_gpu::pack_segments_autograd); +} diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp index 64db0d9c63..96b42429d6 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp @@ -11,6 +11,8 @@ #include #include +#include "c10/core/SymIntArrayRef.h" +#include "c10/util/DimVector.h" #include "fbgemm_gpu/sparse_ops.h" #include "fbgemm_gpu/sparse_ops_utils.h" @@ -23,12 +25,26 @@ namespace { Tensor pack_segments_forward_meta( const Tensor& t_in, const Tensor& lengths, - const int64_t max_length) { - at::DimVector padded_values_shape({lengths.numel(), max_length}); + const at::SymInt max_length) { + at::SymDimVector padded_values_shape({lengths.sym_numel(), max_length}); + for (const auto i : c10::irange(1, t_in.dim())) { - padded_values_shape.push_back(t_in.size(i)); + padded_values_shape.push_back(t_in.sym_size(i)); } - return at::empty(padded_values_shape, t_in.options()); + return at::empty_symint(padded_values_shape, t_in.options()); +} + +Tensor pack_segments_backward_meta( + const at::Tensor& data, + const at::Tensor& lengths, + const at::SymInt total_length, + const at::SymInt max_length) { + // Create output tensor of appropriate dimensions + auto shape = data.sym_sizes().vec(); + 
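Taken together, the SymInt schema, the dispatcher-based autograd wrapper, and the SymInt-aware meta function shown above are what allow `pack_segments` to be traced with symbolic shapes. The following is a rough usage sketch, assuming a CUDA build of fbgemm_gpu that contains this change and a GPU at runtime; the `pack` helper and the chosen sizes are illustrative, not part of the patch.

import torch

def pack(t_in, lengths, max_length):
    # fbgemm::pack_segments now advertises `SymInt max_length` in its schema,
    # so max_length and the output shape can stay symbolic during tracing.
    return torch.ops.fbgemm.pack_segments(
        t_in=t_in, lengths=lengths, max_length=max_length
    )

compiled_pack = torch.compile(pack, dynamic=True)

for batch_size, feature_dim in [(4, 8), (6, 16)]:
    # Each segment has length 2, so t_in holds 2 * batch_size rows.
    t_in = torch.rand(2 * batch_size, feature_dim, device="cuda")
    lengths = torch.full((batch_size,), 2, dtype=torch.int, device="cuda")
    packed = compiled_pack(t_in, lengths, int(lengths.max().item()))
    # Output shape follows the meta function:
    # (num_segments, max_length, feature_dim)
    print(packed.shape)

As in the dynamic-shape check added to test_pack_segments below, the compiled wrapper can then be reused across different input sizes rather than being specialized to the first shape it sees.
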
shape.erase(shape.begin()); + shape[0] = total_length; + + return at::empty_symint(shape, data.options()); } Tensor offsets_range_meta_symint(const Tensor& offsets, at::SymInt range_size) { @@ -52,6 +68,7 @@ Tensor batched_unary_embeddings_forward_meta( TORCH_LIBRARY_IMPL(fbgemm, Meta, m) { m.impl("pack_segments", TORCH_FN(fbgemm_gpu::pack_segments_forward_meta)); + m.impl("unpack_segments", TORCH_FN(fbgemm_gpu::pack_segments_backward_meta)); m.impl( "asynchronous_complete_cumsum", TORCH_FN(fbgemm_gpu::asynchronous_complete_cumsum_meta)); diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index 32c4c2916e..59f633ece3 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -1731,6 +1731,7 @@ def _pack_segments_ref( torch.half, ] ), + torch_compile=st.booleans(), ) @settings(deadline=None) def test_pack_segments( @@ -1740,11 +1741,13 @@ def test_pack_segments( batch_size: int, divisions: int, dtype: torch.dtype, + torch_compile: bool, ) -> None: input_raw = np.random.rand(batch_size, n, k) input_data = torch.tensor(input_raw, dtype=dtype, requires_grad=True) lengths = torch.tensor( - get_n_rand_num_summing_to_k(divisions, batch_size), dtype=torch.int + get_n_rand_num_summing_to_k(divisions, batch_size), + dtype=torch.int, ) max_length = lengths.max().item() @@ -1766,7 +1769,48 @@ def test_pack_segments( packed_tensor.backward(grad_cpu) if gpu_available: - packed_cuda = torch.ops.fbgemm.pack_segments( + pack_segments_fun = torch.ops.fbgemm.pack_segments + + if torch_compile: + pack_segments_fun = torch.compile(pack_segments_fun, dynamic=True) + + packed_cuda = pack_segments_fun( + t_in=input_data.cuda(), + lengths=lengths.cuda(), + max_length=max_length, + ) + + self.assertTrue(torch.equal(packed_tensor, packed_cuda.cpu())) + + # GPU backward + packed_cuda.backward(grad_cpu.cuda()) + + # dynamic check + input_raw = np.random.rand(batch_size, n + 1, k + 2) + input_data = torch.tensor(input_raw, dtype=dtype, requires_grad=True) + lengths = torch.tensor( + get_n_rand_num_summing_to_k(divisions, batch_size), dtype=torch.int + ) + max_length = lengths.max().item() + packed_tensor = torch.ops.fbgemm.pack_segments( + t_in=input_data, lengths=lengths, max_length=max_length + ) + + packed_ref = self._pack_segments_ref(lengths, input_raw) + packed_ref = torch.Tensor(packed_ref).to(dtype) + + self.assertTrue(torch.equal(packed_tensor, packed_ref)) + + grad_cpu = torch.tensor( + np.random.uniform(low=0.01, high=0.5, size=packed_ref.shape).astype( + np.float32 + ) + ).to(dtype) + # CPU backward + packed_tensor.backward(grad_cpu) + + # reusing the previously compiled kernel + packed_cuda = pack_segments_fun( t_in=input_data.cuda(), lengths=lengths.cuda(), max_length=max_length, @@ -1788,6 +1832,7 @@ def test_pack_segments( torch.half, ] ), + torch_compile=st.booleans(), ) @settings(deadline=None) def test_pack_segments_smaller_max_len( @@ -1798,6 +1843,7 @@ def test_pack_segments_smaller_max_len( divisions: int, max_length: int, dtype: torch.dtype, + torch_compile: bool, ) -> None: input_data = torch.tensor(np.random.rand(batch_size, n, k), dtype=dtype) lengths = torch.tensor( @@ -1820,7 +1866,11 @@ def test_pack_segments_smaller_max_len( self.assertTrue(torch.equal(packed_tensor, packed_ref)) if gpu_available: - packed_cuda = torch.ops.fbgemm.pack_segments( + pack_segments_fun = torch.ops.fbgemm.pack_segments + if torch_compile: + pack_segments_fun = torch.compile(pack_segments_fun) + + packed_cuda = pack_segments_fun( 
t_in=input_data.cuda(), lengths=lengths.cuda(), max_length=max_length, From 8f02dd3a6523f139090cdc6b9eaa0b993705895f Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 21 Sep 2023 04:47:10 -0700 Subject: [PATCH 37/94] Use cron schedule for PIP install workflows (#2027) Summary: - Use a cron schedule for the PIP install workflows - Activate building wheels for ROCm variant Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2027 Reviewed By: spcyppt Differential Revision: D49484270 Pulled By: q10 fbshipit-source-id: 72a50b89c50d6f82c02f4a673f51abbc683e45ca --- .github/scripts/fbgemm_gpu_build.bash | 26 ++-- .github/scripts/fbgemm_gpu_docs.bash | 4 +- .github/scripts/fbgemm_gpu_install.bash | 4 +- .github/scripts/fbgemm_gpu_lint.bash | 8 +- .github/scripts/fbgemm_gpu_test.bash | 4 +- .github/scripts/nova_postscript.bash | 12 +- .github/scripts/nova_prescript.bash | 114 ++++++++++-------- .github/scripts/utils_build.bash | 6 +- .github/scripts/utils_conda.bash | 20 ++- .github/scripts/utils_cuda.bash | 4 +- .github/scripts/utils_pip.bash | 4 +- .github/scripts/utils_pytorch.bash | 4 +- .github/scripts/utils_rocm.bash | 2 +- .github/scripts/utils_system.bash | 6 +- .../workflows/build_wheels_linux_aarch64.yml | 4 +- .github/workflows/build_wheels_linux_x86.yml | 4 + .../workflows/build_wheels_linux_x86_rocm.yml | 51 ++++++++ .github/workflows/fbgemm_gpu_pip.yml | 16 +-- 18 files changed, 195 insertions(+), 98 deletions(-) create mode 100644 .github/workflows/build_wheels_linux_x86_rocm.yml diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 12b3c49ca4..5354c24ab6 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -24,7 +24,7 @@ prepare_fbgemm_gpu_build () { echo "################################################################################" echo "# Prepare FBGEMM-GPU Build" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -154,7 +154,7 @@ __configure_fbgemm_gpu_build () { echo "################################################################################" echo "# Configure FBGEMM-GPU Build" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -310,7 +310,7 @@ build_fbgemm_gpu_package () { echo "################################################################################" echo "# Build FBGEMM-GPU Package (Wheel)" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" @@ -322,13 +322,15 @@ build_fbgemm_gpu_package () { echo "[BUILD] Checking build_args:" echo "${build_args[@]}" - core=$(lscpu | grep "Core(s)" | awk '{print $NF}') && echo "core = ${core}" || echo "core not found" - sockets=$(lscpu | grep "Socket(s)" | awk '{print $NF}') && echo "sockets = ${sockets}" || echo "sockets not found" - re='^[0-9]+$' - run_multicore="" + # shellcheck disable=SC2155 + local core=$(lscpu | grep "Core(s)" | awk '{print $NF}') && echo "core = ${core}" || echo "core not found" + # shellcheck disable=SC2155 + local sockets=$(lscpu | grep "Socket(s)" | awk '{print $NF}') && echo "sockets = ${sockets}" || echo 
"sockets not found" + local re='^[0-9]+$' + local run_multicore="" if [[ $core =~ $re && $sockets =~ $re ]] ; then - n_core=$((core * sockets)) - run_multicore=" -j ${n_core}" + local n_core=$((core * sockets)) + local run_multicore=" -j ${n_core}" fi # Distribute Python extensions as wheels on Linux @@ -380,7 +382,7 @@ build_fbgemm_gpu_install () { echo "################################################################################" echo "# Build + Install FBGEMM-GPU Package" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" @@ -426,9 +428,9 @@ build_fbgemm_gpu_develop () { __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 echo "################################################################################" - echo "# Build + Install FBGEMM-GPU Package" + echo "# Build + Install FBGEMM-GPU Package (Develop)" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" diff --git a/.github/scripts/fbgemm_gpu_docs.bash b/.github/scripts/fbgemm_gpu_docs.bash index d2b21f5649..0e923afb1b 100644 --- a/.github/scripts/fbgemm_gpu_docs.bash +++ b/.github/scripts/fbgemm_gpu_docs.bash @@ -24,7 +24,7 @@ install_docs_tools () { echo "################################################################################" echo "# Install Documentation Tools" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -61,7 +61,7 @@ build_fbgemm_gpu_docs () { echo "################################################################################" echo "# Build FBGEMM-GPU Documentation" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash index e78ca79bd6..02e60e8e99 100644 --- a/.github/scripts/fbgemm_gpu_install.bash +++ b/.github/scripts/fbgemm_gpu_install.bash @@ -43,7 +43,7 @@ install_fbgemm_gpu_wheel () { echo "################################################################################" echo "# Install FBGEMM-GPU from Wheel" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -79,7 +79,7 @@ install_fbgemm_gpu_pip () { echo "################################################################################" echo "# Install FBGEMM-GPU Package from PIP" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi diff --git a/.github/scripts/fbgemm_gpu_lint.bash b/.github/scripts/fbgemm_gpu_lint.bash index 122d547862..fc2ab7d25c 100644 --- a/.github/scripts/fbgemm_gpu_lint.bash +++ b/.github/scripts/fbgemm_gpu_lint.bash @@ -24,7 +24,7 @@ install_lint_tools () { echo 
"################################################################################" echo "# Install Lint Tools" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -69,7 +69,7 @@ lint_fbgemm_gpu_flake8 () { echo "################################################################################" echo "# Run FBGEMM_GPU Lint: flake8" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -99,7 +99,7 @@ lint_fbgemm_gpu_ufmt () { echo "################################################################################" echo "# Run FBGEMM_GPU Lint: ufmt" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -132,7 +132,7 @@ lint_fbgemm_gpu_copyright () { echo "################################################################################" echo "# Run FBGEMM_GPU Lint: Meta Copyright Headers" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index b593caa95d..be5a95ab1b 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -61,7 +61,7 @@ run_fbgemm_gpu_tests () { echo "################################################################################" echo "# Run FBGEMM-GPU Tests" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -143,7 +143,7 @@ test_setup_conda_environment () { echo "################################################################################" echo "# Setup FBGEMM-GPU Build Container (All Steps)" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi diff --git a/.github/scripts/nova_postscript.bash b/.github/scripts/nova_postscript.bash index c1e5d5bfd5..4602d6bd21 100644 --- a/.github/scripts/nova_postscript.bash +++ b/.github/scripts/nova_postscript.bash @@ -5,15 +5,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -echo "Current working directory: $(pwd)" -cd "${FBGEMM_REPO}" || echo "Failed to cd to ${FBGEMM_REPO}" +echo "[NOVA] Current working directory: $(pwd)" +cd "${FBGEMM_REPO}" || echo "[NOVA] Failed to cd to ${FBGEMM_REPO}" PRELUDE="${FBGEMM_REPO}/.github/scripts/setup_env.bash" BUILD_ENV_NAME=${CONDA_ENV} GITHUB_ENV=TRUE export GITHUB_ENV # Install FBGEMM_GPU Nightly -echo "Current working directory: $(pwd)" +echo "[NOVA] Current working directory: $(pwd)" + +# Load the FBGEMM_GPU build scripts infrastructure # shellcheck disable=SC1091 # shellcheck source=.github/scripts/setup_env.bash . 
"${PRELUDE}"; @@ -21,11 +23,11 @@ echo "Current working directory: $(pwd)" install_fbgemm_gpu_wheel "${BUILD_ENV_NAME}" fbgemm_gpu/dist/*.whl # Test with PyTest -echo "Current working directory: $(pwd)" +echo "[NOVA] Current working directory: $(pwd)" CPU_GPU="${CU_VERSION}" if [ "${CU_VERSION}" != 'cpu' ]; then CPU_GPU="" fi $CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());" -cd "${FBGEMM_REPO}/fbgemm_gpu/test" || { echo "Failed to cd to fbgemm_gpu/test from $(pwd)"; }; +cd "${FBGEMM_REPO}/fbgemm_gpu/test" || { echo "[NOVA] Failed to cd to fbgemm_gpu/test from $(pwd)"; }; run_fbgemm_gpu_tests "${BUILD_ENV_NAME}" "${CPU_GPU}" diff --git a/.github/scripts/nova_prescript.bash b/.github/scripts/nova_prescript.bash index 89106525fe..9cb9b84008 100644 --- a/.github/scripts/nova_prescript.bash +++ b/.github/scripts/nova_prescript.bash @@ -5,71 +5,91 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -echo "Current working directory: $(pwd)" -cd "${FBGEMM_REPO}" || echo "Failed to cd to ${FBGEMM_REPO}" +export PATH="${PATH}:/usr/sbin:/sbin" + +echo "[NOVA] Current working directory: $(pwd)" +cd "${FBGEMM_REPO}" || exit 1 + PRELUDE="${FBGEMM_REPO}/.github/scripts/setup_env.bash" BUILD_ENV_NAME=${CONDA_ENV} -echo "--------------------------" -echo "----- conda env list -----" -conda env list -echo "--------------------------" -echo "PRELUDE = $PRELUDE" -export PATH="${PATH}:/usr/sbin:/sbin" -echo "CU_VERSION = ${CU_VERSION}" -echo "PYTHON_VERSION = ${PYTHON_VERSION}" -echo "python3 --version = $(python3 --version)" -echo "ARCH = ${ARCH}" -echo "---------------------------" + +# Load the FBGEMM_GPU build scripts infrastructure # shellcheck disable=SC1091 # shellcheck source=.github/scripts/setup_env.bash . 
"${PRELUDE}"; -## Display System Info +# Display System Info print_system_info -## Display GPU Info +# Display Conda information +print_conda_info + +# Display GPU Info print_gpu_info -## Install C/C++ Compilers +# Install C/C++ Compilers install_cxx_compiler "${BUILD_ENV_NAME}" -## Install Build Tools +# Install Build Tools install_build_tools "${BUILD_ENV_NAME}" -## Install cuDNN -CPU_GPU=${CU_VERSION} -if [ "${CU_VERSION}" != 'cpu' ]; then - ## Nova $CU_VERSION is e.g., cu118 - cuda_version_num=$(echo "$CU_VERSION" | cut -c 3-) - install_cudnn "${BUILD_ENV_NAME}" "$(pwd)/build_only/cudnn" "$cuda_version_num" - echo "-------- Finding NVML_LIB_PATH -----------" - echo "NVML_LIB_PATH = ${NVML_LIB_PATH}" - echo "CONDA_ENV = ${CONDA_ENV}, CUDA_HOME = ${CUDA_HOME}" - if [[ ${NVML_LIB_PATH} == "" ]]; then NVML_LIB_PATH=$(find "${CUDA_HOME}" -name libnvidia-ml.so) && export NVML_LIB_PATH && echo "looking in ${CUDA_HOME}" || echo "libnvidia-ml.so not found in ${CUDA_HOME}"; fi - if [[ ${NVML_LIB_PATH} == "" ]]; then NVML_LIB_PATH=$(find "${CONDA_ENV}" -name libnvidia-ml.so) && export NVML_LIB_PATH && echo "looking in ${CONDA_ENV}" || echo "libnvidia-ml.so not found in ${CONDA_ENV}"; fi - echo "NVML_LIB_PATH = ${NVML_LIB_PATH}" - echo "------------------------------------------" - CPU_GPU="cuda" +if [[ $CU_VERSION = cu* ]]; then + # Extract the CUDA version number from CU_VERSION + cuda_version=$(echo "[NOVA] ${CU_VERSION}" | cut -c 3-) + install_cudnn "${BUILD_ENV_NAME}" "$(pwd)/build_only/cudnn" "${cuda_version}" + + echo "[NOVA] -------- Finding NVML_LIB_PATH -----------" + if [[ ${NVML_LIB_PATH} == "" ]]; then + NVML_LIB_PATH=$(find "${CUDA_HOME}" -name libnvidia-ml.so) && + export NVML_LIB_PATH && + echo "[NOVA] looking in ${CUDA_HOME}" || + echo "[NOVA] libnvidia-ml.so not found in ${CUDA_HOME}"; + fi + + if [[ ${NVML_LIB_PATH} == "" ]]; then + NVML_LIB_PATH=$(find "${CONDA_ENV}" -name libnvidia-ml.so) && + export NVML_LIB_PATH && + echo "[NOVA] looking in ${CONDA_ENV}" || + echo "[NOVA] libnvidia-ml.so not found in ${CONDA_ENV}"; + fi + + echo "[NOVA] NVML_LIB_PATH = ${NVML_LIB_PATH}" + echo "[NOVA] ------------------------------------------" + + echo "[NOVA] Building the CUDA variant of FBGEMM_GPU ..." + fbgemm_variant="cuda" + +elif [[ $CU_VERSION = rocm* ]]; then + echo "[NOVA] Building the ROCm variant of FBGEMM_GPU ..." + fbgemm_variant="rocm" + +else + echo "[NOVA] Building the CPU variant of FBGEMM_GPU ..." + fbgemm_variant="cpu" fi -cd "${FBGEMM_REPO}/fbgemm_gpu" || { echo "Failed to cd to fbgemm_gpu from $(pwd)"; } +# Install the necessary Python eggs for building +cd "${FBGEMM_REPO}/fbgemm_gpu" || exit 1 prepare_fbgemm_gpu_build "${BUILD_ENV_NAME}" -# reset NOVA flag to run setup.py +# Reset the BUILD_FROM_NOVA flag to run setup.py for the actual build BUILD_FROM_NOVA=0 export BUILD_FROM_NOVA -## Build FBGEMM_GPU Nightly -cd "${FBGEMM_REPO}/fbgemm_gpu" || echo "Failed to cd to ${FBGEMM_REPO}/fbgemm_gpu from $(pwd)" -if [[ ${CHANNEL} == "" ]]; then CHANNEL="nightly"; fi #set nightly by default -echo "----------------------------------------------" -echo "build_fbgemm_gpu_package ${BUILD_ENV_NAME} ${CHANNEL} ${CPU_GPU}" -build_fbgemm_gpu_package "${BUILD_ENV_NAME}" "${CHANNEL}" "${CPU_GPU}" -echo "----------------------------------------------" - -## Temporary workaround - copy dist/ to root repo for smoke test -echo "Copying dist folder to root repo.." 
-(cp -r "${FBGEMM_REPO}/fbgemm_gpu/dist" "${FBGEMM_REPO}") && (echo "dist folder has been copied to ${FBGEMM_REPO}") || echo "Failed to copy dist/ folder to ${FBGEMM_REPO}" -echo "----------------------------------" -ls -al "${FBGEMM_REPO}/dist" -echo "----------------------------------" +# Build FBGEMM_GPU nightly by default +if [[ ${CHANNEL} == "" ]]; then + CHANNEL="nightly" +fi + +# Build the wheel +build_fbgemm_gpu_package "${BUILD_ENV_NAME}" "${CHANNEL}" "${fbgemm_variant}" + +# Temporary workaround - copy dist/ to root repo for smoke test +echo "[NOVA] Copying dist folder to root repo ..." +if print_exec cp -r "${FBGEMM_REPO}/fbgemm_gpu/dist" "${FBGEMM_REPO}"; then + echo "[NOVA] dist folder has been copied to ${FBGEMM_REPO}" + ls -al "${FBGEMM_REPO}/dist" +else + echo "[NOVA] Failed to copy dist/ folder to ${FBGEMM_REPO}" + exit 1 +fi diff --git a/.github/scripts/utils_build.bash b/.github/scripts/utils_build.bash index 12febee996..0d4b886f76 100644 --- a/.github/scripts/utils_build.bash +++ b/.github/scripts/utils_build.bash @@ -18,7 +18,7 @@ setup_bazel () { echo "################################################################################" echo "# Setup Bazel" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" @@ -54,7 +54,7 @@ install_cxx_compiler () { echo "################################################################################" echo "# Install C/C++ Compilers" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -149,7 +149,7 @@ install_build_tools () { echo "################################################################################" echo "# Install Build Tools" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi diff --git a/.github/scripts/utils_conda.bash b/.github/scripts/utils_conda.bash index 96aae5dfe3..63bf64d0cc 100644 --- a/.github/scripts/utils_conda.bash +++ b/.github/scripts/utils_conda.bash @@ -30,7 +30,7 @@ setup_miniconda () { echo "################################################################################" echo "# Setup Miniconda" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -88,7 +88,7 @@ create_conda_environment () { echo "################################################################################" echo "# Create Conda Environment" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -132,3 +132,19 @@ create_conda_environment () { echo "[SETUP] Installed Python version: $(conda run ${env_prefix} python --version)" echo "[SETUP] Successfully created Conda environment: ${env_name}" } + +print_conda_info () { + echo "################################################################################" + echo "# Print Conda Environment Info" + echo "#" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} 
${*}" + echo "################################################################################" + echo "" + print_exec conda info + echo "" + print_exec conda info --envs + echo "" + # shellcheck disable=SC2153 + echo "PYTHON_VERSION: ${PYTHON_VERSION}" + echo "python3 --version: $(python3 --version)" +} diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash index 705ef8dc25..285c9a5eac 100644 --- a/.github/scripts/utils_cuda.bash +++ b/.github/scripts/utils_cuda.bash @@ -25,7 +25,7 @@ install_cuda () { echo "################################################################################" echo "# Install CUDA" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -93,7 +93,7 @@ install_cudnn () { echo "################################################################################" echo "# Install cuDNN" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi diff --git a/.github/scripts/utils_pip.bash b/.github/scripts/utils_pip.bash index 4782632a3e..686485711b 100644 --- a/.github/scripts/utils_pip.bash +++ b/.github/scripts/utils_pip.bash @@ -31,7 +31,7 @@ install_from_pytorch_pip () { echo "################################################################################" echo "# Install ${package_name_raw} (PyTorch PIP)" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -117,7 +117,7 @@ publish_to_pypi () { echo "################################################################################" echo "# Publish to PyPI" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash index 8aaea9f4fd..1fb743b653 100644 --- a/.github/scripts/utils_pytorch.bash +++ b/.github/scripts/utils_pytorch.bash @@ -33,7 +33,7 @@ install_pytorch_conda () { echo "################################################################################" echo "# Install PyTorch (Conda)" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi @@ -120,7 +120,7 @@ install_pytorch_pip () { echo "################################################################################" echo "# Install PyTorch (PIP)" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi diff --git a/.github/scripts/utils_rocm.bash b/.github/scripts/utils_rocm.bash index 9802fb80fa..8efb8128fb 100644 --- a/.github/scripts/utils_rocm.bash +++ b/.github/scripts/utils_rocm.bash @@ -28,7 +28,7 @@ install_rocm_ubuntu () { echo "################################################################################" echo "# Install ROCm (Ubuntu)" echo "#" - echo "# [TIMESTAMP] $(date 
--utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" fi diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash index 297559d098..de37ec80ef 100644 --- a/.github/scripts/utils_system.bash +++ b/.github/scripts/utils_system.bash @@ -56,7 +56,7 @@ free_disk_space () { echo "################################################################################" echo "# Free Disk Space" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" @@ -168,7 +168,7 @@ print_system_info () { echo "################################################################################" echo "# Print System Info" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" @@ -187,7 +187,7 @@ print_ec2_info () { echo "################################################################################" echo "# Print EC2 Instance Info" echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" echo "################################################################################" echo "" diff --git a/.github/workflows/build_wheels_linux_aarch64.yml b/.github/workflows/build_wheels_linux_aarch64.yml index 35bb2f42f9..87331980df 100644 --- a/.github/workflows/build_wheels_linux_aarch64.yml +++ b/.github/workflows/build_wheels_linux_aarch64.yml @@ -15,8 +15,8 @@ on: workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: generate-matrix: diff --git a/.github/workflows/build_wheels_linux_x86.yml b/.github/workflows/build_wheels_linux_x86.yml index dfb141d6f6..76b7db4ab9 100644 --- a/.github/workflows/build_wheels_linux_x86.yml +++ b/.github/workflows/build_wheels_linux_x86.yml @@ -14,6 +14,10 @@ on: - v[0-9]+.[0-9]+.[0-9]+ workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: generate-matrix: uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main diff --git a/.github/workflows/build_wheels_linux_x86_rocm.yml b/.github/workflows/build_wheels_linux_x86_rocm.yml new file mode 100644 index 0000000000..6139e9dccc --- /dev/null +++ b/.github/workflows/build_wheels_linux_x86_rocm.yml @@ -0,0 +1,51 @@ +name: Build x86 Linux Wheels (ROCm) + +on: + pull_request: + push: + branches: + - nightly + - main + # Release candidate branch look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-release+ + tags: + # Release candidate tag look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - v[0-9]+.[0-9]+.[0-9]+ + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + generate-matrix: + if: ${{ github.event_name == 'pull_request' || (inputs.trigger-event == 'push' && startsWith(github.event.ref, 
'refs/heads/nightly')) }} + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: linux + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-cuda: disable + with-rocm: enable + with-cpu: disable + build: + needs: generate-matrix + name: pytorch/FBGEMM + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main + with: + repository: pytorch/FBGEMM + ref: "" + pre-script: ../.github/scripts/nova_prescript.bash + post-script: ../.github/scripts/nova_postscript.bash + smoke-test-script: "" + env-var-script: .github/scripts/nova_dir.bash + package-name: fbgemm_gpu + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + trigger-event: ${{ github.event_name }} + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml index fafab2949b..d0ab554583 100644 --- a/.github/workflows/fbgemm_gpu_pip.yml +++ b/.github/workflows/fbgemm_gpu_pip.yml @@ -6,11 +6,13 @@ name: FBGEMM_GPU PIP Install + Test on: - # PR Trigger (enabled for regression checks and debugging) + # Cron Trigger (UTC) # - pull_request: - branches: - - main + # Based on the the nightly releases schedule in PyTorch infrastructure, the + # wheels are published to PyTorch PIP at around 11:30 UTC every day. + # + schedule: + - cron: '30 12 * * *' # Manual Trigger # @@ -34,7 +36,7 @@ on: jobs: test_pypi_install_cpu: - if: ${{ github.event_name == 'pull_request' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cpu') }} + if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cpu') }} runs-on: ${{ matrix.host-machine.instance }} container: image: amazonlinux:2023 @@ -88,7 +90,7 @@ jobs: test_pypi_install_cuda: - if: ${{ github.event_name == 'pull_request' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cuda') }} + if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cuda') }} runs-on: ${{ matrix.host-machine.instance }} defaults: run: @@ -145,7 +147,7 @@ jobs: test_pypi_install_rocm: - if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'rocm' }} + if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'rocm') }} runs-on: ${{ matrix.host-machine.instance }} container: image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete" From 91facab29a7552e0304cc952402ddca7195326ee Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Thu, 21 Sep 2023 18:22:15 -0700 Subject: [PATCH 38/94] Skip test_pack_segments until the issue is fixed (#2032) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2032 `test_pack_segments` fails on cpu CI. Skip the test temporarily to unblock nightly releases while the issue is being investigated. 
https://github.com/pytorch/FBGEMM/actions/runs/6254900720 Reviewed By: q10 Differential Revision: D49524460 fbshipit-source-id: 1c8f976dbed280c989ad0cf6fd14e852473cd686 --- fbgemm_gpu/test/sparse_ops_test.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index 59f633ece3..9ef594259f 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -26,12 +26,17 @@ from fbgemm_gpu import open_source # noqa: F401 # pyre-ignore[21] - from test_utils import gpu_available, gpu_unavailable, skipIfRocm + from test_utils import gpu_available, gpu_unavailable, running_on_github, skipIfRocm except Exception: torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:index_select_ops") - from fbgemm_gpu.test.test_utils import gpu_available, gpu_unavailable, skipIfRocm + from fbgemm_gpu.test.test_utils import ( + gpu_available, + gpu_unavailable, + running_on_github, + skipIfRocm, + ) def unbucketize_indices_value( @@ -1733,6 +1738,7 @@ def _pack_segments_ref( ), torch_compile=st.booleans(), ) + @unittest.skipIf(*running_on_github) @settings(deadline=None) def test_pack_segments( self, From f325a7966d2eb521a96134978bc9e1c653538d7e Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 21 Sep 2023 21:36:23 -0700 Subject: [PATCH 39/94] Fix nova script (#2033) Summary: - Fix a syntax issue with bash associative arrays that caused the Nova build scripts to fail Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2033 Reviewed By: spcyppt Differential Revision: D49529109 Pulled By: q10 fbshipit-source-id: d2e1ac49e3490bc89b266c7596a19117efa0caa8 --- .github/scripts/fbgemm_gpu_build.bash | 2 +- .github/scripts/fbgemm_gpu_test.bash | 21 +++++---- .github/scripts/nova_prescript.bash | 8 ++-- .github/scripts/test_torchrec.bash | 64 +++++++++------------------ .github/scripts/utils_cuda.bash | 2 +- 5 files changed, 38 insertions(+), 59 deletions(-) diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 5354c24ab6..b783651d8c 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -290,7 +290,7 @@ build_fbgemm_gpu_package () { fbgemm_variant="$3" fbgemm_variant_targets="$4" if [ "$fbgemm_variant" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME VARIANT [TARGETS]" + echo "Usage: ${FUNCNAME[0]} ENV_NAME RELEASE_TYPE VARIANT [VARIANT_TARGETS]" echo "Example(s):" echo " ${FUNCNAME[0]} build_env nightly cpu # Nightly CPU-only variant" echo " ${FUNCNAME[0]} build_env nightly cuda # Nightly CUDA variant for default target(s)" diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index be5a95ab1b..5c589dc858 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -129,13 +129,14 @@ run_fbgemm_gpu_tests () { ################################################################################ test_setup_conda_environment () { - local python_version="$1" - local pytorch_installer="$2" - local pytorch_version="$3" - local pytorch_variant_type="$4" - local pytorch_variant_version="$5" + local env_name="$1" + local python_version="$2" + local pytorch_installer="$3" + local pytorch_version="$4" + local pytorch_variant_type="$5" + local pytorch_variant_version="$6" if [ 
"$pytorch_variant_type" == "" ]; then - echo "Usage: ${FUNCNAME[0]} PYTHON_VERSION PYTORCH_INSTALLER PYTORCH_VERSION PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]" + echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTHON_VERSION PYTORCH_INSTALLER PYTORCH_VERSION PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]" echo "Example(s):" echo " ${FUNCNAME[0]} build_env 3.8 pip test cuda 11.8.0 # Setup environment with pytorch-test for Python 3.8 + CUDA 11.8.0" return 1 @@ -148,9 +149,11 @@ test_setup_conda_environment () { echo "" fi - local env_name="test_py${python_version}_${pytorch_installer}_pytorch_${pytorch_version}_${pytorch_variant_type}" - if [ "$pytorch_variant_version" != "" ]; then - local env_name="${env_name}_${pytorch_variant_version}" + if [ "$env_name" == "" ]; then + local env_name="test_py${python_version}_${pytorch_installer}_pytorch_${pytorch_version}_${pytorch_variant_type}" + if [ "$pytorch_variant_version" != "" ]; then + local env_name="${env_name}_${pytorch_variant_version}" + fi fi echo "Creating the Build Environment: ${env_name} ..." diff --git a/.github/scripts/nova_prescript.bash b/.github/scripts/nova_prescript.bash index 9cb9b84008..f52e3b163a 100644 --- a/.github/scripts/nova_prescript.bash +++ b/.github/scripts/nova_prescript.bash @@ -57,15 +57,15 @@ if [[ $CU_VERSION = cu* ]]; then echo "[NOVA] ------------------------------------------" echo "[NOVA] Building the CUDA variant of FBGEMM_GPU ..." - fbgemm_variant="cuda" + export fbgemm_variant="cuda" elif [[ $CU_VERSION = rocm* ]]; then echo "[NOVA] Building the ROCm variant of FBGEMM_GPU ..." - fbgemm_variant="rocm" + export fbgemm_variant="rocm" else echo "[NOVA] Building the CPU variant of FBGEMM_GPU ..." - fbgemm_variant="cpu" + export fbgemm_variant="cpu" fi # Install the necessary Python eggs for building @@ -78,7 +78,7 @@ export BUILD_FROM_NOVA # Build FBGEMM_GPU nightly by default if [[ ${CHANNEL} == "" ]]; then - CHANNEL="nightly" + export CHANNEL="nightly" fi # Build the wheel diff --git a/.github/scripts/test_torchrec.bash b/.github/scripts/test_torchrec.bash index c298c50495..9bdb63cbd1 100644 --- a/.github/scripts/test_torchrec.bash +++ b/.github/scripts/test_torchrec.bash @@ -11,41 +11,8 @@ set -e # shellcheck source=/dev/null . 
"$(dirname "$(realpath -s "$0")")/setup_env.bash" -create_conda_pytorch_environment () { - local env_name="$1" - local python_version="$2" - local pytorch_channel_name="$3" - local cuda_version="$4" - if [ "$python_version" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTHON_VERSION PYTORCH_CHANNEL_NAME CUDA_VERSION" - echo "Example:" - echo " ${FUNCNAME[0]} build_env 3.10 pytorch-nightly 11.7.1" - return 1 - fi - - # Create the Conda environment - create_conda_environment "${env_name}" "${python_version}" - - # Convert the channels to versions - if [ "${pytorch_channel_name}" == "pytorch-nightly" ]; then - pytorch_version="nightly" - elif [ "${pytorch_channel_name}" == "pytorch-test" ]; then - pytorch_version="test" - else - pytorch_version="latest" - fi - - if [ "${cuda_version}" == "" ]; then - # Install the CPU variant of PyTorch - install_pytorch_conda "${env_name}" "${pytorch_version}" cpu - else - # Install CUDA and the GPU variant of PyTorch - install_cuda "${env_name}" "${cuda_version}" - install_pytorch_conda "${env_name}" "${pytorch_version}" - fi -} - verbose=0 +env_name=test_binary torchrec_package_name="" python_version="" cuda_version="x" @@ -53,7 +20,8 @@ fbgemm_wheel_path="x" miniconda_prefix="${HOME}/miniconda" usage () { - echo "Usage: bash test_torchrec.bash -o PACKAGE_NAME -p PYTHON_VERSION -P PYTORCH_CHANNEL_NAME -c CUDA_VERSION -w FBGEMM_WHEEL_PATH [-m MINICONDA_PREFIX] [-v] [-h]" + # shellcheck disable=SC2086 + echo "Usage: bash $(basename ${BASH_SOURCE[0]}) -o PACKAGE_NAME -p PYTHON_VERSION -P PYTORCH_CHANNEL_NAME -c CUDA_VERSION -w FBGEMM_WHEEL_PATH [-m MINICONDA_PREFIX] [-v] [-h]" echo "-v : verbose" echo "-h : help" echo "PACKAGE_NAME : output package name of TorchRec (e.g., torchrec_nightly)" @@ -65,7 +33,8 @@ usage () { echo "FBGEMM_WHEEL_PATH : path to FBGEMM_GPU's wheel file" echo "MINICONDA_PREFIX : path to install Miniconda (default: \$HOME/miniconda)" echo "Example: Python 3.10 + PyTorch nightly (CUDA 11.7), install miniconda at \$HOME/miniconda, using dist/fbgemm_gpu_nightly.whl" - echo " bash test_torchrec.bash -v -o torchrec_nightly -p 3.10 -P pytorch-nightly -c 11.7 -w dist/fbgemm_gpu_nightly.whl" + # shellcheck disable=SC2086 + echo " bash $(basename ${BASH_SOURCE[0]}) -v -o torchrec_nightly -p 3.10 -P pytorch-nightly -c 11.7 -w dist/fbgemm_gpu_nightly.whl" } while getopts vho:p:P:c:m:w: flag @@ -112,30 +81,37 @@ echo "## 1. Set up Miniconda" setup_miniconda "$miniconda_prefix" ################################################################################ -echo "## 2. Create test_binary environment" +echo "## 2. Create Conda environment" ################################################################################ -create_conda_pytorch_environment test_binary "$python_version" "$pytorch_channel_name" "$cuda_version" +if [ "${cuda_version}" == "" ]; then + pytorch_variant="cuda ${cuda_version}" +else + pytorch_variant="cpu" +fi + +# shellcheck disable=SC2086 +test_setup_conda_environment "$env_name" "$python_version" pip "$pytorch_channel_name" $pytorch_variant # Comment out FBGEMM_GPU since we will install it from "$fbgemm_wheel_path" sed -i 's/fbgemm-gpu/#fbgemm-gpu/g' requirements.txt -conda run -n test_binary python -m pip install -r requirements.txt +conda run -n "$env_name" python -m pip install -r requirements.txt # Install FBGEMM_GPU from a local wheel file. 
-conda run -n test_binary python -m pip install "$fbgemm_wheel_path" -conda run -n test_binary python -c "import fbgemm_gpu" +conda run -n "$env_name" python -m pip install "$fbgemm_wheel_path" +conda run -n "$env_name" python -c "import fbgemm_gpu" ################################################################################ echo "## 3. Build TorchRec" ################################################################################ rm -rf dist -conda run -n test_binary python setup.py bdist_wheel --package_name "${torchrec_package_name}" --python-tag="py${python_tag}" +conda run -n "$env_name" python setup.py bdist_wheel --package_name "${torchrec_package_name}" --python-tag="py${python_tag}" ################################################################################ echo "## 4. Import TorchRec" ################################################################################ -conda run -n test_binary python -m pip install dist/"${torchrec_package_name}"*.whl -conda run -n test_binary python -c "import torchrec" +conda run -n "$env_name" python -m pip install dist/"${torchrec_package_name}"*.whl +conda run -n "$env_name" python -c "import torchrec" echo "Test succeeded" diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash index 285c9a5eac..10efbcf906 100644 --- a/.github/scripts/utils_cuda.bash +++ b/.github/scripts/utils_cuda.bash @@ -102,7 +102,7 @@ install_cudnn () { # Install cuDNN manually # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh - local cudnn_packages=( + declare -A cudnn_packages=( ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-${PLATFORM_NAME_LC}-8.3.2.44_cuda11.5-archive.tar.xz" ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-${PLATFORM_NAME_LC}-8.3.2.44_cuda11.5-archive.tar.xz" ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-${PLATFORM_NAME_LC}-8.5.0.96_cuda11-archive.tar.xz" From 4baa14850fb4ef88e918690c45ccf80e6ff1a262 Mon Sep 17 00:00:00 2001 From: Flavio Sales Truzzi Date: Fri, 22 Sep 2023 09:28:28 -0700 Subject: [PATCH 40/94] - Fix meta registration (#2035) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2035 Fixing wrong naming on meta registration Reviewed By: spcyppt Differential Revision: D49527418 fbshipit-source-id: 83f89c02e140156f85072484c4d8adba8e837ad4 --- fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp index 96b42429d6..87f9e314b7 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp @@ -68,7 +68,9 @@ Tensor batched_unary_embeddings_forward_meta( TORCH_LIBRARY_IMPL(fbgemm, Meta, m) { m.impl("pack_segments", TORCH_FN(fbgemm_gpu::pack_segments_forward_meta)); - m.impl("unpack_segments", TORCH_FN(fbgemm_gpu::pack_segments_backward_meta)); + m.impl( + "pack_segments_backward", + TORCH_FN(fbgemm_gpu::pack_segments_backward_meta)); m.impl( "asynchronous_complete_cumsum", TORCH_FN(fbgemm_gpu::asynchronous_complete_cumsum_meta)); From b568c530bbd9769f05d4c0fbcb3d52dfb2fb0138 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Fri, 22 Sep 2023 11:59:50 -0700 Subject: [PATCH 41/94] Fix read_write_bytes in device_with_specs for seq TBE (#2030) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2030 As titled Reviewed 
By: q10 Differential Revision: D49478803 fbshipit-source-id: 1af575a6b69cc225fe7d28c181dce54e6c052641 --- fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py index ec4d265535..277809733c 100644 --- a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py @@ -2798,7 +2798,7 @@ def device_with_spec( # noqa C901 ) else: read_write_bytes = ( - output_size_multiplier * B * sum(Ds) + param_size_multiplier * B * sum_DLs + output_size_multiplier * B * sum_DLs + param_size_multiplier * B * sum_DLs ) if use_variable_bag_sizes: From 56ee565fd1d75ff5106d3c7dd28222118261e18c Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Fri, 22 Sep 2023 17:06:44 -0700 Subject: [PATCH 42/94] Fix jagged_test_index_select_2d that hangs in OSS and revert skip tests (#2036) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2036 Before c++20, `std::atomic_flag` is initialized to an unspecified state, hence the loop `while (lock.test_and_set(std::memory_order_acquire)` is never broken and causes the test to hang in OSS. This diff properly initializes the `std::atomic_flag`. Reviewed By: q10, sryap Differential Revision: D49528661 fbshipit-source-id: ba2213cb9bf8c0abbd1e169db03f0e32dd2a7ebb --- fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp | 7 +++++++ fbgemm_gpu/test/jagged_tensor_ops_test.py | 6 +----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index f73cded5f3..084c0488b9 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -1152,6 +1152,13 @@ void jagged_index_add_2d_kernel( const auto num_cols = input.size(1); // Allocate one lock per row std::atomic_flag* locks = new std::atomic_flag[output.size(0)]; + // Initialize all locks since before c++20 std::atomic_flag is initialized to + // an unspecified state. 
+ // https://en.cppreference.com/w/cpp/atomic/atomic_flag/atomic_flag + for (auto i = 0; i < output.size(0); i++) { + locks[i].clear(); + } + at::parallel_for(0, num_dense_input_rows, 0, [&](int64_t start, int64_t end) { for (const auto dense_input_offset : c10::irange(start, end)) { int index_pos; diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index 6ea981c12a..1e6b43c62a 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -27,7 +27,6 @@ gpu_available, gpu_unavailable, on_arm_platform, - running_on_github, symint_vector_unsupported, TEST_WITH_ROCM, ) @@ -38,7 +37,6 @@ gpu_available, gpu_unavailable, on_arm_platform, - running_on_github, symint_vector_unsupported, TEST_WITH_ROCM, ) @@ -1805,7 +1803,6 @@ def jagged_index_select_2d_ref( new_embeddings = torch.index_select(values, 0, all_indices) return new_embeddings - @unittest.skipIf(*running_on_github) @given( max_seq_length=st.integers(5, 10), batch_size=st.integers(1, 128), @@ -1826,7 +1823,7 @@ def jagged_index_select_2d_ref( if (gpu_available and TEST_WITH_ROCM) else st.just(True), ) - @settings(max_examples=20, deadline=None) + @settings(max_examples=20, deadline=None, verbosity=Verbosity.verbose) def test_jagged_index_select_2d( self, max_seq_length: int, @@ -1899,7 +1896,6 @@ def test_jagged_index_select_2d( atol=1e-2 if jagged_tensor_dtype in [torch.half, torch.bfloat16] else None, ) - @unittest.skipIf(*running_on_github) @given( max_seq_length=st.integers(5, 10), batch_size=st.integers(1, 128), From afdadcdfe1ec36f8d5fe7f97fb3c54f6d042ec67 Mon Sep 17 00:00:00 2001 From: Adnan Akhundov Date: Sun, 24 Sep 2023 06:11:45 -0700 Subject: [PATCH 43/94] Add meta-functions for asynchronous_*_cumsum ops (#2028) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2028 ATT Reviewed By: xw285cornell Differential Revision: D49467255 fbshipit-source-id: 6a25874dc80b37e9d848ede23f688f32a49118f0 --- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 11 -------- fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp | 25 +++++++++++++++++++ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index eab2829497..a560101fca 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -1126,17 +1126,6 @@ Tensor asynchronous_complete_cumsum_cpu(const Tensor& t_in) { return output; } -Tensor asynchronous_complete_cumsum_meta(const Tensor& t_in) { - const auto num_dims = t_in.dim(); - TORCH_CHECK(num_dims == 1 || num_dims == 2); - - auto output = num_dims == 1 - ? at::zeros_symint({t_in.sym_numel() + 1}, t_in.options()) - : at::zeros_symint( - {t_in.sym_size(0), t_in.sym_size(1) + 1}, t_in.options()); - return output; -} - template void reorder_batched_ad_lengths_( const Tensor& cat_ad_lengths, diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp index 87f9e314b7..85e2f9c60b 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp @@ -20,6 +20,17 @@ using Tensor = at::Tensor; namespace fbgemm_gpu { +Tensor asynchronous_complete_cumsum_meta(const Tensor& t_in) { + const auto num_dims = t_in.dim(); + TORCH_CHECK(num_dims == 1 || num_dims == 2); + + auto output = num_dims == 1 + ? 
at::zeros_symint({t_in.sym_numel() + 1}, t_in.options()) + : at::zeros_symint( + {t_in.sym_size(0), t_in.sym_size(1) + 1}, t_in.options()); + return output; +} + namespace { Tensor pack_segments_forward_meta( @@ -62,6 +73,14 @@ Tensor batched_unary_embeddings_forward_meta( return at::empty_symint({N, B, T}, weight.options()); } +Tensor asynchronous_inclusive_cumsum_meta(const Tensor& t_in) { + return at::empty_symint(t_in.sym_sizes(), t_in.options()); +} + +Tensor asynchronous_exclusive_cumsum_meta(const Tensor& t_in) { + return at::empty_symint(t_in.sym_sizes(), t_in.options()); +} + } // namespace } // namespace fbgemm_gpu @@ -71,6 +90,12 @@ TORCH_LIBRARY_IMPL(fbgemm, Meta, m) { m.impl( "pack_segments_backward", TORCH_FN(fbgemm_gpu::pack_segments_backward_meta)); + m.impl( + "asynchronous_inclusive_cumsum", + TORCH_FN(fbgemm_gpu::asynchronous_inclusive_cumsum_meta)); + m.impl( + "asynchronous_exclusive_cumsum", + TORCH_FN(fbgemm_gpu::asynchronous_exclusive_cumsum_meta)); m.impl( "asynchronous_complete_cumsum", TORCH_FN(fbgemm_gpu::asynchronous_complete_cumsum_meta)); From 3647f5a377880430ed2130d6335fffb0bbfc138f Mon Sep 17 00:00:00 2001 From: Adnan Akhundov Date: Sun, 24 Sep 2023 06:22:43 -0700 Subject: [PATCH 44/94] Add meta-function for pad_sequence op (#2029) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2029 ATT Reviewed By: xw285cornell Differential Revision: D49468106 fbshipit-source-id: eb4bbf61c51bb2e7d158cd15a417efdd79306a3b --- fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h index add79aaa30..1636d3b476 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h @@ -428,6 +428,12 @@ at::Tensor jagged_2d_to_dense( at::Tensor offsets, c10::SymInt max_sequence_length); +at::Tensor jagged_1d_to_dense_meta( + at::Tensor values, + at::Tensor offsets, + c10::SymInt max_L, + int64_t padding_value); + at::Tensor jagged_2d_to_dense_meta( at::Tensor values, at::Tensor offsets, From fed7ad61aaa21a2c8be1a607c8f38729de975eac Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Mon, 25 Sep 2023 11:43:37 -0700 Subject: [PATCH 45/94] Support variable batch_size for block_bucketize_sparse_features (#2012) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2012 This diff add support variable batch size for block bucketize_sparse features for RW sharding. 
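A minimal sketch of the new calling convention, mirroring the CPU unit test added in this diff (the tensor values are taken from that test; the variable names on the left-hand side are illustrative). On the GPU path, `max_B` (the maximum per-feature batch size) must also be passed, since the kernel checks `max_B > 0` whenever `batch_sizes` is provided.

```python
import torch
import fbgemm_gpu  # noqa: F401  # assumes fbgemm_gpu is installed so the ops are registered

# Three features (T = 3) with per-feature batch sizes [3, 1, 2] -> 6 length entries in total.
lengths = torch.tensor([2, 1, 1, 2, 0, 2], dtype=torch.int)
indices = torch.tensor([1, 8, 5, 6, 7, 8, 8, 4], dtype=torch.int)
batch_sizes = torch.tensor([3, 1, 2], dtype=torch.int)
block_sizes = torch.tensor([5, 10, 8], dtype=torch.int)
my_size = 2  # number of row-wise shards

(
    new_lengths,
    new_indices,
    new_weights,
    new_pos,
    unbucketize_permute,
) = torch.ops.fbgemm.block_bucketize_sparse_features(
    lengths,
    indices,
    False,        # bucketize_pos
    False,        # sequence
    block_sizes,
    my_size,
    None,         # weights
    batch_sizes,  # new: per-feature batch sizes for variable batch size
)
```

When `batch_sizes` is omitted, the op keeps its previous behavior and assumes a uniform batch size of `lengths.numel() / block_sizes.numel()` per feature.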
Reviewed By: sryap Differential Revision: D48683632 fbshipit-source-id: 14887f126ada0dab557ca7d435315854b42488cc --- fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h | 32 ++-- .../sparse_block_bucketize_features.cu | 152 ++++++++++++++---- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 91 +++++++---- fbgemm_gpu/test/sparse_ops_test.py | 87 ++++++++++ 4 files changed, 281 insertions(+), 81 deletions(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h index 1636d3b476..cc01a595bb 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h @@ -138,13 +138,15 @@ std::tuple< ///@ingroup sparse-data-cuda block_bucketize_sparse_features_cuda( - at::Tensor lengths, - at::Tensor indices, - bool bucketize_pos, - bool sequence, - at::Tensor block_sizes, - int64_t my_size, - c10::optional weights); + const at::Tensor& lengths, + const at::Tensor& indices, + const bool bucketize_pos, + const bool sequence, + const at::Tensor& block_sizes, + const int64_t my_size, + const c10::optional& weights, + const c10::optional& batch_size_per_feature, + const int64_t max_batch_size); std::tuple< at::Tensor, @@ -155,13 +157,15 @@ std::tuple< ///@ingroup sparse-data-cpu block_bucketize_sparse_features_cpu( - at::Tensor lengths, - at::Tensor indices, - bool bucketize_pos, - bool sequence, - at::Tensor block_sizes, - int64_t my_size, - c10::optional weights); + const at::Tensor& lengths, + const at::Tensor& indices, + const bool bucketize_pos, + const bool sequence, + const at::Tensor& block_sizes, + const int64_t my_size, + const c10::optional& weights, + const c10::optional& batch_size_per_feature, + const int64_t max_batch_size); std::tuple< at::Tensor, diff --git a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu index 8b0b0c35f4..9fd8d80dc5 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu @@ -12,6 +12,28 @@ using Tensor = at::Tensor; namespace fbgemm_gpu { +// Kernel for calulating lengthh idx to feature id mapping. Used for block +// bucketize sparse features with variable batch size for row-wise partition +template +__global__ +__launch_bounds__(kMaxThreads) void _populate_length_to_feature_id_inplace_kernel( + const uint64_t max_B, + const int T, + const offset_t* const __restrict__ batch_sizes, + const offset_t* const __restrict__ batch_size_offsets, + offset_t* const __restrict__ length_to_feature_idx) { + const auto b_t = blockIdx.x * blockDim.x + threadIdx.x; + + const auto t = b_t / max_B; + const auto b = b_t % max_B; + + if (t >= T || b >= batch_sizes[t]) { + return; + } + + length_to_feature_idx[batch_size_offsets[t] + b] = t; +} + // Kernel for bucketize lengths, with the Block distribution (vs. cyclic, // block-cyclic distribution). 
Used for bucketize sparse feature, especially for // checkpointing with row-wise partition (sparse_feature is partitioned @@ -19,16 +41,17 @@ namespace fbgemm_gpu { template __global__ __launch_bounds__(kMaxThreads) void _block_bucketize_sparse_features_cuda_kernel1( - int32_t lengths_size, - int32_t B, - const index_t* __restrict__ block_sizes_data, - int my_size, - const offset_t* __restrict__ offsets_data, - const index_t* __restrict__ indices_data, - offset_t* __restrict__ new_lengths_data) { + const int32_t lengths_size, + const int32_t B, + const index_t* const __restrict__ block_sizes_data, + const int my_size, + const offset_t* const __restrict__ offsets_data, + const index_t* const __restrict__ indices_data, + offset_t* const __restrict__ new_lengths_data, + offset_t* __restrict__ length_to_feature_idx) { using uindex_t = std::make_unsigned_t; CUDA_KERNEL_LOOP(b_t, lengths_size) { - int32_t t = b_t / B; + const auto t = length_to_feature_idx ? length_to_feature_idx[b_t] : b_t / B; index_t blk_size = block_sizes_data[t]; offset_t rowstart = (b_t == 0 ? 0 : offsets_data[b_t - 1]); offset_t rowend = offsets_data[b_t]; @@ -71,11 +94,12 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_sparse_features_cuda_kernel index_t* __restrict__ new_indices_data, scalar_t* __restrict__ new_weights_data, index_t* __restrict__ new_pos_data, - index_t* __restrict__ unbucketize_permute_data) { + index_t* const __restrict__ unbucketize_permute_data, + const offset_t* const __restrict__ length_to_feature_idx) { using uindex_t = std::make_unsigned_t; using uoffset_t = std::make_unsigned_t; CUDA_KERNEL_LOOP(b_t, lengths_size) { - int32_t t = b_t / B; + const auto t = length_to_feature_idx ? length_to_feature_idx[b_t] : b_t / B; index_t blk_size = block_sizes_data[t]; offset_t rowstart = (b_t == 0 ? 
0 : offsets_data[b_t - 1]); offset_t rowend = offsets_data[b_t]; @@ -115,22 +139,24 @@ DLL_PUBLIC std::tuple< c10::optional, c10::optional> block_bucketize_sparse_features_cuda( - Tensor lengths, - Tensor indices, - bool bucketize_pos, - bool sequence, - Tensor block_sizes, - int64_t my_size, - c10::optional weights) { + const Tensor& lengths, + const Tensor& indices, + const bool bucketize_pos, + const bool sequence, + const Tensor& block_sizes, + const int64_t my_size, + const c10::optional& weights, + const c10::optional& batch_sizes, + const int64_t max_B) { TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(lengths, indices); at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(lengths.get_device()); // allocate tensors and buffers - const int lengths_size = lengths.numel(); - const int T = block_sizes.numel(); - const int B = lengths_size / T; - const int new_lengths_size = lengths_size * my_size; + const auto lengths_size = lengths.numel(); + const auto T = block_sizes.numel(); + const auto B = lengths_size / T; + const auto new_lengths_size = lengths_size * my_size; auto offsets = at::empty({lengths_size}, lengths.options()); auto new_lengths = at::zeros({new_lengths_size}, lengths.options()); auto new_offsets = at::empty({new_lengths_size}, lengths.options()); @@ -138,13 +164,48 @@ block_bucketize_sparse_features_cuda( auto lengths_contig = lengths.contiguous(); auto indices_contig = indices.contiguous(); auto offsets_contig = offsets.contiguous(); + auto batch_sizes_contig = + batch_sizes.value_or(at::empty({T}, lengths.options())).contiguous(); + auto batch_sizes_offsets_contig = + at::empty({T}, batch_sizes_contig.options()); Tensor new_weights; Tensor new_pos; Tensor unbucketize_permute; // count nonzeros offsets_contig = asynchronous_inclusive_cumsum_gpu(lengths); - int threads_per_block = 256; - int num_blocks = (lengths_size + threads_per_block - 1) / threads_per_block; + if (batch_sizes.has_value()) { + TORCH_CHECK(max_B > 0); + batch_sizes_offsets_contig = + asynchronous_exclusive_cumsum_gpu(batch_sizes.value()); + } + auto length_to_feature_idx = + at::empty({lengths_size}, lengths_contig.options()); + if (batch_sizes.has_value()) { + constexpr auto threads_per_block = 256; + const auto num_blocks = + cuda_calc_xblock_count(max_B * T, threads_per_block); + AT_DISPATCH_INDEX_TYPES( + offsets_contig.scalar_type(), + "_populate_length_to_feature_id_inplace_kernel", + [&] { + using offset_t = index_t; + _populate_length_to_feature_id_inplace_kernel<<< + num_blocks, + threads_per_block, + 0, + at::cuda::getCurrentCUDAStream()>>>( + max_B, + T, + batch_sizes_contig.data_ptr(), + batch_sizes_offsets_contig.data_ptr(), + length_to_feature_idx.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } + + constexpr auto threads_per_block = 256; + const auto num_blocks = + cuda_calc_xblock_count(lengths_size, threads_per_block); AT_DISPATCH_INDEX_TYPES( offsets_contig.scalar_type(), "_block_bucketize_sparse_features_cuda_kernel1", @@ -165,7 +226,10 @@ block_bucketize_sparse_features_cuda( my_size, offsets_contig.data_ptr(), indices_contig.data_ptr(), - new_lengths.data_ptr()); + new_lengths.data_ptr(), + batch_sizes.has_value() + ? length_to_feature_idx.data_ptr() + : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); @@ -215,7 +279,10 @@ block_bucketize_sparse_features_cuda( new_indices.data_ptr(), new_weights.data_ptr(), new_pos.data_ptr(), - unbucketize_permute.data_ptr()); + unbucketize_permute.data_ptr(), + batch_sizes.has_value() + ? 
length_to_feature_idx.data_ptr() + : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); @@ -259,7 +326,10 @@ block_bucketize_sparse_features_cuda( new_indices.data_ptr(), new_weights.data_ptr(), nullptr, - unbucketize_permute.data_ptr()); + unbucketize_permute.data_ptr(), + batch_sizes.has_value() + ? length_to_feature_idx.data_ptr() + : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); @@ -297,7 +367,10 @@ block_bucketize_sparse_features_cuda( new_indices.data_ptr(), nullptr, new_pos.data_ptr(), - unbucketize_permute.data_ptr()); + unbucketize_permute.data_ptr(), + batch_sizes.has_value() + ? length_to_feature_idx.data_ptr() + : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); @@ -333,7 +406,10 @@ block_bucketize_sparse_features_cuda( new_indices.data_ptr(), nullptr, nullptr, - unbucketize_permute.data_ptr()); + unbucketize_permute.data_ptr(), + batch_sizes.has_value() + ? length_to_feature_idx.data_ptr() + : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); @@ -379,7 +455,10 @@ block_bucketize_sparse_features_cuda( new_indices.data_ptr(), new_weights.data_ptr(), new_pos.data_ptr(), - nullptr); + nullptr, + batch_sizes.has_value() + ? length_to_feature_idx.data_ptr() + : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); @@ -423,7 +502,10 @@ block_bucketize_sparse_features_cuda( new_indices.data_ptr(), new_weights.data_ptr(), nullptr, - nullptr); + nullptr, + batch_sizes.has_value() + ? length_to_feature_idx.data_ptr() + : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); @@ -461,7 +543,10 @@ block_bucketize_sparse_features_cuda( new_indices.data_ptr(), nullptr, new_pos.data_ptr(), - nullptr); + nullptr, + batch_sizes.has_value() + ? length_to_feature_idx.data_ptr() + : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); @@ -497,7 +582,10 @@ block_bucketize_sparse_features_cuda( new_indices.data_ptr(), nullptr, nullptr, - nullptr); + nullptr, + batch_sizes.has_value() + ? 
length_to_feature_idx.data_ptr() + : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index a560101fca..70d11d2681 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -203,17 +203,18 @@ template < typename index_t, typename scalar_t> void _block_bucketize_sparse_features_cpu( - Tensor lengths, - Tensor indices, - c10::optional weights, - bool bucketize_pos, - Tensor block_sizes, - int64_t my_size, + const Tensor& lengths, + const Tensor& indices, + const c10::optional& weights, + const bool bucketize_pos, + const Tensor& block_sizes, + const int64_t my_size, Tensor new_lengths, Tensor new_indices, c10::optional new_weights, c10::optional new_pos, - c10::optional unbucketize_permute) { + const c10::optional& unbucketize_permute, + const c10::optional& batch_sizes) { // allocate tensors and buffers const auto lengths_size = lengths.numel(); const auto new_lengths_size = lengths_size * my_size; @@ -224,21 +225,24 @@ void _block_bucketize_sparse_features_cpu( const offset_t* lengths_data = lengths.data_ptr(); offset_t* offsets_data = offsets.data_ptr(); const index_t* indices_data = indices.data_ptr(); - scalar_t* weights_data; - scalar_t* new_weights_data; - index_t* new_pos_data; - index_t* unbucketize_permute_data; - offset_t* new_lengths_data = new_lengths.data_ptr(); - offset_t* new_offsets_data = new_offsets.data_ptr(); - index_t* new_indices_data = new_indices.data_ptr(); - index_t* block_sizes_data = block_sizes.data_ptr(); + scalar_t* weights_data = nullptr; + scalar_t* new_weights_data = nullptr; + index_t* new_pos_data = nullptr; + index_t* unbucketize_permute_data = nullptr; + offset_t* const new_lengths_data = new_lengths.data_ptr(); + offset_t* const new_offsets_data = new_offsets.data_ptr(); + index_t* const new_indices_data = new_indices.data_ptr(); + const index_t* const block_sizes_data = block_sizes.data_ptr(); + offset_t* batch_sizes_data = nullptr; + const auto variable_batch_size = batch_sizes.has_value(); + using uindex_t = std::make_unsigned_t; using uoffset_t = std::make_unsigned_t; - if (sequence) { + if constexpr (sequence) { unbucketize_permute_data = unbucketize_permute.value().data_ptr(); } - if (has_weight) { + if constexpr (has_weight) { weights_data = weights.value().data_ptr(); new_weights_data = new_weights.value().data_ptr(); } @@ -246,13 +250,19 @@ void _block_bucketize_sparse_features_cpu( new_pos_data = new_pos.value().data_ptr(); } + if (variable_batch_size) { + batch_sizes_data = batch_sizes.value().data_ptr(); + } + // count nonzeros prefix_sum(lengths_size, lengths_data, offsets_data); assert(offsets_data[lengths_size] == indices.numel()); + int64_t cur_offset = 0; for (const auto t : c10::irange(T)) { - auto blk_size = block_sizes_data[t]; - for (const auto b : c10::irange(B)) { - const auto b_t = t * B + b; + const auto blk_size = block_sizes_data[t]; + const auto cur_batch_size = variable_batch_size ? batch_sizes_data[t] : B; + for (const auto b : c10::irange(cur_batch_size)) { + const auto b_t = (variable_batch_size ? 
cur_offset : t * B) + b; const offset_t rowstart = offsets_data[b_t]; const offset_t rowend = offsets_data[b_t + 1]; for (const auto i : c10::irange(rowstart, rowend)) { @@ -269,15 +279,18 @@ void _block_bucketize_sparse_features_cpu( new_lengths_data[p * lengths_size + b_t]++; } } + cur_offset += cur_batch_size; } // bucketize nonzeros prefix_sum(new_lengths_size, new_lengths_data, new_offsets_data); assert(new_offsets_data[new_lengths_size] == new_indices.numel()); + cur_offset = 0; for (const auto t : c10::irange(T)) { - auto blk_size = block_sizes_data[t]; - for (const auto b : c10::irange(B)) { - const auto b_t = t * B + b; + const auto blk_size = block_sizes_data[t]; + const auto cur_batch_size = variable_batch_size ? batch_sizes_data[t] : B; + for (const auto b : c10::irange(cur_batch_size)) { + const auto b_t = (variable_batch_size ? cur_offset : t * B) + b; const offset_t rowstart = offsets_data[b_t]; const offset_t rowend = offsets_data[b_t + 1]; for (const auto i : c10::irange(rowstart, rowend)) { @@ -308,6 +321,7 @@ void _block_bucketize_sparse_features_cpu( } } } + cur_offset += cur_batch_size; } } @@ -819,13 +833,16 @@ std::tuple< c10::optional, c10::optional> block_bucketize_sparse_features_cpu( - Tensor lengths, - Tensor indices, - bool bucketize_pos, - bool sequence, - Tensor block_sizes, - int64_t my_size, - c10::optional weights) { + const Tensor& lengths, + const Tensor& indices, + const bool bucketize_pos, + const bool sequence, + const Tensor& block_sizes, + const int64_t my_size, + const c10::optional& weights, + const c10::optional& batch_sizes, + const int64_t /* max_batch_size */ // Only used in GPU variant +) { const auto lengths_size = lengths.numel(); const auto new_lengths_size = lengths_size * my_size; auto new_lengths = at::zeros({new_lengths_size}, lengths.options()); @@ -871,7 +888,8 @@ block_bucketize_sparse_features_cpu( new_indices, new_weights, new_pos, - unbucketize_permute); + unbucketize_permute, + batch_sizes); }); }); }); @@ -905,7 +923,8 @@ block_bucketize_sparse_features_cpu( new_indices, new_weights, new_pos, - unbucketize_permute); + unbucketize_permute, + batch_sizes); }); }); }); @@ -937,7 +956,8 @@ block_bucketize_sparse_features_cpu( new_indices, new_weights, new_pos, - unbucketize_permute); + unbucketize_permute, + batch_sizes); }); }); } else { @@ -964,7 +984,8 @@ block_bucketize_sparse_features_cpu( new_indices, new_weights, new_pos, - unbucketize_permute); + unbucketize_permute, + batch_sizes); }); }); } @@ -2603,7 +2624,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, int output_size) -> Tensor"); m.def( - "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, int my_size, Tensor? weights=None) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)"); + "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, int my_size, Tensor? weights=None, Tensor? batch_sizes=None, int max_B= -1) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)"); m.def( "bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, int my_size, Tensor? 
weights=None) -> (Tensor, Tensor, Tensor?, Tensor?)"); m.def("asynchronous_exclusive_cumsum(Tensor t_in) -> Tensor"); diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index 9ef594259f..e4981a2a04 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -844,6 +844,93 @@ def test_block_bucketize_sparse_features( unbucketized_indices, indices, rtol=0, atol=0 ) + @given( + index_type=st.sampled_from([torch.int, torch.long]), + has_weight=st.booleans(), + bucketize_pos=st.booleans(), + sequence=st.booleans(), + ) + @settings(verbosity=Verbosity.verbose, max_examples=16, deadline=None) + def test_block_bucketize_sparse_features_with_variable_batch_sizes( + self, + index_type: Optional[torch.dtype], + has_weight: bool, + bucketize_pos: bool, + sequence: bool, + ) -> None: + lengths = torch.tensor([2, 1, 1, 2, 0, 2], dtype=index_type) + indices = torch.tensor( + [1, 8, 5, 6, 7, 8, 8, 4], + dtype=index_type, + ) + batch_sizes = torch.tensor([3, 1, 2], dtype=index_type) + weights = ( + torch.tensor( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + dtype=torch.float, + ) + if has_weight + else None + ) + + block_sizes = torch.tensor([5, 10, 8], dtype=index_type) + my_size = 2 + max_B = batch_sizes.max().item() + + new_lengths_ref = torch.tensor( + [1, 0, 0, 2, 0, 1, 1, 1, 1, 0, 0, 1], + dtype=index_type, + ) + new_indices_ref = torch.tensor( + [1, 7, 8, 4, 3, 0, 1, 0], + dtype=index_type, + ) + + ( + new_lengths_cpu, + new_indices_cpu, + new_weights_cpu, + new_pos_cpu, + unbucketize_permute, + ) = torch.ops.fbgemm.block_bucketize_sparse_features( + lengths, + indices, + bucketize_pos, + sequence, + block_sizes, + my_size, + weights, + batch_sizes, + ) + torch.testing.assert_close(new_lengths_cpu, new_lengths_ref, rtol=0, atol=0) + torch.testing.assert_close(new_indices_cpu, new_indices_ref, rtol=0, atol=0) + + if gpu_available: + ( + new_lengths_gpu, + new_indices_gpu, + new_weights_gpu, + new_pos_gpu, + unbucketize_permute_gpu, + ) = torch.ops.fbgemm.block_bucketize_sparse_features( + lengths.cuda(), + indices.cuda(), + bucketize_pos, + sequence, + block_sizes.cuda(), + my_size, + weights.cuda() if weights is not None else None, + batch_sizes.cuda(), + max_B, + ) + + torch.testing.assert_close( + new_lengths_gpu.cpu(), new_lengths_ref, rtol=0, atol=0 + ) + torch.testing.assert_close( + new_indices_gpu.cpu(), new_indices_ref, rtol=0, atol=0 + ) + @given( index_type=st.sampled_from([torch.int, torch.long]), has_weight=st.booleans(), From 85de33b8738c2e882f2944cfd5068439d26c7d01 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 25 Sep 2023 12:51:36 -0700 Subject: [PATCH 46/94] Fix incorrect SymInt signature on dense_to_jagged (#2039) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2039 Caught this when I was tightening up the error checking at https://github.com/pytorch/pytorch/pull/109727 Need to fix the problem before I land the improved error checking. 
Reviewed By: zou3519 Differential Revision: D49572882 fbshipit-source-id: 59345bf2bd7b969a6739f3d1cf8bf47c9cdb0e58 --- fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h | 2 +- fbgemm_gpu/src/jagged_tensor_ops/dense_to_jagged_forward.cu | 2 +- .../src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp | 4 ++-- fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp | 2 +- fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h index cc01a595bb..9164de0b65 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h @@ -463,7 +463,7 @@ std::tuple> jagged_dense_elementwise_mul( std::tuple> dense_to_jagged( const at::Tensor& dense, const std::vector& offsets, - const c10::optional& total_L); + c10::optional total_L); std::tuple> jagged_dense_elementwise_add_jagged_output( diff --git a/fbgemm_gpu/src/jagged_tensor_ops/dense_to_jagged_forward.cu b/fbgemm_gpu/src/jagged_tensor_ops/dense_to_jagged_forward.cu index a2291bd26a..fadee0695e 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/dense_to_jagged_forward.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/dense_to_jagged_forward.cu @@ -15,7 +15,7 @@ namespace fbgemm_gpu { Tensor dense_to_jagged_forward( const Tensor& dense, const std::vector& offsets, - const c10::optional& total_L) { + c10::optional total_L) { // D is the embedding dimension auto D = dense.size(-1); diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp index 5fdc92d759..45e12412c9 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp @@ -284,7 +284,7 @@ class DenseToJaggedOp : public torch::autograd::Function { .typed& offsets, - const c10::optional& total_L)>(); + c10::optional total_L)>(); auto output = op.call(dense, offsets, total_L); return {output}; @@ -763,7 +763,7 @@ Tensor batched_dense_vec_jagged_2d_mul( std::tuple> dense_to_jagged( const Tensor& dense, const std::vector& offsets, - const c10::optional& total_L) { + c10::optional total_L) { return {DenseToJaggedOp::apply(dense, offsets, total_L)[0], offsets}; } diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index 084c0488b9..00f2c4f960 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -464,7 +464,7 @@ at::Tensor jagged_to_padded_dense_backward( Tensor dense_to_jagged_forward( const Tensor& dense, const std::vector& offsets, - const c10::optional& total_L) { + c10::optional total_L) { // D is the embedding dimension auto D = dense.size(-1); diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp index 8046e5da93..bf6b4be2b4 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp @@ -95,7 +95,7 @@ Tensor jagged_dense_elementwise_add_meta( Tensor dense_to_jagged_forward_meta( const Tensor& dense, const std::vector& offsets, - const c10::optional& total_L) { + c10::optional total_L) { auto dense_values = dense; at::SymInt D = dense_values.sym_size(-1); TORCH_CHECK_NOT_IMPLEMENTED( @@ -110,7 +110,7 @@ Tensor 
dense_to_jagged_forward_meta( std::tuple> dense_to_jagged_meta( const Tensor& dense, const std::vector& offsets, - const c10::optional& total_L) { + c10::optional total_L) { return {dense_to_jagged_forward_meta(dense, offsets, total_L), offsets}; } From 9b681e53f2015e85558a4376f728229182978b28 Mon Sep 17 00:00:00 2001 From: Flavio Sales Truzzi Date: Mon, 25 Sep 2023 23:46:21 -0700 Subject: [PATCH 47/94] - Support for CPU/GPU compilation (#2040) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2040 Backwards was not working for CPU. Moving the Autograd registration to CPU fixes the issue. On CPU only compilation since the GPU code was not built there was no autograd registration. Also re-activated the test on github. Reviewed By: spcyppt Differential Revision: D49615498 fbshipit-source-id: 4e87c7abc4a2f1fbe99cfe34f39e429178aa1244 --- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 72 +++++++++++++++++++ fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp | 74 +------------------- fbgemm_gpu/test/sparse_ops_test.py | 10 +-- 3 files changed, 75 insertions(+), 81 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 70d11d2681..58b345367d 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -15,6 +15,7 @@ #include #include "ATen/Parallel.h" +#include #include #include "fbgemm_gpu/sparse_ops.h" #include "fbgemm_gpu/sparse_ops_utils.h" @@ -55,6 +56,73 @@ using Tensor = at::Tensor; namespace fbgemm_gpu { +// Custom PackSegments operator that is based on the Caffe2 PackSegments and +// UnpackSegments. +// Needed this to support backward pass. +class PackSegments : public torch::autograd::Function { + public: + static torch::autograd::variable_list forward( + torch::autograd::AutogradContext* ctx, + const Tensor& t_in, + const Tensor& lengths, + const at::SymInt& max_length) { + const at::SymInt total_length = t_in.sym_size(0); + + at::AutoDispatchBelowADInplaceOrView guard; + + static auto custom_pack_segments_op = + at::Dispatcher::singleton() + .findSchemaOrThrow("fbgemm::pack_segments", "") + .typed(); + + Tensor res = custom_pack_segments_op.call(t_in, lengths, max_length); + + ctx->saved_data["max_length"] = max_length; + ctx->saved_data["total_length"] = total_length; + ctx->save_for_backward({lengths}); + + return {res}; + } + + static torch::autograd::variable_list backward( + torch::autograd::AutogradContext* ctx, + torch::autograd::variable_list grad_output) { + TORCH_CHECK(grad_output.size() == 2 or grad_output.size() == 1); + const Tensor& grad = grad_output[0]; + const auto& max_length = ctx->saved_data["max_length"].toSymInt(); + const auto& total_length = ctx->saved_data["total_length"].toSymInt(); + + // Retrieve saved variables for backward. 
+ const auto& saved_variables = ctx->get_saved_variables(); + const auto& lengths = saved_variables[0]; + + torch::autograd::variable_list grad_inputs(5); + + static auto custom_pack_segments_backward_op = + at::Dispatcher::singleton() + .findSchemaOrThrow("fbgemm::pack_segments_backward", "") + .typed(); + + grad_inputs[0] = custom_pack_segments_backward_op.call( + grad, lengths, total_length, max_length); + return grad_inputs; + } +}; + +Tensor pack_segments_autograd( + const Tensor& t_in, + const Tensor& lengths, + const at::SymInt max_length + +) { + return PackSegments::apply(t_in, lengths, max_length)[0]; +} + Tensor native_empty_like(const Tensor& self) { return at::native::empty_like( self, @@ -2767,3 +2835,7 @@ TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { "group_index_select_dim0", fbgemm_gpu::group_index_select_dim0); DISPATCH_TO_CPU("bottom_k_per_row", fbgemm_gpu::bottom_k_per_row); } + +TORCH_LIBRARY_IMPL(fbgemm, Autograd, m) { + m.impl("pack_segments", &fbgemm_gpu::pack_segments_autograd); +} diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp index 8937708f5b..2284b64dd6 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp @@ -58,73 +58,6 @@ void offset_args( } } // namespace -// Custom PackSegments operator that is based on the Caffe2 PackSegments and -// UnpackSegments. -// Needed this to support backward pass. -class PackSegments : public torch::autograd::Function { - public: - static torch::autograd::variable_list forward( - torch::autograd::AutogradContext* ctx, - const Tensor& t_in, - const Tensor& lengths, - const at::SymInt& max_length) { - const at::SymInt total_length = t_in.sym_size(0); - - at::AutoDispatchBelowADInplaceOrView guard; - - static auto custom_pack_segments_op = - torch::Dispatcher::singleton() - .findSchemaOrThrow("fbgemm::pack_segments", "") - .typed(); - - Tensor res = custom_pack_segments_op.call(t_in, lengths, max_length); - - ctx->saved_data["max_length"] = max_length; - ctx->saved_data["total_length"] = total_length; - ctx->save_for_backward({lengths}); - - return {res}; - } - - static torch::autograd::variable_list backward( - torch::autograd::AutogradContext* ctx, - torch::autograd::variable_list grad_output) { - TORCH_CHECK(grad_output.size() == 2 or grad_output.size() == 1); - const Tensor& grad = grad_output[0]; - const auto& max_length = ctx->saved_data["max_length"].toSymInt(); - const auto& total_length = ctx->saved_data["total_length"].toSymInt(); - - // Retrieve saved variables for backward. 
- const auto& saved_variables = ctx->get_saved_variables(); - const auto& lengths = saved_variables[0]; - - torch::autograd::variable_list grad_inputs(5); - - static auto custom_pack_segments_backward_op = - torch::Dispatcher::singleton() - .findSchemaOrThrow("fbgemm::pack_segments_backward", "") - .typed(); - - grad_inputs[0] = custom_pack_segments_backward_op.call( - grad, lengths, total_length, max_length); - return grad_inputs; - } -}; - -torch::Tensor pack_segments_autograd( - const Tensor& t_in, - const Tensor& lengths, - const at::SymInt max_length - -) { - return PackSegments::apply(t_in, lengths, max_length)[0]; -} - class LookupFunctionBatchedUnaryEmbeddingOp : public torch::autograd::Function { public: @@ -610,8 +543,7 @@ Tensor pack_segments_cuda( const Tensor& t_in, const Tensor& lengths, const int64_t max_length) { - const auto& res = PackSegments::apply(t_in, lengths, max_length); - return res[0]; + return fbgemm_gpu::pack_segments_forward_cuda(t_in, lengths, max_length)[0]; } Tensor index_select_dim0_gpu( @@ -683,7 +615,3 @@ TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) { DISPATCH_TO_CUDA( "group_index_select_dim0", fbgemm_gpu::group_index_select_dim0_gpu); } - -TORCH_LIBRARY_IMPL(fbgemm, Autograd, m) { - m.impl("pack_segments", &fbgemm_gpu::pack_segments_autograd); -} diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index e4981a2a04..349bc45c1c 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -26,17 +26,12 @@ from fbgemm_gpu import open_source # noqa: F401 # pyre-ignore[21] - from test_utils import gpu_available, gpu_unavailable, running_on_github, skipIfRocm + from test_utils import gpu_available, gpu_unavailable, skipIfRocm except Exception: torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:index_select_ops") - from fbgemm_gpu.test.test_utils import ( - gpu_available, - gpu_unavailable, - running_on_github, - skipIfRocm, - ) + from fbgemm_gpu.test.test_utils import gpu_available, gpu_unavailable, skipIfRocm def unbucketize_indices_value( @@ -1825,7 +1820,6 @@ def _pack_segments_ref( ), torch_compile=st.booleans(), ) - @unittest.skipIf(*running_on_github) @settings(deadline=None) def test_pack_segments( self, From 58d47266a19f9e60eb2ebdfc26f1a83fbf461fdb Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 26 Sep 2023 00:37:08 -0700 Subject: [PATCH 48/94] Re-enable tests in `permute_pooled_embedding_test.py` (#2034) Summary: - Re-enable tests in `permute_pooled_embedding_test.py` by silencing the hypothesis warnings about multiple executors Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2034 Reviewed By: spcyppt Differential Revision: D49539164 Pulled By: q10 fbshipit-source-id: 893df71eddeb554395a5fd07723af4cb241e883f --- .github/scripts/fbgemm_gpu_test.bash | 16 +- .github/scripts/utils_cuda.bash | 8 +- fbgemm_gpu/CMakeLists.txt | 2 + .../permute_pooled_embedding_modules.py | 3 + fbgemm_gpu/include/fbgemm_gpu/ops_utils.h | 17 ++ .../fbgemm_gpu/permute_pooled_embedding_ops.h | 28 +++ .../permute_pooled_embedding_function.cpp | 79 ++++++ .../permute_pooled_embedding_ops.cu | 1 + .../permute_pooled_embedding_ops_cpu.cpp | 181 ++++++++++++++ .../permute_pooled_embedding_ops_gpu.cpp | 226 ------------------ .../test/permute_pooled_embedding_test.py | 33 ++- 11 files changed, 351 insertions(+), 243 deletions(-) create mode 100644 
fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 5c589dc858..0b7a99334a 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -187,14 +187,28 @@ test_setup_conda_environment () { test_fbgemm_gpu_build_and_install () { local env_name="$1" local pytorch_variant_type="$2" + if [ "$pytorch_variant_type" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTORCH_VARIANT_TYPE" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env cuda # Build and install FBGEMM_GPU for CUDA (All Steps)" + return 1 + else + echo "################################################################################" + echo "# Setup FBGEMM-GPU Build Container (All Steps)" + echo "#" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" + echo "################################################################################" + echo "" + fi # Assume we are starting from the repository root directory cd fbgemm_gpu || return 1 prepare_fbgemm_gpu_build "${env_name}" || return 1 build_fbgemm_gpu_package "${env_name}" release "${pytorch_variant_type}" || return 1 + # shellcheck disable=SC2164 cd - - install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1 + install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1 cd fbgemm_gpu/test || return 1 run_fbgemm_gpu_tests "${env_name}" || return 1 diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash index 10efbcf906..d068896e54 100644 --- a/.github/scripts/utils_cuda.bash +++ b/.github/scripts/utils_cuda.bash @@ -117,11 +117,11 @@ install_cudnn () { local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" # Get the URL - local cudnn_url="${cudnn_packages[cuda_concat_version]}" + local cudnn_url="${cudnn_packages[$cuda_concat_version]}" if [ "$cudnn_url" == "" ]; then - # Default to cuDNN for 11.7 if no CUDA version fits - echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" - cudnn_url="${cudnn_packages[117]}" + # Default to cuDNN for 11.8 if no CUDA version fits + echo "[INSTALL] Defaulting to cuDNN for CUDA 11.8" + cudnn_url="${cudnn_packages[118]}" fi # Clear the install path diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 26689a12d5..04c2f051b9 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -561,6 +561,8 @@ set(fbgemm_gpu_sources_static_cpu codegen/embedding_forward_quantized_host_cpu.cpp codegen/embedding_backward_dense_host_cpu.cpp codegen/embedding_bounds_check_host_cpu.cpp + src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp + src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp diff --git a/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py b/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py index 059db15cbf..4ce1cce9ad 100644 --- a/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py +++ b/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py @@ -16,6 +16,9 @@ # pyre-ignore[21] from fbgemm_gpu import open_source # noqa: F401 except Exception: + torch.ops.load_library( + "//deeplearning/fbgemm/fbgemm_gpu:permute_pooled_embedding_ops_cpu" + ) torch.ops.load_library( "//deeplearning/fbgemm/fbgemm_gpu:permute_pooled_embedding_ops_gpu" ) diff 
--git a/fbgemm_gpu/include/fbgemm_gpu/ops_utils.h b/fbgemm_gpu/include/fbgemm_gpu/ops_utils.h index d647a6b088..45b6e71172 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/ops_utils.h +++ b/fbgemm_gpu/include/fbgemm_gpu/ops_utils.h @@ -8,7 +8,24 @@ #pragma once +#ifdef FBGEMM_GPU_ENABLE_DUMMY_IA32_SERIALIZE +// Workaround the missing __builtin_ia32_serialize issue +#if defined(__NVCC__) && \ + (__CUDACC_VER_MAJOR__ > 11 || __CUDACC_VER_MINOR__ >= 4) +#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) +static __inline void __attribute__(( + __gnu_inline__, + __always_inline__, + __artificial__, + __target__("serialize"))) __builtin_ia32_serialize(void) { + abort(); +} +#endif +#endif // __NVCC__ +#endif // FBGEMM_GPU_ENABLE_DUMMY_IA32_SERIALIZE + #include +#include #include /* diff --git a/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h b/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h index e62c4105e4..1e08490ede 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h @@ -9,6 +9,12 @@ #pragma once #include +#include +#include "fbgemm_gpu/ops_utils.h" +#include "fbgemm_gpu/sparse_ops_utils.h" + +///@defgroup permute-pooled-embs-gpu +///@defgroup permute-pooled-embs-cpu namespace fbgemm_gpu { @@ -55,4 +61,26 @@ at::Tensor permute_pooled_embs_gpu( const at::Tensor& permute_list, const at::Tensor& inv_offset_dim_list, const at::Tensor& inv_permute_list); + +using torch::autograd::AutogradContext; +using torch::autograd::Variable; +using torch::autograd::variable_list; + +class PermutePooledEmbsFunction + : public torch::autograd::Function { + public: + static Variable forward( + AutogradContext* ctx, + const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const at::Tensor& offset_dim_list, + const at::Tensor& permute_list, + const at::Tensor& inv_offset_dim_list, + const at::Tensor& inv_permute_list, + const bool& allow_duplicates = false); + + static variable_list backward( + AutogradContext* ctx, + variable_list grad_output); +}; + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp new file mode 100644 index 0000000000..e0fa06e289 --- /dev/null +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "fbgemm_gpu/permute_pooled_embedding_ops.h" + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +using torch::autograd::AutogradContext; +using torch::autograd::Variable; +using torch::autograd::variable_list; + +Variable PermutePooledEmbsFunction::forward( + AutogradContext* ctx, + const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const Tensor& offset_dim_list, + const Tensor& permute_list, + const Tensor& inv_offset_dim_list, + const Tensor& inv_permute_list, + const bool& allow_duplicates) { + ctx->saved_data["offset_dim_list"] = offset_dim_list; + ctx->saved_data["permute_list"] = permute_list; + ctx->saved_data["inv_offset_dim_list"] = inv_offset_dim_list; + ctx->saved_data["inv_permute_list"] = inv_permute_list; + ctx->saved_data["allow_duplicates"] = allow_duplicates; + TORCH_CHECK( + offset_dim_list.scalar_type() == at::ScalarType::Long, + "offset_dim_list needs to have long/int64 type"); + TORCH_CHECK( + permute_list.scalar_type() == at::ScalarType::Long, + "permute_list needs to have long/int64 type"); + + const auto schema = allow_duplicates ? "fbgemm::permute_duplicate_pooled_embs" + : "fbgemm::permute_pooled_embs"; + const auto permute_pooled_embs_op = + torch::Dispatcher::singleton() + .findSchemaOrThrow(schema, "") + .typed(); + return permute_pooled_embs_op.call( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list); +} + +variable_list PermutePooledEmbsFunction::backward( + AutogradContext* ctx, + variable_list grad_output) { + const auto& offset_dim_list = ctx->saved_data["offset_dim_list"].toTensor(); + const auto& permute_list = ctx->saved_data["permute_list"].toTensor(); + const auto& inv_offset_dim_list = + ctx->saved_data["inv_offset_dim_list"].toTensor(); + const auto& inv_permute_list = ctx->saved_data["inv_permute_list"].toTensor(); + const auto& allow_duplicates = ctx->saved_data["allow_duplicates"].toBool(); + TORCH_CHECK( + allow_duplicates == false, + "permute_pooled_embs does not support allow_duplicates in backward!"); + variable_list grad_inputs(6); + static auto permute_pooled_embs_op = + torch::Dispatcher::singleton() + .findSchemaOrThrow("fbgemm::permute_pooled_embs", "") + .typed(); + grad_inputs[0] = permute_pooled_embs_op.call( + grad_output[0], + inv_offset_dim_list, + inv_permute_list, + offset_dim_list, + permute_list); + return grad_inputs; +} + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu index 0f4a219f6a..7331e59c20 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu @@ -12,6 +12,7 @@ #include #include #include +#include "fbgemm_gpu/ops_utils.h" #include "fbgemm_gpu/fbgemm_cuda_utils.cuh" #include "fbgemm_gpu/layout_transform_ops.cuh" diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp index 6eafbe306a..2e884f3a3c 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp @@ -5,3 +5,184 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ + +#include +#include +#include "fbgemm_gpu/permute_pooled_embedding_ops.h" + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +///@ingroup permute-pooled-embs-cpu-impl +Tensor permute_pooled_embs_cpu_impl( + const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const Tensor& offset_dim_list, + const Tensor& permute_list, + const Tensor& inv_offset_dim_list, + const Tensor& inv_permute_list, + const bool& allow_duplicates) { + TORCH_CHECK( + offset_dim_list.scalar_type() == at::ScalarType::Long, + "offset_dim_list needs to have long/int64 type") + TORCH_CHECK( + permute_list.scalar_type() == at::ScalarType::Long, + "permute_list needs to have long/int64 type") + auto permute = permute_list.data_ptr(); + const auto n = permute_list.numel(); + const auto dims_size = allow_duplicates ? offset_dim_list.numel() : n; + std::vector dims; + dims.reserve(dims_size - 1); + for (const auto i : c10::irange(1, dims_size)) { + dims.push_back(offset_dim_list[i].item()); + } + auto ts = pooled_embs.tensor_split(dims, 1); + std::vector permuted_ts; + permuted_ts.reserve(n); + for (const auto i : c10::irange(n)) { + permuted_ts.push_back(ts[permute[i]]); + } + return at::cat(permuted_ts, 1); +} + +///@ingroup permute-pooled-embs-cpu +at::Tensor permute_pooled_embs_cpu( + const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const at::Tensor& offset_dim_list, + const at::Tensor& permute_list, + const at::Tensor& inv_offset_dim_list, + const at::Tensor& inv_permute_list) { + return permute_pooled_embs_cpu_impl( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list, + false); +} + +///@ingroup permute-duplicate-pooled-embs-cpu +at::Tensor permute_duplicate_pooled_embs_cpu( + const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] + const at::Tensor& offset_dim_list, + const at::Tensor& permute_list, + const at::Tensor& inv_offset_dim_list, + const at::Tensor& inv_permute_list) { + return permute_pooled_embs_cpu_impl( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list, + true); +} + +///@ingroup permute-pooled-embs-cpu +at::Tensor permute_pooled_embs_auto_grad( + const Tensor& pooled_embs, + const Tensor& offset_dim_list, + const Tensor& permute_list, + const Tensor& inv_offset_dim_list, + const Tensor& inv_permute_list) { + return PermutePooledEmbsFunction::apply( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list, + false); +} + +///@ingroup permute-pooled-embs-cpu +at::Tensor permute_pooled_embs_auto_grad_cpu( + const Tensor& pooled_embs, + const Tensor& offset_dim_list, + const Tensor& permute_list, + const Tensor& inv_offset_dim_list, + const Tensor& inv_permute_list) { + return PermutePooledEmbsFunction::apply( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list, + false); +} + +///@ingroup permute-duplicate-pooled-embs-cpu +at::Tensor permute_duplicate_pooled_embs_auto_grad_cpu( + const Tensor& pooled_embs, + const Tensor& offset_dim_list, + const Tensor& permute_list, + const Tensor& inv_offset_dim_list, + const Tensor& inv_permute_list) { + return PermutePooledEmbsFunction::apply( + pooled_embs, + offset_dim_list, + permute_list, + inv_offset_dim_list, + inv_permute_list, + true); +} + +at::Tensor permute_pooled_embs_meta( + const Tensor& pooled_embs, + const Tensor& /* offset_dim_list */, + const Tensor& /* permute_list */, + const Tensor& /* inv_offset_dim_list */, + const Tensor& /* inv_permute_list */) { + return 
torch::empty_like(pooled_embs); +} + +at::Tensor permute_pooled_embs_auto_grad_meta( + const Tensor& pooled_embs, + const Tensor& /* offset_dim_list */, + const Tensor& /* permute_list */, + const Tensor& /* inv_offset_dim_list */, + const Tensor& /* inv_permute_list */) { + return torch::empty_like(pooled_embs); +} + +} // namespace fbgemm_gpu + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + m.def( + "permute_pooled_embs(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); + m.def( + "permute_pooled_embs_auto_grad(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); + m.def( + "permute_duplicate_pooled_embs(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); + m.def( + "permute_duplicate_pooled_embs_auto_grad(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); +} + +FBGEMM_OP_DISPATCH( + CPU, + "permute_pooled_embs", + fbgemm_gpu::permute_pooled_embs_cpu); +FBGEMM_OP_DISPATCH( + CPU, + "permute_pooled_embs_auto_grad", + fbgemm_gpu::permute_pooled_embs_auto_grad_cpu); +FBGEMM_OP_DISPATCH( + CPU, + "permute_duplicate_pooled_embs", + fbgemm_gpu::permute_duplicate_pooled_embs_cpu); +FBGEMM_OP_DISPATCH( + CPU, + "permute_duplicate_pooled_embs_auto_grad", + fbgemm_gpu::permute_duplicate_pooled_embs_auto_grad_cpu); + +FBGEMM_OP_DISPATCH( + Meta, + "permute_pooled_embs", + fbgemm_gpu::permute_pooled_embs_meta); +FBGEMM_OP_DISPATCH( + Meta, + "permute_pooled_embs_auto_grad", + fbgemm_gpu::permute_pooled_embs_auto_grad_meta); + +FBGEMM_OP_DISPATCH( + Autograd, + "permute_pooled_embs_auto_grad", + fbgemm_gpu::permute_pooled_embs_auto_grad); diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp index f0a02b9a4c..cdb89d79a3 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp @@ -13,147 +13,11 @@ #include #include "fbgemm_gpu/permute_pooled_embedding_ops.h" -#include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; -///@defgroup permute-pooled-embs-gpu -///@defgroup permute-pooled-embs-cpu - namespace fbgemm_gpu { -///@ingroup permute-pooled-embs-cpu-impl -Tensor permute_pooled_embs_cpu_impl( - const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] - const Tensor& offset_dim_list, - const Tensor& permute_list, - const Tensor& inv_offset_dim_list, - const Tensor& inv_permute_list, - const bool& allow_duplicates) { - TORCH_CHECK( - offset_dim_list.scalar_type() == at::ScalarType::Long, - "offset_dim_list needs to have long/int64 type") - TORCH_CHECK( - permute_list.scalar_type() == at::ScalarType::Long, - "permute_list needs to have long/int64 type") - auto permute = permute_list.data_ptr(); - const auto n = permute_list.numel(); - const auto dims_size = allow_duplicates ? 
offset_dim_list.numel() : n; - std::vector dims; - dims.reserve(dims_size - 1); - for (const auto i : c10::irange(1, dims_size)) { - dims.push_back(offset_dim_list[i].item()); - } - auto ts = pooled_embs.tensor_split(dims, 1); - std::vector permuted_ts; - permuted_ts.reserve(n); - for (const auto i : c10::irange(n)) { - permuted_ts.push_back(ts[permute[i]]); - } - return at::cat(permuted_ts, 1); -} - -///@ingroup permute-pooled-embs-cpu -at::Tensor permute_pooled_embs_cpu( - const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] - const at::Tensor& offset_dim_list, - const at::Tensor& permute_list, - const at::Tensor& inv_offset_dim_list, - const at::Tensor& inv_permute_list) { - return permute_pooled_embs_cpu_impl( - pooled_embs, - offset_dim_list, - permute_list, - inv_offset_dim_list, - inv_permute_list, - false); -} - -///@ingroup permute-duplicate-pooled-embs-cpu -at::Tensor permute_duplicate_pooled_embs_cpu( - const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)] - const at::Tensor& offset_dim_list, - const at::Tensor& permute_list, - const at::Tensor& inv_offset_dim_list, - const at::Tensor& inv_permute_list) { - return permute_pooled_embs_cpu_impl( - pooled_embs, - offset_dim_list, - permute_list, - inv_offset_dim_list, - inv_permute_list, - true); -} - -using torch::autograd::AutogradContext; -using torch::autograd::Variable; -using torch::autograd::variable_list; - -class PermutePooledEmbsFunction - : public torch::autograd::Function { - public: - static Variable forward( - AutogradContext* ctx, - const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] - const Tensor& offset_dim_list, - const Tensor& permute_list, - const Tensor& inv_offset_dim_list, - const Tensor& inv_permute_list, - const bool& allow_duplicates = false) { - ctx->saved_data["offset_dim_list"] = offset_dim_list; - ctx->saved_data["permute_list"] = permute_list; - ctx->saved_data["inv_offset_dim_list"] = inv_offset_dim_list; - ctx->saved_data["inv_permute_list"] = inv_permute_list; - ctx->saved_data["allow_duplicates"] = allow_duplicates; - TORCH_CHECK( - offset_dim_list.scalar_type() == at::ScalarType::Long, - "offset_dim_list needs to have long/int64 type"); - TORCH_CHECK( - permute_list.scalar_type() == at::ScalarType::Long, - "permute_list needs to have long/int64 type"); - - const auto schema = allow_duplicates - ? 
"fbgemm::permute_duplicate_pooled_embs" - : "fbgemm::permute_pooled_embs"; - const auto permute_pooled_embs_op = - torch::Dispatcher::singleton() - .findSchemaOrThrow(schema, "") - .typed(); - return permute_pooled_embs_op.call( - pooled_embs, - offset_dim_list, - permute_list, - inv_offset_dim_list, - inv_permute_list); - } - static variable_list backward( - AutogradContext* ctx, - variable_list grad_output) { - const auto& offset_dim_list = ctx->saved_data["offset_dim_list"].toTensor(); - const auto& permute_list = ctx->saved_data["permute_list"].toTensor(); - const auto& inv_offset_dim_list = - ctx->saved_data["inv_offset_dim_list"].toTensor(); - const auto& inv_permute_list = - ctx->saved_data["inv_permute_list"].toTensor(); - const auto& allow_duplicates = ctx->saved_data["allow_duplicates"].toBool(); - TORCH_CHECK( - allow_duplicates == false, - "permute_pooled_embs does not support allow_duplicates in backward!"); - variable_list grad_inputs(6); - static auto permute_pooled_embs_op = - torch::Dispatcher::singleton() - .findSchemaOrThrow("fbgemm::permute_pooled_embs", "") - .typed(); - grad_inputs[0] = permute_pooled_embs_op.call( - grad_output[0], - inv_offset_dim_list, - inv_permute_list, - offset_dim_list, - permute_list); - return grad_inputs; - } -}; - ///@ingroup permute-pooled-embs-gpu Tensor permute_pooled_embs_auto_grad_gpu( const Tensor& pooled_embs, @@ -170,56 +34,6 @@ Tensor permute_pooled_embs_auto_grad_gpu( false); } -///@ingroup permute-pooled-embs-cpu -Tensor permute_pooled_embs_auto_grad_cpu( - const Tensor& pooled_embs, - const Tensor& offset_dim_list, - const Tensor& permute_list, - const Tensor& inv_offset_dim_list, - const Tensor& inv_permute_list) { - return PermutePooledEmbsFunction::apply( - pooled_embs, - offset_dim_list, - permute_list, - inv_offset_dim_list, - inv_permute_list, - false); -} - -///@ingroup permute-pooled-embs-cpu -Tensor permute_pooled_embs_auto_grad( - const Tensor& pooled_embs, - const Tensor& offset_dim_list, - const Tensor& permute_list, - const Tensor& inv_offset_dim_list, - const Tensor& inv_permute_list) { - return PermutePooledEmbsFunction::apply( - pooled_embs, - offset_dim_list, - permute_list, - inv_offset_dim_list, - inv_permute_list, - false); -} - -Tensor permute_pooled_embs_meta( - const Tensor& pooled_embs, - const Tensor& /* offset_dim_list */, - const Tensor& /* permute_list */, - const Tensor& /* inv_offset_dim_list */, - const Tensor& /* inv_permute_list */) { - return torch::empty_like(pooled_embs); -} - -Tensor permute_pooled_embs_auto_grad_meta( - const Tensor& pooled_embs, - const Tensor& /* offset_dim_list */, - const Tensor& /* permute_list */, - const Tensor& /* inv_offset_dim_list */, - const Tensor& /* inv_permute_list */) { - return torch::empty_like(pooled_embs); -} - ///@ingroup permute-duplicate-pooled-embs-gpu Tensor permute_duplicate_pooled_embs_auto_grad_gpu( const Tensor& pooled_embs, @@ -236,57 +50,17 @@ Tensor permute_duplicate_pooled_embs_auto_grad_gpu( true); } -///@ingroup permute-duplicate-pooled-embs-cpu -Tensor permute_duplicate_pooled_embs_auto_grad_cpu( - const Tensor& pooled_embs, - const Tensor& offset_dim_list, - const Tensor& permute_list, - const Tensor& inv_offset_dim_list, - const Tensor& inv_permute_list) { - return PermutePooledEmbsFunction::apply( - pooled_embs, - offset_dim_list, - permute_list, - inv_offset_dim_list, - inv_permute_list, - true); -} } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { - m.def( - "permute_pooled_embs(Tensor pooled_embs, Tensor offset_dim_list, 
Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); DISPATCH_TO_CUDA("permute_pooled_embs", fbgemm_gpu::permute_pooled_embs_gpu); - DISPATCH_TO_CPU("permute_pooled_embs", fbgemm_gpu::permute_pooled_embs_cpu); - DISPATCH_TO_META("permute_pooled_embs", fbgemm_gpu::permute_pooled_embs_meta); - m.def( - "permute_pooled_embs_auto_grad(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); - DISPATCH_TO_AUTOGRAD( - "permute_pooled_embs_auto_grad", - fbgemm_gpu::permute_pooled_embs_auto_grad); - DISPATCH_TO_CPU( - "permute_pooled_embs_auto_grad", - fbgemm_gpu::permute_pooled_embs_auto_grad_cpu); DISPATCH_TO_CUDA( "permute_pooled_embs_auto_grad", fbgemm_gpu::permute_pooled_embs_auto_grad_gpu); - DISPATCH_TO_META( - "permute_pooled_embs_auto_grad", - fbgemm_gpu::permute_pooled_embs_auto_grad_meta); - m.def( - "permute_duplicate_pooled_embs(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); DISPATCH_TO_CUDA( "permute_duplicate_pooled_embs", fbgemm_gpu::permute_duplicate_pooled_embs_gpu); - DISPATCH_TO_CPU( - "permute_duplicate_pooled_embs", - fbgemm_gpu::permute_duplicate_pooled_embs_cpu); - m.def( - "permute_duplicate_pooled_embs_auto_grad(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); DISPATCH_TO_CUDA( "permute_duplicate_pooled_embs_auto_grad", fbgemm_gpu::permute_duplicate_pooled_embs_auto_grad_gpu); - DISPATCH_TO_CPU( - "permute_duplicate_pooled_embs_auto_grad", - fbgemm_gpu::permute_duplicate_pooled_embs_auto_grad_cpu); } diff --git a/fbgemm_gpu/test/permute_pooled_embedding_test.py b/fbgemm_gpu/test/permute_pooled_embedding_test.py index 67fef04edd..3365457c30 100644 --- a/fbgemm_gpu/test/permute_pooled_embedding_test.py +++ b/fbgemm_gpu/test/permute_pooled_embedding_test.py @@ -20,16 +20,27 @@ # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`. if getattr(fbgemm_gpu, "open_source", False): # pyre-ignore[21] - from test_utils import cpu_and_maybe_gpu, gpu_unavailable + from test_utils import cpu_and_maybe_gpu, gpu_unavailable, on_arm_platform else: - from fbgemm_gpu.test.test_utils import cpu_and_maybe_gpu, gpu_unavailable + from fbgemm_gpu.test.test_utils import ( + cpu_and_maybe_gpu, + gpu_unavailable, + on_arm_platform, + ) typed_gpu_unavailable: Tuple[bool, str] = gpu_unavailable - -if getattr(HealthCheck, "not_a_test_method", False): - suppressed_list: List[HealthCheck] = [HealthCheck.not_a_test_method] -else: - suppressed_list = [] +typed_on_arm_platform: Tuple[bool, str] = on_arm_platform + +suppressed_list: List[HealthCheck] = ( + [HealthCheck.not_a_test_method] + if getattr(HealthCheck, "not_a_test_method", False) + else [] +) + ( + # pyre-fixme[16]: Module `HealthCheck` has no attribute `differing_executors`. 
+ [HealthCheck.differing_executors] + if getattr(HealthCheck, "differing_executors", False) + else [] +) INTERN_MODULE = "fbgemm_gpu.permute_pooled_embedding_modules" FIXED_EXTERN_API = { @@ -79,7 +90,6 @@ class PooledEmbeddingModulesTest(unittest.TestCase): def setUp(self, device_type: torch.device) -> None: self.device = device_type - @unittest.skipIf(True, "Skip until FailedHealthCheck is fixed") def test_permutation(self) -> None: net = Net().to(self.device) @@ -89,7 +99,7 @@ def test_permutation(self) -> None: [6, 7, 8, 9, 0, 1, 5, 2, 3, 4], ) - @unittest.skipIf(True, "Skip until FailedHealthCheck is fixed") + @unittest.skipIf(*typed_on_arm_platform) def test_permutation_autograd(self) -> None: net = Net().to(self.device) @@ -122,7 +132,6 @@ def test_compatibility(self) -> None: f"{FWD_COMPAT_MSG}", ) - @unittest.skipIf(True, "Skip until FailedHealthCheck is fixed") def test_pooled_table_batched_embedding(self) -> None: num_emb_bags = 5 num_embeddings = 10 @@ -165,7 +174,7 @@ def test_pooled_table_batched_embedding(self) -> None: ref_permuted_pooled_emb.to(self.device), permuted_pooled_emb ) - @unittest.skipIf(True, "Skip until FailedHealthCheck is fixed") + @unittest.skipIf(*typed_on_arm_platform) def test_permutation_autograd_meta(self) -> None: """ Test that permute_pooled_embeddings_autograd works with meta tensor and @@ -180,7 +189,7 @@ def test_permutation_autograd_meta(self) -> None: assert output_meta.shape == output_cpu.shape assert input.shape == output_meta.shape - @unittest.skipIf(True, "Skip until FailedHealthCheck is fixed") + @unittest.skipIf(*typed_gpu_unavailable) def test_duplicate_permutations(self) -> None: embs_dims = [2, 3, 1, 4] permute = [3, 0, 2, 0, 1, 3] From 93bfbe98a7d1d9fc6b67f5995786c7069e7e0ee9 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 26 Sep 2023 15:46:11 -0700 Subject: [PATCH 49/94] Fix condition for ROCm jobs (#2042) Summary: - Fix the condition statement to un-skip ROCm Nova jobs Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2042 Reviewed By: spcyppt Differential Revision: D49637190 Pulled By: q10 fbshipit-source-id: 8a9611ef59e8f92eebb5e01604ad96ed0962a198 --- .github/workflows/build_wheels_linux_x86_rocm.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_wheels_linux_x86_rocm.yml b/.github/workflows/build_wheels_linux_x86_rocm.yml index 6139e9dccc..79b5e35459 100644 --- a/.github/workflows/build_wheels_linux_x86_rocm.yml +++ b/.github/workflows/build_wheels_linux_x86_rocm.yml @@ -20,7 +20,6 @@ concurrency: jobs: generate-matrix: - if: ${{ github.event_name == 'pull_request' || (inputs.trigger-event == 'push' && startsWith(github.event.ref, 'refs/heads/nightly')) }} uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main with: package-type: wheel @@ -31,6 +30,8 @@ jobs: with-rocm: enable with-cpu: disable build: + # Only build and publish to nightly channel + if: ${{ github.event_name == 'pull_request' || (github.event_name == 'push' && startsWith(github.event.ref, 'refs/heads/nightly')) }} needs: generate-matrix name: pytorch/FBGEMM uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main From b15df8d6f6bb27b86f98add8c0c9ea6c8fdb8101 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 26 Sep 2023 17:28:20 -0700 Subject: [PATCH 50/94] Update the PIP install+test workflow schedule (#2044) Summary: - Update the PIP install+test workflow to run 4 hours after the Nova pipeline is kicked off, to give ample time for the nightly wheels to be 
available in PyTorch PIP Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2044 Reviewed By: shintaro-iwasaki Differential Revision: D49664418 Pulled By: q10 fbshipit-source-id: 9346262f6c82f418f77fe4509c41ef7e4169548d --- .github/workflows/fbgemm_gpu_pip.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml index d0ab554583..82182d6562 100644 --- a/.github/workflows/fbgemm_gpu_pip.yml +++ b/.github/workflows/fbgemm_gpu_pip.yml @@ -9,10 +9,15 @@ on: # Cron Trigger (UTC) # # Based on the the nightly releases schedule in PyTorch infrastructure, the - # wheels are published to PyTorch PIP at around 11:30 UTC every day. + # wheels are published to PyTorch PIP at around 11:30 UTC every day. After + # publication, it can take up to 30 minutes for the wheels to be published, as + # the re-indexing job is scheduled to run every 30 minutes. As such, we set + # the PIP install + test workflow to be kicked off 4 hours after the publish + # job is kicked off to give ample time for the nightly wheel to be available + # in PyTorch PIP. # schedule: - - cron: '30 12 * * *' + - cron: '30 15 * * *' # Manual Trigger # From 0346155d7f15fbe8be72687e665078edbe1ca5aa Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 26 Sep 2023 20:05:48 -0700 Subject: [PATCH 51/94] More removal of const ref on SymInt in dispatcher call sites (#2041) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2041 Same deal as https://www.internalfb.com/diff/D49572882 Reviewed By: zou3519 Differential Revision: D49624991 fbshipit-source-id: feef351c7f375978a249f5139698573c0e298fdb --- .../src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp | 8 ++++---- .../src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp | 4 ++-- .../src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp | 4 ++-- .../jagged_tensor_ops/jagged_to_padded_dense_backward.cu | 2 +- .../jagged_tensor_ops/jagged_to_padded_dense_forward.cu | 2 +- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp index 45e12412c9..30060728d7 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp @@ -45,7 +45,7 @@ class JaggedToPaddedDenseOp .typed& offsets, - const at::ArrayRef& max_lengths, + at::ArrayRef max_lengths, const double padding_value)>(); Tensor padded_values = op.call(values, offsets, max_lengths, padding_value); @@ -66,7 +66,7 @@ class JaggedToPaddedDenseOp .typed& offsets, - const at::SymInt& total_L)>(); + at::SymInt total_L)>(); auto grad_values = op.call(grad_outputs[0], {offsets}, total_L); return { @@ -131,7 +131,7 @@ class JaggedDenseDenseAddJaggedOutputOp .typed& offsets, - const at::ArrayRef& max_lengths, + at::ArrayRef max_lengths, const double padding_value)>(); Tensor dense_values_grad_0 = op.call( grad_outputs[0], @@ -308,7 +308,7 @@ class DenseToJaggedOp : public torch::autograd::Function { .typed& offsets, - const at::ArrayRef& max_lengths, + at::ArrayRef max_lengths, const double padding_value)>(); auto dense_values_grad = op.call( grad_outputs[0], diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index 00f2c4f960..820173b6c5 100644 --- 
a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -371,7 +371,7 @@ void jagged_dense_elementwise_jagged_output_( at::Tensor jagged_to_padded_dense_forward( const Tensor& values, const std::vector& offsets, - const at::ArrayRef& max_lengths, + c10::SymIntArrayRef max_lengths, const double padding_value) { const size_t num_jagged_dim = offsets.size(); TORCH_CHECK( @@ -429,7 +429,7 @@ at::Tensor jagged_to_padded_dense_forward( at::Tensor jagged_to_padded_dense_backward( const Tensor& grad_output, const std::vector& offsets, - const at::SymInt& total_L) { + const at::SymInt total_L) { auto grad_padded_values = grad_output; // Canonicalize padded_values by unsqueeze the last dim if the inner dense diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp index bf6b4be2b4..b9e249cb90 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp @@ -22,7 +22,7 @@ using Tensor = at::Tensor; Tensor jagged_to_padded_dense_forward_meta( const Tensor& values, const std::vector& offsets, - const at::ArrayRef& max_lengths, + c10::SymIntArrayRef max_lengths, const double padding_value = 0) { const size_t num_jagged_dim = offsets.size(); TORCH_CHECK( @@ -53,7 +53,7 @@ Tensor jagged_to_padded_dense_meta( Tensor jagged_to_padded_dense_backward_meta( const at::Tensor& grad_output, const std::vector& offsets, - const at::SymInt& total_L) { + at::SymInt total_L) { auto grad_padded_values = grad_output; at::SymInt D = grad_padded_values.sym_size(-1); diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu b/fbgemm_gpu/src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu index 31f6266498..e460652fab 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu @@ -15,7 +15,7 @@ namespace fbgemm_gpu { at::Tensor jagged_to_padded_dense_backward( const Tensor& grad_output, const std::vector& offsets, - const at::SymInt& total_L) { + at::SymInt total_L) { auto grad_padded_values = grad_output; at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(grad_padded_values.get_device()); diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu b/fbgemm_gpu/src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu index fe95121306..8689d23939 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu @@ -21,7 +21,7 @@ namespace fbgemm_gpu { at::Tensor jagged_to_padded_dense_forward( const Tensor& values, const std::vector& offsets, - const at::ArrayRef& max_lengths, + c10::SymIntArrayRef max_lengths, const double padding_value) { const size_t num_jagged_dim = offsets.size(); TORCH_CHECK( diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 58b345367d..dc6c013138 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -65,7 +65,7 @@ class PackSegments : public torch::autograd::Function { torch::autograd::AutogradContext* ctx, const Tensor& t_in, const Tensor& lengths, - const at::SymInt& max_length) { + at::SymInt max_length) { const at::SymInt total_length = t_in.sym_size(0); at::AutoDispatchBelowADInplaceOrView guard; From 
e4ed378a7c8a81a1ba6eaed1741c01534636b74a Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Tue, 26 Sep 2023 20:39:01 -0700 Subject: [PATCH 52/94] Unify block bucketize sparse features API to use batch_size_per_feature instead of batch_sizes (#2043) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2043 Per joshuadeng request Api for block bucketize still seems to be batch_sizes in a lot of places instead of batch_size_per_feature https://fburl.com/code/sinbxle2 Unify it to be all batch_size_per_feature Reviewed By: joshuadeng Differential Revision: D49646111 fbshipit-source-id: 8834c686926ea575b507a5598c3b75bf90d624c7 --- .../sparse_block_bucketize_features.cu | 33 ++++++++++--------- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 18 +++++----- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu index 9fd8d80dc5..d2f281fd49 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu @@ -19,7 +19,7 @@ __global__ __launch_bounds__(kMaxThreads) void _populate_length_to_feature_id_inplace_kernel( const uint64_t max_B, const int T, - const offset_t* const __restrict__ batch_sizes, + const offset_t* const __restrict__ batch_size_per_feature, const offset_t* const __restrict__ batch_size_offsets, offset_t* const __restrict__ length_to_feature_idx) { const auto b_t = blockIdx.x * blockDim.x + threadIdx.x; @@ -27,7 +27,7 @@ __launch_bounds__(kMaxThreads) void _populate_length_to_feature_id_inplace_kerne const auto t = b_t / max_B; const auto b = b_t % max_B; - if (t >= T || b >= batch_sizes[t]) { + if (t >= T || b >= batch_size_per_feature[t]) { return; } @@ -146,7 +146,7 @@ block_bucketize_sparse_features_cuda( const Tensor& block_sizes, const int64_t my_size, const c10::optional& weights, - const c10::optional& batch_sizes, + const c10::optional& batch_size_per_feature, const int64_t max_B) { TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(lengths, indices); @@ -165,7 +165,8 @@ block_bucketize_sparse_features_cuda( auto indices_contig = indices.contiguous(); auto offsets_contig = offsets.contiguous(); auto batch_sizes_contig = - batch_sizes.value_or(at::empty({T}, lengths.options())).contiguous(); + batch_size_per_feature.value_or(at::empty({T}, lengths.options())) + .contiguous(); auto batch_sizes_offsets_contig = at::empty({T}, batch_sizes_contig.options()); Tensor new_weights; @@ -173,14 +174,14 @@ block_bucketize_sparse_features_cuda( Tensor unbucketize_permute; // count nonzeros offsets_contig = asynchronous_inclusive_cumsum_gpu(lengths); - if (batch_sizes.has_value()) { + if (batch_size_per_feature.has_value()) { TORCH_CHECK(max_B > 0); batch_sizes_offsets_contig = - asynchronous_exclusive_cumsum_gpu(batch_sizes.value()); + asynchronous_exclusive_cumsum_gpu(batch_size_per_feature.value()); } auto length_to_feature_idx = at::empty({lengths_size}, lengths_contig.options()); - if (batch_sizes.has_value()) { + if (batch_size_per_feature.has_value()) { constexpr auto threads_per_block = 256; const auto num_blocks = cuda_calc_xblock_count(max_B * T, threads_per_block); @@ -227,7 +228,7 @@ block_bucketize_sparse_features_cuda( offsets_contig.data_ptr(), indices_contig.data_ptr(), new_lengths.data_ptr(), - batch_sizes.has_value() + batch_size_per_feature.has_value() ? 
length_to_feature_idx.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -280,7 +281,7 @@ block_bucketize_sparse_features_cuda( new_weights.data_ptr(), new_pos.data_ptr(), unbucketize_permute.data_ptr(), - batch_sizes.has_value() + batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -327,7 +328,7 @@ block_bucketize_sparse_features_cuda( new_weights.data_ptr(), nullptr, unbucketize_permute.data_ptr(), - batch_sizes.has_value() + batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -368,7 +369,7 @@ block_bucketize_sparse_features_cuda( nullptr, new_pos.data_ptr(), unbucketize_permute.data_ptr(), - batch_sizes.has_value() + batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -407,7 +408,7 @@ block_bucketize_sparse_features_cuda( nullptr, nullptr, unbucketize_permute.data_ptr(), - batch_sizes.has_value() + batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -456,7 +457,7 @@ block_bucketize_sparse_features_cuda( new_weights.data_ptr(), new_pos.data_ptr(), nullptr, - batch_sizes.has_value() + batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -503,7 +504,7 @@ block_bucketize_sparse_features_cuda( new_weights.data_ptr(), nullptr, nullptr, - batch_sizes.has_value() + batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -544,7 +545,7 @@ block_bucketize_sparse_features_cuda( nullptr, new_pos.data_ptr(), nullptr, - batch_sizes.has_value() + batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -583,7 +584,7 @@ block_bucketize_sparse_features_cuda( nullptr, nullptr, nullptr, - batch_sizes.has_value() + batch_size_per_feature.has_value() ? 
length_to_feature_idx.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index dc6c013138..274df50962 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -282,7 +282,7 @@ void _block_bucketize_sparse_features_cpu( c10::optional new_weights, c10::optional new_pos, const c10::optional& unbucketize_permute, - const c10::optional& batch_sizes) { + const c10::optional& batch_size_per_feature) { // allocate tensors and buffers const auto lengths_size = lengths.numel(); const auto new_lengths_size = lengths_size * my_size; @@ -302,7 +302,7 @@ void _block_bucketize_sparse_features_cpu( index_t* const new_indices_data = new_indices.data_ptr(); const index_t* const block_sizes_data = block_sizes.data_ptr(); offset_t* batch_sizes_data = nullptr; - const auto variable_batch_size = batch_sizes.has_value(); + const auto variable_batch_size = batch_size_per_feature.has_value(); using uindex_t = std::make_unsigned_t; using uoffset_t = std::make_unsigned_t; @@ -319,7 +319,7 @@ void _block_bucketize_sparse_features_cpu( } if (variable_batch_size) { - batch_sizes_data = batch_sizes.value().data_ptr(); + batch_sizes_data = batch_size_per_feature.value().data_ptr(); } // count nonzeros @@ -908,7 +908,7 @@ block_bucketize_sparse_features_cpu( const Tensor& block_sizes, const int64_t my_size, const c10::optional& weights, - const c10::optional& batch_sizes, + const c10::optional& batch_size_per_feature, const int64_t /* max_batch_size */ // Only used in GPU variant ) { const auto lengths_size = lengths.numel(); @@ -957,7 +957,7 @@ block_bucketize_sparse_features_cpu( new_weights, new_pos, unbucketize_permute, - batch_sizes); + batch_size_per_feature); }); }); }); @@ -992,7 +992,7 @@ block_bucketize_sparse_features_cpu( new_weights, new_pos, unbucketize_permute, - batch_sizes); + batch_size_per_feature); }); }); }); @@ -1025,7 +1025,7 @@ block_bucketize_sparse_features_cpu( new_weights, new_pos, unbucketize_permute, - batch_sizes); + batch_size_per_feature); }); }); } else { @@ -1053,7 +1053,7 @@ block_bucketize_sparse_features_cpu( new_weights, new_pos, unbucketize_permute, - batch_sizes); + batch_size_per_feature); }); }); } @@ -2692,7 +2692,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, int output_size) -> Tensor"); m.def( - "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, int my_size, Tensor? weights=None, Tensor? batch_sizes=None, int max_B= -1) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)"); + "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, int my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, int max_B= -1) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)"); m.def( "bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, int my_size, Tensor? 
weights=None) -> (Tensor, Tensor, Tensor?, Tensor?)"); m.def("asynchronous_exclusive_cumsum(Tensor t_in) -> Tensor"); From 1595e2abf1f0ec1c0c4094bba9bdbd2750c34980 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 27 Sep 2023 21:43:50 -0700 Subject: [PATCH 53/94] Fix package variant versioning (#2045) Summary: - Fix package variant versioning for ROCm Nova builds Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2045 Reviewed By: sryap Differential Revision: D49716498 Pulled By: q10 fbshipit-source-id: ff2e9f98c2acfe79fee2fb7af703e8c21587164d --- .github/scripts/fbgemm_gpu_build.bash | 9 +- fbgemm_gpu/docs/BuildInstructions.md | 9 +- fbgemm_gpu/setup.py | 177 ++++++++++++++++---------- 3 files changed, 126 insertions(+), 69 deletions(-) diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index b783651d8c..269641cef8 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -58,7 +58,9 @@ prepare_fbgemm_gpu_build () { __configure_fbgemm_gpu_build_cpu () { # Update the package name and build args depending on if CUDA is specified echo "[BUILD] Setting CPU-only build args ..." - build_args=(--cpu_only) + build_args=( + --package_variant=cpu + ) } __configure_fbgemm_gpu_build_rocm () { @@ -88,7 +90,9 @@ __configure_fbgemm_gpu_build_rocm () { print_exec conda env config vars set ${env_prefix} PYTORCH_ROCM_ARCH="${arch_list}" echo "[BUILD] Setting ROCm build args ..." - build_args=() + build_args=( + --package_variant=rocm + ) } __configure_fbgemm_gpu_build_cuda () { @@ -133,6 +137,7 @@ __configure_fbgemm_gpu_build_cuda () { # shellcheck disable=SC2155,SC2086 local nvml_lib_path=$(conda run --no-capture-output ${env_prefix} printenv NVML_LIB_PATH) build_args=( + --package_variant=cuda --nvml_lib_path="${nvml_lib_path}" -DTORCH_CUDA_ARCH_LIST="'${arch_list}'" ) diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md index e67c5a60f2..04090a1b78 100644 --- a/fbgemm_gpu/docs/BuildInstructions.md +++ b/fbgemm_gpu/docs/BuildInstructions.md @@ -354,6 +354,7 @@ unset TORCH_CUDA_ARCH_LIST # Build the wheel artifact only python setup.py bdist_wheel \ --package_name="${package_name}" \ + --package_variant=cuda \ --python-tag="${python_tag}" \ --plat-name="manylinux1_${ARCH}" \ --nvml_lib_path=${NVML_LIB_PATH} \ @@ -361,6 +362,7 @@ python setup.py bdist_wheel \ # Build and install the library into the Conda environment python setup.py install \ + --package_variant=cuda \ --nvml_lib_path=${NVML_LIB_PATH} \ -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}" ``` @@ -384,11 +386,12 @@ package_name=fbgemm_gpu_rocm # Build the wheel artifact only python setup.py bdist_wheel \ --package_name="${package_name}" \ + --package_variant=rocm \ --python-tag="${python_tag}" \ --plat-name="manylinux1_${ARCH}" # Build and install the library into the Conda environment -python setup.py install develop +python setup.py install develop --package_variant=rocm ``` ### CPU-Only Build @@ -406,12 +409,12 @@ package_name=fbgemm_gpu_cpu # Build the wheel artifact only python setup.py bdist_wheel \ --package_name="${package_name}" \ + --package_variant=cpu \ --python-tag="${python_tag}" \ --plat-name="manylinux1_${ARCH}" \ - --cpu_only # Build and install the library into the Conda environment -python setup.py install --cpu_only +python setup.py install --package_variant=cpu ``` ### Post-Build Checks (For Developers) diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index dfe98d384f..161890a7fc 100644 --- 
a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -24,47 +24,20 @@ from tabulate import tabulate -def generate_package_version(package_name: str, version_variant: str): - print("[SETUP.PY] Generating the package version ...") - - if "nightly" in package_name: - # Use date stamp for nightly versions - print("[SETUP.PY] Package is for NIGHTLY; using timestamp for the versioning") - today = date.today() - version = f"{today.year}.{today.month}.{today.day}" - - elif "test" in package_name: - # Use date stamp for nightly versions - print("[SETUP.PY] Package is for TEST: using random number for the versioning") - version = (f"0.0.{random.randint(0, 1000)}",) - - else: - # Use git tag / branch / commit info to generate a PEP-440-compliant version string - print("[SETUP.PY] Package is for RELEASE: using git info for the versioning") - print( - f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}" - ) - # Remove the local version identifier, if any (e.g. 0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0) - # Then remove post0 (keep postN for N > 0) (e.g. 0.4.0rc0.post0 => 0.4.0rc0) - version = re.sub(".post0$", "", gitversion.version_from_git().split("+")[0]) - version = str(version) + version_variant - print(f"[SETUP.PY] Setting the package version: {version}") - return version - - def parse_args(argv: List[str]) -> argparse.Namespace: parser = argparse.ArgumentParser(description="fbgemm_gpu setup") parser.add_argument( - "--cpu_only", - dest="cpu_only", - action="store_true", - help="build for cpu-only (no GPU support)", + "--package_variant", + type=str, + choices=["cpu", "cuda", "rocm"], + default="cpu", + help="The FBGEMM_GPU variant to build.", ) parser.add_argument( "--package_name", type=str, default="fbgemm_gpu", - help="the name of this output wheel", + help="The candidate name of the output wheel.", ) parser.add_argument( "--nvml_lib_path", @@ -171,7 +144,7 @@ def _get_cxx11_abi(): os.environ["CMAKE_BUILD_PARALLEL_LEVEL"] = str(os.cpu_count() // 2) cmake_args = [f"-DCMAKE_PREFIX_PATH={torch_root}", _get_cxx11_abi()] - if args.cpu_only: + if args.package_variant == "cpu": cmake_args.append("-DFBGEMM_CPU_ONLY=ON") if args.nvml_lib_path: cmake_args.append(f"-DNVML_LIB_PATH={args.nvml_lib_path}") @@ -181,6 +154,100 @@ def _get_cxx11_abi(): class FbgemmGpuInstaller(PipInstall): """FBGEMM_GPU PIP Installer""" + @classmethod + def extract_package_name(cls, args_package_name: str) -> str: + package_name: str = "" + + if "BUILD_FROM_NOVA" in os.environ: + nova_flag = os.getenv("BUILD_FROM_NOVA") + print(f"[SETUP.PY] BUILD_FROM_NOVA={nova_flag}") + + # The package name is the same for all build variants in Nova + package_name = "fbgemm_gpu" + + if str(nova_flag) != "0": + # Skip build clean and build wheel steps in Nova workflow since + # they are done in pre-script + print("[SETUP.PY] Build from Nova detected... exiting.") + sys.exit(0) + + else: + package_name = args_package_name + + print(f"[SETUP.PY] Extracted the package name: '{package_name}'") + return package_name + + @classmethod + def extract_variant_version(cls, variant: str) -> str: + variant_version: str = "" + + if variant == "cpu": + variant_version = "+cpu" + elif variant == "cuda": + set_cuda_environment_variables() + if torch.version.cuda is not None: + cuda_version = torch.version.cuda.split(".") + variant_version = f"+cu{cuda_version[0]}{cuda_version[1]}" + else: + sys.exit( + "[SETUP.PY] Installed PyTorch variant is not CUDA; cannot determine the CUDA version!" 
+ ) + elif variant == "rocm": + if torch.version.hip is not None: + rocm_version = torch.version.hip.split(".") + variant_version = f"+rocm{rocm_version[0]}.{rocm_version[1]}" + else: + sys.exit( + "[SETUP.PY] Installed PyTorch variant is not ROCm; cannot determine the ROCm version!" + ) + else: + sys.exit( + f"[SETUP.PY] Unrecognized build variant variant '{variant}'; cannot proceed with FBGEMM_GPU build!" + ) + + if "BUILD_FROM_NOVA" not in os.environ: + # If not building from Nova, use the fbgemm_gpu- + # PyPI does not accept version+xx in the name convention. + print("[SETUP.PY] Not building FBGEMM_GPU from Nova.") + variant_version = "" + + print(f"[SETUP.PY] Extracted the package variant+version: '{variant_version}'") + return variant_version + + @classmethod + def generate_package_version(cls, package_name: str, variant_version: str): + print("[SETUP.PY] Generating the package version ...") + + if "nightly" in package_name: + # Use date stamp for nightly versions + print( + "[SETUP.PY] Package is for NIGHTLY; using timestamp for the versioning" + ) + today = date.today() + version = f"{today.year}.{today.month}.{today.day}" + + elif "test" in package_name and "BUILD_FROM_NOVA" not in os.environ: + # Use random numbering for test versions + print( + "[SETUP.PY] Package is for TEST: using random number for the versioning" + ) + version = (f"0.0.{random.randint(0, 1000)}",) + + else: + # Use git tag / branch / commit info to generate a PEP-440-compliant version string + print( + "[SETUP.PY] Package is for RELEASE: using git info for the versioning" + ) + print( + f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}" + ) + # Remove the local version identifier, if any (e.g. 0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0) + # Then remove post0 (keep postN for N > 0) (e.g. 0.4.0rc0.post0 => 0.4.0rc0) + version = re.sub(".post0$", "", gitversion.version_from_git().split("+")[0]) + version = str(version) + variant_version + print(f"[SETUP.PY] Setting the full package version string: {version}") + return version + @classmethod def generate_version_file(cls, package_version: str) -> None: with open("fbgemm_gpu/_fbgemm_gpu_version.py", "w") as file: @@ -262,41 +329,23 @@ def run(self): def main(argv: List[str]) -> None: # Handle command line args before passing to main setup() method. args, unknown = parse_args(argv) - print("Parsed Arguments: ", args) + print(f"[SETUP.PY] Parsed Arguments: {args}") if len(unknown) != 0 and (len(unknown) != 1 or unknown[0] != "clean"): - print("Unknown Arguments: ", unknown) + print(f"[SETUP.PY] Unknown Arguments: {unknown}") - if args.cpu_only: - version_variant = "+cpu" - else: - set_cuda_environment_variables() - if torch.version.cuda is not None: - cuda_version = torch.version.cuda.split(".") - version_variant = "+cu" + str(cuda_version[0]) + str(cuda_version[1]) - else: - # rocm or other gpus - to be specified if we offcially support them - version_variant = "" - - # Skip Nova build steps since it will be done in pre-script - if "BUILD_FROM_NOVA" in os.environ: - build_from_nova = os.getenv("BUILD_FROM_NOVA") - print("build_from_nova", build_from_nova) - # Package name is the same for all variants in Nova - package_name = "fbgemm_gpu" - if str(build_from_nova) != "0": - # Skip build clean and build wheel steps in Nova workflow since they are done in pre-script - print("Build from Nova detected... 
exiting") - sys.exit(0) - else: - # If not building from Nova, use the fbgemm_gpu- - # PyPi does not accept version+xx in the name convention. - version_variant = "" - package_name = args.package_name # Repair command line args for setup. sys.argv = [sys.argv[0]] + unknown - # Determine the package version - package_version = generate_package_version(args.package_name, version_variant) + # Extract the package name + package_name = FbgemmGpuInstaller.extract_package_name(args.package_name) + + # Extract the variant version, e.g. cpu, cu121, rocm5.6 + variant_version = FbgemmGpuInstaller.extract_variant_version(args.package_variant) + + # Generate the full package version string + package_version = FbgemmGpuInstaller.generate_package_version( + package_name, variant_version + ) # Generate the version file FbgemmGpuInstaller.generate_version_file(package_version) From 7c9e21d5b7e75358288fc44eaa434cd9c2a050c4 Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Thu, 28 Sep 2023 11:56:14 -0700 Subject: [PATCH 54/94] Fix package version logic (#2048) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2048 - Fix package version logic - Change default package variant to cuda - Fix rocm variants for testing (rocm versions on Nova are 5.6 and 5.7) Reviewed By: sryap Differential Revision: D49740927 fbshipit-source-id: 870812fd46dbd3e5e2765540b94fb45951811a8f --- .github/workflows/fbgemm_gpu_pip.yml | 2 +- fbgemm_gpu/setup.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml index 82182d6562..1b5518e506 100644 --- a/.github/workflows/fbgemm_gpu_pip.yml +++ b/.github/workflows/fbgemm_gpu_pip.yml @@ -172,7 +172,7 @@ jobs: ] # ROCm machines are limited, so we only test against Python 3.10 python-version: [ "3.10" ] - rocm-version: [ "5.5.1", "5.6" ] + rocm-version: [ "5.6", "5.7" ] steps: - name: Setup Build Container diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index 161890a7fc..097b370526 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -30,7 +30,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace: "--package_variant", type=str, choices=["cpu", "cuda", "rocm"], - default="cpu", + default="cuda", help="The FBGEMM_GPU variant to build.", ) parser.add_argument( @@ -344,7 +344,7 @@ def main(argv: List[str]) -> None: # Generate the full package version string package_version = FbgemmGpuInstaller.generate_package_version( - package_name, variant_version + args.package_name, variant_version ) # Generate the version file From 09d4f16fa0f2d0b5b6c346875bdddac0ffff4196 Mon Sep 17 00:00:00 2001 From: Jia Chen Date: Thu, 28 Sep 2023 15:35:59 -0700 Subject: [PATCH 55/94] Upgrade & suppress type errors for `deeplearning/fbgemm/fbgemm_gpu` (#2047) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2047 cyberman_upgrade Reviewed By: kinto0 Differential Revision: D49715389 fbshipit-source-id: 6c2d9167df647fc28edfcacb9d84ef6176cf21fc --- fbgemm_gpu/bench/bench_utils.py | 2 ++ fbgemm_gpu/bench/sparse_ops_benchmark.py | 2 ++ .../bench/split_table_batched_embeddings_benchmark.py | 1 + fbgemm_gpu/fbgemm_gpu/enums.py | 3 +-- .../split_table_batched_embeddings_ops_training.py | 6 ++++++ fbgemm_gpu/test/jagged_tensor_ops_test.py | 7 ++++++- fbgemm_gpu/test/layout_transform_ops_test.py | 4 ++++ fbgemm_gpu/test/split_table_batched_embeddings_test.py | 6 +++++- fbgemm_gpu/test/ssd_split_table_batched_embeddings_test.py | 6 +++++- 9 files changed, 32 
insertions(+), 5 deletions(-) diff --git a/fbgemm_gpu/bench/bench_utils.py b/fbgemm_gpu/bench/bench_utils.py index 8ee8f8a2a4..b174c32fcd 100644 --- a/fbgemm_gpu/bench/bench_utils.py +++ b/fbgemm_gpu/bench/bench_utils.py @@ -195,6 +195,7 @@ def benchmark_requests( if torch.cuda.is_available(): end_event.record() torch.cuda.synchronize() + # pyre-fixme[61]: `end_event` is undefined, or not always defined. it_time = start_event.elapsed_time(end_event) * 1.0e-3 times.append(it_time) else: @@ -285,6 +286,7 @@ def benchmark_requests_refer( if torch.cuda.is_available(): end_event.record() torch.cuda.synchronize() + # pyre-fixme[61]: `end_event` is undefined, or not always defined. it_time = start_event.elapsed_time(end_event) * 1.0e-3 times.append(it_time) else: diff --git a/fbgemm_gpu/bench/sparse_ops_benchmark.py b/fbgemm_gpu/bench/sparse_ops_benchmark.py index 7f647c790e..bf0ad96c1e 100644 --- a/fbgemm_gpu/bench/sparse_ops_benchmark.py +++ b/fbgemm_gpu/bench/sparse_ops_benchmark.py @@ -104,6 +104,7 @@ def gen_inverse_index(curr_size: int, final_size: int) -> np.array: else: raise RuntimeError(f"Does not support data type {input_precision}") + # pyre-fixme[16]: Module `cuda` has no attribute `IntTensor`. indices = torch.cuda.IntTensor(gen_inverse_index(unique_batch_size, batch_size)) input = torch.rand(unique_batch_size, row_size, dtype=dtype, device="cuda") @@ -264,6 +265,7 @@ def gen_inverse_index(curr_size: int, final_size: int) -> np.array: offset_indices_group = [] indices_group = [] for i in range(num_groups): + # pyre-fixme[16]: Module `cuda` has no attribute `IntTensor`. indices = torch.cuda.IntTensor(gen_inverse_index(unique_batch_size, batch_size)) if sort_indices: indices, _ = indices.sort() diff --git a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py index 277809733c..84a87c96fb 100644 --- a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py @@ -2843,6 +2843,7 @@ def device_with_spec( # noqa C901 else: # Obtain B * L from indices len # pyre-ignore[19] + # pyre-fixme[61]: `D` is undefined, or not always defined. grad_output = torch.randn(requests[0][0].numel(), D).to(get_device()) # backward time_per_iter = benchmark_requests( diff --git a/fbgemm_gpu/fbgemm_gpu/enums.py b/fbgemm_gpu/fbgemm_gpu/enums.py index 8942fa8486..9ea090f919 100644 --- a/fbgemm_gpu/fbgemm_gpu/enums.py +++ b/fbgemm_gpu/fbgemm_gpu/enums.py @@ -17,8 +17,7 @@ def create_enums( ) -> None: for enum_name, items in query_op(): # Create matching python enumeration - # pyre-fixme[6]: For 2nd argument expected `None` but got `List[Tuple[str, - # int]]`. + # pyre-fixme[19]: Expected 1 positional argument. new_enum = enum.Enum(enum_name, items) # and store it in the module namespace[enum_name] = new_enum diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py index bedf38f5c8..6b3b540ea4 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py @@ -350,6 +350,8 @@ def __init__( # noqa C901 self.embedding_specs = embedding_specs (rows, dims, locations, compute_devices) = zip(*embedding_specs) T_ = len(self.embedding_specs) + # pyre-fixme[8]: Attribute has type `List[int]`; used as + # `Tuple[Union[ComputeDevice, EmbeddingLocation, int]]`. 
self.dims: List[int] = dims assert T_ > 0 # mixed D is not supported by no bag kernels @@ -699,6 +701,10 @@ def __init__( # noqa C901 persistent=False, ) + # pyre-fixme[6]: For 1st argument expected `List[int]` but got + # `Tuple[Union[ComputeDevice, EmbeddingLocation, int]]`. + # pyre-fixme[6]: For 2nd argument expected `List[EmbeddingLocation]` but got + # `Tuple[Union[ComputeDevice, EmbeddingLocation, int]]`. cache_state = construct_cache_state(rows, locations, self.feature_table_map) # Add table-wise cache miss counter diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index 1e6b43c62a..99efbd37fa 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -645,7 +645,9 @@ def _to_padded_dense( cur_offset = i is_zero = False for d in range(len(max_lengths)): + # pyre-fixme[6]: For 1st argument expected `Union[None, _NestedSe... begin = offsets[d][cur_offset].item() + # pyre-fixme[6]: For 1st argument expected `Union[None, _NestedSe... end = offsets[d][cur_offset + 1].item() # pyre-fixme[6]: For 1st param expected `int` but got # `Union[bool, float, int]`. @@ -654,7 +656,10 @@ def _to_padded_dense( break cur_offset = begin + jagged_coord[d] dense[(i,) + jagged_coord] = ( - padding_value if is_zero else values[cur_offset] + padding_value + if is_zero + # pyre-fixme[6]: For 1st argument expected `Union[None, _NestedSe... + else values[cur_offset] ) return dense.squeeze(-1) if values.ndim == 1 else dense diff --git a/fbgemm_gpu/test/layout_transform_ops_test.py b/fbgemm_gpu/test/layout_transform_ops_test.py index 37bd533774..4bfc603d65 100644 --- a/fbgemm_gpu/test/layout_transform_ops_test.py +++ b/fbgemm_gpu/test/layout_transform_ops_test.py @@ -122,7 +122,9 @@ def test_recat_embedding_grad_output_mixed_D_batch(self, B: int, W: int) -> None ) for i in range(W) ] + # pyre-fixme[16]: Module `cuda` has no attribute `LongTensor`. dim_sum_per_rank_tensor = torch.cuda.LongTensor(dim_sum_per_rank) + # pyre-fixme[16]: Module `cuda` has no attribute `LongTensor`. cumsum_dim_sum_per_rank_tensor = torch.cuda.LongTensor( np.cumsum([0] + dim_sum_per_rank)[:-1] ) @@ -160,7 +162,9 @@ def test_recat_embedding_grad_output_mixed_D_batch(self, B: int, W: int) -> None ) for i in range(W) ] + # pyre-fixme[16]: Module `cuda` has no attribute `LongTensor`. dim_sum_per_rank_tensor = torch.cuda.LongTensor(dim_sum_per_rank) + # pyre-fixme[16]: Module `cuda` has no attribute `LongTensor`. cumsum_dim_sum_per_rank_tensor = torch.cuda.LongTensor( np.cumsum([0] + dim_sum_per_rank)[:-1] ) diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index e022d1ae26..62f98e64bb 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -4419,7 +4419,11 @@ def execute_nbit_forward_( # noqa C901 shifts = np.random.uniform(-2, 2, size=(E,)).astype(np.float16) scale_shift[:, :] = torch.tensor( - np.stack([scales, shifts], axis=1).astype(np.float16).view(np.uint8) + # pyre-fixme[61]: `scales` is undefined, or not always defined. + # pyre-fixme[61]: `shifts` is undefined, or not always defined. 
+ np.stack([scales, shifts], axis=1) + .astype(np.float16) + .view(np.uint8) ) fake_quantize_embs( diff --git a/fbgemm_gpu/test/ssd_split_table_batched_embeddings_test.py b/fbgemm_gpu/test/ssd_split_table_batched_embeddings_test.py index fe0b607141..b4b03d740d 100644 --- a/fbgemm_gpu/test/ssd_split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/ssd_split_table_batched_embeddings_test.py @@ -654,7 +654,11 @@ def test_nbit_ssd_cache( shifts = np.random.uniform(-2, 2, size=(E,)).astype(np.float16) scale_shift[:, :] = torch.tensor( - np.stack([scales, shifts], axis=1).astype(np.float16).view(np.uint8) + # pyre-fixme[61]: `scales` is undefined, or not always defined. + # pyre-fixme[61]: `shifts` is undefined, or not always defined. + np.stack([scales, shifts], axis=1) + .astype(np.float16) + .view(np.uint8) ) D_bytes = rounded_row_size_in_bytes( From 32c969f98851192ad5439113b17540421ae5f650 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Thu, 28 Sep 2023 18:07:50 -0700 Subject: [PATCH 56/94] Remove FP64 from TBE CPU tests (#2049) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2049 FP64 is no longer supported in TBE CPU since D48895311 Reviewed By: spcyppt Differential Revision: D49746255 fbshipit-source-id: 4976cda0233b1d276b5ea628eff4cb9075943aff --- .../split_table_batched_embeddings_test.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index 62f98e64bb..7c9afcb506 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -1453,10 +1453,6 @@ def test_backward_dense( # noqa C901 ) per_sample_weights = to_device(xw.contiguous().view(-1), use_cpu) - if use_cpu: - # NOTE: GPU version of DenseTableBatchedEmbeddingBagsCodegen doesn't support double. - cc = cc.double() - per_sample_weights = per_sample_weights.double() per_sample_weights.requires_grad = True indices.requires_grad = False offsets.requires_grad = False @@ -1494,13 +1490,8 @@ def test_backward_dense( # noqa C901 ) per_sample_weights = to_device(xw.contiguous().view(-1), use_cpu) - if use_cpu: - # NOTE: GPU version of DenseTableBatchedEmbeddingBagsCodegen doesn't support double. - cc = cc.double() - per_sample_weights = per_sample_weights.double() - else: - cc = cc.float() - per_sample_weights = per_sample_weights.float() + cc = cc.float() + per_sample_weights = per_sample_weights.float() per_sample_weights.requires_grad = True indices.requires_grad = False offsets.requires_grad = False @@ -2531,10 +2522,6 @@ def execute_backward_adagrad_( # noqa C901 output_dtype=output_dtype, ) per_sample_weights = to_device(xw.contiguous().view(-1), use_cpu) - if use_cpu: - # NOTE: GPU version of SplitTableBatchedEmbeddingBagsCodegen doesn't support double. 
- cc = cc.double() - per_sample_weights = per_sample_weights.double() per_sample_weights.requires_grad = True indices.requires_grad = False offsets.requires_grad = False @@ -2552,8 +2539,6 @@ def execute_backward_adagrad_( # noqa C901 ) per_sample_weights = to_device(xw.contiguous().view(-1), use_cpu) - if use_cpu: - per_sample_weights = per_sample_weights.double() per_sample_weights.requires_grad = True indices.requires_grad = False offsets.requires_grad = False From d1b8766bcc1e50a7f676d132e78573dbb6400bfd Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 28 Sep 2023 20:55:42 -0700 Subject: [PATCH 57/94] Fix publication of ROCm wheels (#2052) Summary: Move Nova ROCm jobs into the same workflow file to prevent incorrect publication of ROCm wheels Separating build_wheels_linux.yml for rocm causes rocm wheels to be uploaded into the wrong folder on pytorch.org. On Nova, when publishing with one architecture, it pushes wheels to nightly/ rather than a specific architecure. pip fetches that nightly regardless of the underlying architecture. When publishing with multiple architectures, it uploads to the architecture-specific folder under nightly (e.g., cpu, cu118, rocm5.5). Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2052 Reviewed By: spcyppt Differential Revision: D49760071 Pulled By: q10 fbshipit-source-id: 34abb06cdcd0a294ff7a5937fdb966b204c3b717 --- .github/workflows/build_wheels_linux_x86.yml | 2 +- .../workflows/build_wheels_linux_x86_rocm.yml | 52 ------------------- 2 files changed, 1 insertion(+), 53 deletions(-) delete mode 100644 .github/workflows/build_wheels_linux_x86_rocm.yml diff --git a/.github/workflows/build_wheels_linux_x86.yml b/.github/workflows/build_wheels_linux_x86.yml index 76b7db4ab9..f2d962a75f 100644 --- a/.github/workflows/build_wheels_linux_x86.yml +++ b/.github/workflows/build_wheels_linux_x86.yml @@ -27,7 +27,7 @@ jobs: test-infra-repository: pytorch/test-infra test-infra-ref: main with-cuda: enable - with-rocm: disable + with-rocm: enable with-cpu: enable build: needs: generate-matrix diff --git a/.github/workflows/build_wheels_linux_x86_rocm.yml b/.github/workflows/build_wheels_linux_x86_rocm.yml deleted file mode 100644 index 79b5e35459..0000000000 --- a/.github/workflows/build_wheels_linux_x86_rocm.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: Build x86 Linux Wheels (ROCm) - -on: - pull_request: - push: - branches: - - nightly - - main - # Release candidate branch look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-release+ - tags: - # Release candidate tag look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - v[0-9]+.[0-9]+.[0-9]+ - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - generate-matrix: - uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main - with: - package-type: wheel - os: linux - test-infra-repository: pytorch/test-infra - test-infra-ref: main - with-cuda: disable - with-rocm: enable - with-cpu: disable - build: - # Only build and publish to nightly channel - if: ${{ github.event_name == 'pull_request' || (github.event_name == 'push' && startsWith(github.event.ref, 'refs/heads/nightly')) }} - needs: generate-matrix - name: pytorch/FBGEMM - uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main - with: - repository: pytorch/FBGEMM - ref: "" - pre-script: ../.github/scripts/nova_prescript.bash - post-script: ../.github/scripts/nova_postscript.bash - smoke-test-script: "" - 
env-var-script: .github/scripts/nova_dir.bash - package-name: fbgemm_gpu - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.generate-matrix.outputs.matrix }} - trigger-event: ${{ github.event_name }} - secrets: - AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} From 49f5794cf68b85ba601bbbccce1989f1a562554e Mon Sep 17 00:00:00 2001 From: Wei Su Date: Fri, 29 Sep 2023 11:57:02 -0700 Subject: [PATCH 58/94] Fill embedding tables with randomized scales and bias in split-TBE benchmarks (#2031) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2031 Same as title Reviewed By: sryap Differential Revision: D49433995 fbshipit-source-id: 8f7cc876a4284aabe36374d8e95ff2fa043e5ebe --- fbgemm_gpu/bench/bench_utils.py | 35 +++++++++++++++++++ ...plit_table_batched_embeddings_benchmark.py | 8 +++++ 2 files changed, 43 insertions(+) diff --git a/fbgemm_gpu/bench/bench_utils.py b/fbgemm_gpu/bench/bench_utils.py index b174c32fcd..02ffda22f6 100644 --- a/fbgemm_gpu/bench/bench_utils.py +++ b/fbgemm_gpu/bench/bench_utils.py @@ -12,13 +12,17 @@ from dataclasses import dataclass from typing import Callable, List, Optional, Tuple +import numpy as np + import torch +from fbgemm_gpu.split_embedding_configs import SparseType from fbgemm_gpu.split_embedding_utils import ( # noqa: F401 b_indices, generate_requests, # noqa: F401 get_device, # noqa: F401 round_up, # noqa: F401 ) +from torch import nn logging.basicConfig(level=logging.DEBUG) @@ -455,3 +459,34 @@ def benchmark_vbe( return VBEBenchmarkOutput( avg, fwd, bwd, compressed_avg, compressed_fwd, reindex, compressed_bwd ) + + +def fill_random_scale_bias( + emb: nn.Module, + T: int, + weights_precision: SparseType, +) -> None: + for t in range(T): + (weights, scale_shift) = emb.split_embedding_weights()[t] + if scale_shift is not None: + (E, R) = scale_shift.shape + assert R == 4 + scales = None + shifts = None + if weights_precision == SparseType.INT8: + scales = np.random.uniform(0.001, 0.01, size=(E,)).astype(np.float16) + shifts = np.random.normal(-2, 2, size=(E,)).astype(np.float16) + elif weights_precision == SparseType.INT4: + scales = np.random.uniform(0.01, 0.1, size=(E,)).astype(np.float16) + shifts = np.random.normal(-2, 2, size=(E,)).astype(np.float16) + elif weights_precision == SparseType.INT2: + scales = np.random.uniform(0.1, 1, size=(E,)).astype(np.float16) + shifts = np.random.normal(-2, 2, size=(E,)).astype(np.float16) + scale_shift.copy_( + torch.tensor( + np.stack([scales, shifts], axis=1) + .astype(np.float16) + .view(np.uint8), + device=scale_shift.device, + ) + ) diff --git a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py index 84a87c96fb..df57b397a6 100644 --- a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py @@ -62,6 +62,7 @@ benchmark_requests_refer, benchmark_torch_function, benchmark_vbe, + fill_random_scale_bias, ) else: from fbgemm_gpu.bench.bench_utils import ( @@ -70,6 +71,7 @@ benchmark_requests_refer, benchmark_torch_function, benchmark_vbe, + fill_random_scale_bias, ) @@ -815,6 +817,7 @@ def nbit_cpu( # noqa C901 fp8_exponent_bias=fp8_exponent_bias, ).cpu() emb.fill_random_weights() + fill_random_scale_bias(emb, T, weights_precision) nparams_byte = sum(w.numel() for (w, _) in 
emb.split_embedding_weights()) param_size_multiplier = weights_precision.bit_rate() / 8.0 @@ -987,6 +990,7 @@ def nbit_device( # noqa C901 fp8_exponent_bias=fp8_exponent_bias, ).cuda() emb.fill_random_weights() + fill_random_scale_bias(emb, T, weights_precision) nparams_byte = sum(w.numel() for (w, _) in emb.split_embedding_weights()) param_size_multiplier = weights_precision.bit_rate() / 8.0 @@ -1267,6 +1271,7 @@ def nbit_device_with_spec( # noqa C901 else: emb = emb.cuda() emb.fill_random_weights() + fill_random_scale_bias(emb, T, weights_precision) nparams_byte = sum(w.numel() for (w, _) in emb.split_embedding_weights()) param_size_multiplier = weights_precision.bit_rate() / 8.0 @@ -1843,6 +1848,7 @@ def bench_uvm_cls( uvm_host_mapped=uvm_host_mapped, ).cuda() emb.fill_random_weights() + fill_random_scale_bias(emb, T, weights_precision) nvtx_range = ( f"UVM-RECORD-CACHE-{name.upper()}" @@ -2015,6 +2021,7 @@ def nbit_cache( # noqa C901 cache_assoc=cache_assoc, ).cuda() emb_nc.fill_random_weights() + fill_random_scale_bias(emb_nc, T, weights_precision) emb = IntNBitTableBatchedEmbeddingBagsCodegen( [ @@ -2040,6 +2047,7 @@ def nbit_cache( # noqa C901 cache_assoc=cache_assoc, ).cuda() emb.fill_random_weights() + fill_random_scale_bias(emb, T, weights_precision) nparams_byte = sum(w.numel() for (w, _) in emb.split_embedding_weights()) param_size_multiplier = weights_precision.bit_rate() / 8.0 From 39914eff04dd1d79557ae7be135d6dd81f43b328 Mon Sep 17 00:00:00 2001 From: Yifan Xu Date: Fri, 29 Sep 2023 15:18:22 -0700 Subject: [PATCH 59/94] Back out "jagged bmm CPU operator optimization" (#2053) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2053 Original commit changeset: 6688a5cd68fd Original Phabricator Diff: D45936724 Reviewed By: renganxu Differential Revision: D49783731 fbshipit-source-id: 54c1bd4ec355325d88ec1b22fe2335e8a07936e3 --- .../jagged_tensor_ops_cpu.cpp | 46 ++----------------- 1 file changed, 4 insertions(+), 42 deletions(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index 820173b6c5..7ae207adb8 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -17,10 +17,6 @@ #include "fbgemm_gpu/sparse_ops.h" #include "fbgemm_gpu/sparse_ops_utils.h" -#ifdef _OPENMP -#include -#endif - namespace fbgemm_gpu { ///@defgroup jagged-tensor-ops-cpu Jagged Tensor Operators @@ -1243,11 +1239,7 @@ void jagged_softmax_kernel( const int64_t max_L) { const int B = offsets.size(0) - 1; const int D = values.size(1); - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (auto b = 0; b < B; b++) { + for (const auto b : c10::irange(B)) { const int row_start = offsets[b]; const int row_end = offsets[b + 1]; const int length = std::min(row_end - row_start, (int)max_L); @@ -1284,10 +1276,6 @@ Tensor jagged_softmax_forward( const int D = values.size(1); auto output = at::empty_like(values); -#ifdef _OPENMP - omp_set_num_threads(10); -#endif - if (B > 0 && D > 0) { AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_softmax_kernel_1", [&] { @@ -1317,11 +1305,7 @@ void jagged_softmax_backward_kernel( const int64_t max_L) { const int B = offsets.size(0) - 1; const int D = grad_output.size(1); - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (auto b = 0; b < B; b++) { + for (const auto b : c10::irange(B)) { const int row_start = offsets[b]; const int row_end = offsets[b + 1]; const int length = 
std::min(row_end - row_start, (int)max_L); @@ -1354,10 +1338,6 @@ Tensor jagged_softmax_backward( const int D = grad_output.size(1); auto grad_input = at::empty_like(grad_output); -#ifdef _OPENMP - omp_set_num_threads(10); -#endif - if (B > 0 && D > 0) { AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_backward_kernel_1", [&] { @@ -1389,11 +1369,7 @@ void jagged_jagged_bmm_kernel( const int B = offsets.size(0) - 1; const int M = x_values.size(1); const int N = y_values.size(1); - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (auto b = 0; b < B; b++) { + for (const auto b : c10::irange(B)) { const int row_start = offsets[b]; const int row_end = offsets[b + 1]; const int length = std::min(row_end - row_start, (int)max_L); @@ -1421,11 +1397,6 @@ Tensor jagged_jagged_bmm_forward( const int M = x_values.size(-1); const int N = y_values.size(-1); auto output = at::zeros({B, M, N}, x_values.options()); - -#ifdef _OPENMP - omp_set_num_threads(10); -#endif - if (B > 0 && M > 0 && N > 0) { AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_jagged_bmm_kernel_1", [&] { @@ -1459,11 +1430,7 @@ void jagged_dense_bmm_kernel( const int B = x_offsets.size(0) - 1; const int K = x_values.size(1); const int N = y.size(2); - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (auto b = 0; b < B; b++) { + for (const auto b : c10::irange(B)) { const int row_start = x_offsets[b]; const int row_end = x_offsets[b + 1]; const int length = std::min(row_end - row_start, (int)max_L); @@ -1492,11 +1459,6 @@ Tensor jagged_dense_bmm_forward( const int N = y.size(-1); const int total_L = x_values.size(0); auto output = at::zeros({total_L, N}, x_values.options()); - -#ifdef _OPENMP - omp_set_num_threads(10); -#endif - if (B > 0 && M > 0 && N > 0) { AT_DISPATCH_INDEX_TYPES( x_offsets.scalar_type(), "jagged_dense_bmm_kernel_1", [&] { From 7b7ad61a32ec387114ca9a72cc8adc9d03241bb3 Mon Sep 17 00:00:00 2001 From: Andrew Gallagher Date: Sat, 30 Sep 2023 10:46:38 -0700 Subject: [PATCH 60/94] Fix aarch64 build break (#2055) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2055 The aarch64 CUDA builds use D46213158 to disable F14 intrinsics for compilations driven by NVCC/CUDA, instead the typical workaround that x86 uses: D34439017. However, it looks like there's some issue preventing NVCC from parsing the `F14SetFallback.h` code. It turns out that we likely never use this code from `.cu` sources, so this diff just drops an umbrella header and uses fine-grained `#include`s to avoid F14. 
Reviewed By: meyering Differential Revision: D49792747 fbshipit-source-id: 8d2ef8cc68bcb2442a5b34e521d548cbb03a4c09 --- fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h b/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h index 1e08490ede..9c17597724 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h @@ -9,7 +9,8 @@ #pragma once #include -#include +#include +#include #include "fbgemm_gpu/ops_utils.h" #include "fbgemm_gpu/sparse_ops_utils.h" From ea18a680b9fe3d82deddfdd677dab516ab2eb8d4 Mon Sep 17 00:00:00 2001 From: Banit Agrawal Date: Wed, 4 Oct 2023 14:40:15 -0700 Subject: [PATCH 61/94] Use 4k page instead of 2M for managed tensor (#2058) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2058 This diff changes the page size from 2M to 4k for prefaulting/mapping the pages. Reviewed By: q10, jasonjk-park, zyan0, jianyuh Differential Revision: D49924136 fbshipit-source-id: fdee08b9a4da54dce902c98ee3aae62ac0d3ad6c --- fbgemm_gpu/src/cumem_utils.cu | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fbgemm_gpu/src/cumem_utils.cu b/fbgemm_gpu/src/cumem_utils.cu index 9f7ecc308d..ce04a34cf2 100644 --- a/fbgemm_gpu/src/cumem_utils.cu +++ b/fbgemm_gpu/src/cumem_utils.cu @@ -224,11 +224,9 @@ Tensor new_host_mapped_tensor( // can minimize the cost while holding this global lock. void* const ptr = malloc(size_bytes); - // advise the kernel to allocate large 2M pages - madvise(ptr, size_bytes, MADV_HUGEPAGE); - - // pre-fault/map the pages by setting the first byte of the page - size_t pageSize = (1 << 21); + // Pre-fault/map the pages by setting the first byte of the page + // TODO: parallelize the mapping of pages with a threadpool executor + const size_t pageSize = (size_t)sysconf(_SC_PAGESIZE); uintptr_t alignedPtr = (((uintptr_t)ptr + pageSize - 1) & ~(pageSize - 1)); for (uintptr_t p = alignedPtr; p < ((uintptr_t)ptr + size_bytes); p += pageSize) { From 17384a9362c3a9f0345dda62f0ce5a24d0b09446 Mon Sep 17 00:00:00 2001 From: cyy Date: Wed, 4 Oct 2023 21:01:09 -0700 Subject: [PATCH 62/94] Fix unknown c++ flag detection in CMake (#2057) Summary: Unknown -Wno-XXX flags are still appended to GCC via append_cxx_flag_if_supported because of the behavior mentioned in GCC document: ``` When an unrecognized warning option is requested (e.g., -Wunknown-warning), GCC emits a diagnostic stating that the option is not recognized. However, if the -Wno- form is used, the behavior is slightly different: no diagnostic is produced for -Wno-unknown-warning unless other diagnostics are being produced. This allows the use of new -Wno- options with old compilers, but if something goes wrong, the compiler warns that an unrecognized option is present. ``` This PR tries to fix by detection the flag of the -WXXX form. The same patch was applied in PyTorch. 
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2057 Reviewed By: sryap Differential Revision: D49938257 Pulled By: q10 fbshipit-source-id: 76bbf10bff06fbaddad68f82dfc69f9b92007c1d --- CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ab980ebeb3..134523e7d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,7 +58,13 @@ set(CMAKE_C_STANDARD_REQUIRED ON) function(append_cxx_flag_if_supported flag outputvar) string(TOUPPER "HAS${flag}" _FLAG_NAME) string(REGEX REPLACE "[=-]" "_" _FLAG_NAME "${_FLAG_NAME}") - check_cxx_compiler_flag("${flag}" ${_FLAG_NAME}) + # GCC silents unknown -Wno-XXX flags, so we detect the corresponding -WXXX. + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + string(REGEX REPLACE "Wno-" "W" new_flag "${flag}") + else() + set(new_flag ${flag}) + endif() + check_cxx_compiler_flag("${new_flag}" ${_FLAG_NAME}) if(${_FLAG_NAME}) string(APPEND ${outputvar} " ${flag}") set(${outputvar} "${${outputvar}}" PARENT_SCOPE) From fa3eab6434e52782f9d14798f8fa2fd44fde11d8 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 5 Oct 2023 01:07:33 -0700 Subject: [PATCH 63/94] Update default CUDA version to 12.1 (#2063) Summary: - Update default CUDA version to 12.1 - Add extra build flag for ROCm builds to enable error tracing - Add script to download wheels from PyTorch PIP Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2063 Reviewed By: spcyppt Differential Revision: D49942815 Pulled By: q10 fbshipit-source-id: 836aec820a30da9f2139aafb2f9d5fef3b968377 --- .github/scripts/fbgemm_gpu_build.bash | 1 + .github/scripts/utils_pip.bash | 118 +++++++++++++++--- .github/workflows/fbgemm_gpu_cuda_nightly.yml | 2 +- .github/workflows/fbgemm_gpu_cuda_release.yml | 2 +- 4 files changed, 102 insertions(+), 21 deletions(-) diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 269641cef8..4355ac2936 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -92,6 +92,7 @@ __configure_fbgemm_gpu_build_rocm () { echo "[BUILD] Setting ROCm build args ..." 
build_args=( --package_variant=rocm + -DTORCH_USE_HIP_DSA=1 ) } diff --git a/.github/scripts/utils_pip.bash b/.github/scripts/utils_pip.bash index 686485711b..1a7d66a54a 100644 --- a/.github/scripts/utils_pip.bash +++ b/.github/scripts/utils_pip.bash @@ -13,12 +13,12 @@ # PyTorch PIP Install Functions ################################################################################ -install_from_pytorch_pip () { - local env_name="$1" - local package_name_raw="$2" - local package_version="$3" - local package_variant_type="$4" - local package_variant_version="$5" +__extract_pip_arguments () { + export env_name="$1" + export package_name_raw="$2" + export package_version="$3" + export package_variant_type="$4" + export package_variant_version="$5" if [ "$package_variant_type" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME PACKAGE_VERSION PACKAGE_VARIANT_TYPE [PACKAGE_VARIANT_VERSION]" echo "Example(s):" @@ -40,7 +40,7 @@ install_from_pytorch_pip () { # Replace underscores with hyphens to materialize the canonical name of the package # shellcheck disable=SC2155 - local package_name=$(echo "${package_name_raw}" | tr '_' '-') + export package_name=$(echo "${package_name_raw}" | tr '_' '-') # Set the package variant if [ "$package_variant_type" == "cuda" ]; then @@ -49,38 +49,66 @@ install_from_pytorch_pip () { # shellcheck disable=SC2206 local cuda_version_arr=(${cuda_version//./ }) # Convert, i.e. cuda 11.7.1 => cu117 - local package_variant="cu${cuda_version_arr[0]}${cuda_version_arr[1]}" + export package_variant="cu${cuda_version_arr[0]}${cuda_version_arr[1]}" elif [ "$package_variant_type" == "rocm" ]; then # Extract the ROCM version or default to 5.5.1 local rocm_version="${package_variant_version:-5.5.1}" # shellcheck disable=SC2206 local rocm_version_arr=(${rocm_version//./ }) # Convert, i.e. 
rocm 5.5.1 => rocm5.5 - local package_variant="rocm${rocm_version_arr[0]}.${rocm_version_arr[1]}" + export package_variant="rocm${rocm_version_arr[0]}.${rocm_version_arr[1]}" else - local package_variant_type="cpu" - local package_variant="cpu" + export package_variant_type="cpu" + export package_variant="cpu" fi echo "[INSTALL] Extracted package variant: ${package_variant}" # Set the package name and installation channel if [ "$package_version" == "nightly" ] || [ "$package_version" == "test" ]; then - local package_package="--pre ${package_name}" - local package_channel="https://download.pytorch.org/whl/${package_version}/${package_variant}/" + export pip_package="--pre ${package_name}" + export pip_channel="https://download.pytorch.org/whl/${package_version}/${package_variant}/" elif [ "$package_version" == "latest" ]; then - local package_package="${package_name}" - local package_channel="https://download.pytorch.org/whl/${package_variant}/" + export pip_package="${package_name}" + export pip_channel="https://download.pytorch.org/whl/${package_variant}/" else - local package_package="${package_name}==${package_version}+${package_variant}" - local package_channel="https://download.pytorch.org/whl/${package_variant}/" + export pip_package="${package_name}==${package_version}+${package_variant}" + export pip_channel="https://download.pytorch.org/whl/${package_variant}/" fi +} + +install_from_pytorch_pip () { + local env_name="$1" + local package_name_raw="$2" + local package_version="$3" + local package_variant_type="$4" + local package_variant_version="$5" + if [ "$package_variant_type" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME PACKAGE_VERSION PACKAGE_VARIANT_TYPE [PACKAGE_VARIANT_VERSION]" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Install the CPU variant a specific version" + echo " ${FUNCNAME[0]} build_env torch latest cpu # Install the CPU variant of the latest stable version" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu test cuda 11.7.1 # Install the variant for CUDA 11.7" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm 5.3 # Install the variant for ROCM 5.3" + return 1 + else + echo "################################################################################" + echo "# Install ${package_name_raw} (PyTorch PIP)" + echo "#" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" + echo "################################################################################" + echo "" + fi + + test_network_connection || return 1 + + __extract_pip_arguments "$env_name" "$package_name_raw" "$package_version" "$package_variant_type" "$package_variant_version" # shellcheck disable=SC2155 local env_prefix=$(env_name_or_prefix "${env_name}") - echo "[INSTALL] Attempting to install [${package_name}, ${package_version}+${package_variant}] through PIP using channel ${package_channel} ..." + echo "[INSTALL] Attempting to install [${package_name}, ${package_version}+${package_variant}] from PyTorch PIP using channel ${pip_channel} ..." 
# shellcheck disable=SC2086 - (exec_with_retries conda run ${env_prefix} pip install ${package_package} --extra-index-url ${package_channel}) || return 1 + (exec_with_retries conda run ${env_prefix} pip install ${pip_package} --extra-index-url ${pip_channel}) || return 1 # Check only applies to non-CPU variants if [ "$package_variant_type" != "cpu" ]; then @@ -97,6 +125,58 @@ install_from_pytorch_pip () { fi } +################################################################################ +# PyTorch PIP Download Functions +################################################################################ + +download_from_pytorch_pip () { + local env_name="$1" + local package_name_raw="$2" + local package_version="$3" + local package_variant_type="$4" + local package_variant_version="$5" + if [ "$package_variant_type" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME PACKAGE_VERSION PACKAGE_VARIANT_TYPE [PACKAGE_VARIANT_VERSION]" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Download the CPU variant a specific version" + echo " ${FUNCNAME[0]} build_env torch latest cpu # Download the CPU variant of the latest stable version" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu test cuda 11.7.1 # Download the variant for CUDA 11.7" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm 5.3 # Download the variant for ROCM 5.3" + return 1 + else + echo "################################################################################" + echo "# Download ${package_name_raw} (PyTorch PIP)" + echo "#" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" + echo "################################################################################" + echo "" + fi + + test_network_connection || return 1 + + __extract_pip_arguments "$env_name" "$package_name_raw" "$package_version" "$package_variant_type" "$package_variant_version" + + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + + echo "[DOWNLOAD] Removing previously downloaded wheels from current directory ..." + # shellcheck disable=SC2035 + rm -rf *.whl || return 1 + + echo "[DOWNLOAD] Attempting to download wheel [${package_name}, ${package_version}+${package_variant}] from PyTorch PIP using channel ${pip_channel} ..." + # shellcheck disable=SC2086 + (exec_with_retries conda run ${env_prefix} pip download ${pip_package} --extra-index-url ${pip_channel}) || return 1 + + # Ensure that the package build is of the correct variant + # This test usually applies to the nightly builds + # shellcheck disable=SC2010 + if ls -la . | grep "${package_name}-"; then + echo "[CHECK] Successfully downloaded the wheel." + else + echo "[CHECK] The wheel was not found!" 
+ return 1 + fi +} ################################################################################ # PyPI Publish Functions diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml index 420e879ea8..9635a2c20c 100644 --- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -136,7 +136,7 @@ jobs: python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.8.0", "12.1.1" ] # Specify exactly ONE CUDA version for artifact publish - cuda-version-publish: [ "11.8.0" ] + cuda-version-publish: [ "12.1.1" ] needs: build_artifact steps: diff --git a/.github/workflows/fbgemm_gpu_cuda_release.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml index 571ac76b0d..bc1dea236e 100644 --- a/.github/workflows/fbgemm_gpu_cuda_release.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -32,7 +32,7 @@ on: type: choice required: false options: [ "11.8.0", "12.1.1" ] - default: "11.8.0" + default: "12.1.1" concurrency: # Cancel previous runs in the PR if a new commit is pushed From 2b046829b218b38e37ab617c3dd134e9033a9a85 Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 5 Oct 2023 01:11:32 -0700 Subject: [PATCH 64/94] Fix inconsistent dll linkage warning (#2059) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2059 Reviewed By: sryap Differential Revision: D49938275 Pulled By: q10 fbshipit-source-id: 3110c89af9d80e64f2a4715eed0d3e2f93f0e3c8 --- include/fbgemm/FbgemmFPCommon.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/fbgemm/FbgemmFPCommon.h b/include/fbgemm/FbgemmFPCommon.h index 95bd5b9fe9..238514a345 100644 --- a/include/fbgemm/FbgemmFPCommon.h +++ b/include/fbgemm/FbgemmFPCommon.h @@ -73,6 +73,7 @@ FBGEMM_API void cblas_gemm_compute( int thread_id = 0, int num_threads = 1); +#if defined(FBGEMM_EXPORTS) // autotuned kernel splits for various cases m = 1:mb_max template void cblas_gemm_compute( @@ -252,6 +253,7 @@ void cblas_gemm_compute( } } } +#endif #undef FBGEMM_USE_REF_KERNEL } // namespace fbgemm From 8f7d8c7d8d7c64157903607332553ee5e6a2a3f0 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Thu, 5 Oct 2023 09:25:34 -0700 Subject: [PATCH 65/94] Fix non-contiguous tensor problem in jagged_index_select (#2060) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2060 Before this diff, `jagged_index_select` kernels take raw pointers as arguments. This requires the input tensors to be contiguous. However, the `jagged_index_select` operator did not make sure that the tensors are contiguous before extracting and passing the raw pointers to the kernels causing the correctness issue. This diff replaces the raw pointer arguments with PyTorch's `PackedTensorAccessor` which handles non-contiguous tensor accesses automatically. For some tensors that their raw pointers are still being used, the operator makes sure that the tensors are contiguous before using them. 
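For illustration, a minimal standalone sketch of the failure mode (not the FBGEMM kernels themselves; the kernel names and the 2-D float layout here are hypothetical). Raw-pointer indexing hard-codes a contiguous row-major layout, while a `PackedTensorAccessor` carries the tensor's real sizes and strides:

```
// Minimal sketch, for illustration only -- not the actual FBGEMM kernels.
// copy_rows_raw / copy_rows_accessor are hypothetical names.
#include <ATen/ATen.h>

// Raw-pointer version: the indexing assumes input.stride(0) == num_cols and
// input.stride(1) == 1, i.e. a contiguous row-major tensor. An input produced
// by as_strided()/transpose() breaks this assumption and yields wrong results.
__global__ void copy_rows_raw(
    float* output,
    const float* input,
    int64_t num_rows,
    int64_t num_cols) {
  const int64_t row = blockIdx.x;
  if (row >= num_rows) {
    return;
  }
  for (int64_t col = threadIdx.x; col < num_cols; col += blockDim.x) {
    output[row * num_cols + col] = input[row * num_cols + col];
  }
}

// Accessor version: the PackedTensorAccessor is passed by value and carries
// the real sizes and strides, so [row][col] indexing stays correct even when
// the underlying tensor is non-contiguous.
__global__ void copy_rows_accessor(
    at::PackedTensorAccessor64<float, 2, at::RestrictPtrTraits> output,
    const at::PackedTensorAccessor64<float, 2, at::RestrictPtrTraits> input) {
  const int64_t row = blockIdx.x;
  if (row >= input.size(0)) {
    return;
  }
  for (int64_t col = threadIdx.x; col < input.size(1); col += blockDim.x) {
    output[row][col] = input[row][col];
  }
}
```

On the host side, the accessors would be built with `packed_accessor64<float, 2, at::RestrictPtrTraits>()`, and any tensor whose raw pointer must still be handed to a routine such as `binary_search_range` can first go through `expect_contiguous()`, which is the approach taken in this diff.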
Reviewed By: choudharydhruv Differential Revision: D49937274 fbshipit-source-id: dcdc751191ae17e3697b99d30145c67ab470a218 --- .../jagged_index_add_2d_forward.cu | 55 +++++++++++-------- .../jagged_index_select_2d_forward.cu | 55 +++++++++++-------- fbgemm_gpu/test/jagged_tensor_ops_test.py | 10 ++++ 3 files changed, 74 insertions(+), 46 deletions(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_index_add_2d_forward.cu b/fbgemm_gpu/src/jagged_tensor_ops/jagged_index_add_2d_forward.cu index 100b88be6d..30b604d072 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_index_add_2d_forward.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_index_add_2d_forward.cu @@ -14,14 +14,14 @@ namespace fbgemm_gpu { template __global__ __launch_bounds__(kMaxThreads) void jagged_index_add_2d_kernel( - scalar_t* output, - const scalar_t* values, - const offset_t* input_offsets, - const index_t* indices, - const offset_t* output_offsets, - const int64_t num_input_rows, - const int64_t num_dense_input_rows, - const int64_t num_cols) { + at::PackedTensorAccessor64 output, + const at::PackedTensorAccessor64 values, + const at::PackedTensorAccessor32 + input_offsets, + const at::PackedTensorAccessor32 indices, + const at::PackedTensorAccessor32 + output_offsets, + const int64_t num_dense_input_rows) { __shared__ int smem[1]; for (offset_t dense_input_offset = blockIdx.x; dense_input_offset < num_dense_input_rows; @@ -29,8 +29,9 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_index_add_2d_kernel( // Binary search // TODO: use multiple threads to do bin search to reduce number of steps if (threadIdx.x == 0) { + const auto num_input_rows = indices.size(0); binary_search_range( - smem, input_offsets, dense_input_offset, num_input_rows); + smem, &input_offsets[0], dense_input_offset, num_input_rows); } __syncthreads(); @@ -46,14 +47,11 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_index_add_2d_kernel( const offset_t output_offset = (index == 0 ? 
0 : output_offsets[index - 1]) + rel_index; - // Shift buffers - const scalar_t* values_ = values + dense_input_offset * num_cols; - scalar_t* output_ = output + output_offset * num_cols; - // TODO: Avoid using atoimcAdd (because it could lead to the numerical // indeterminism issue) + const auto num_cols = output.size(1); for (int i = threadIdx.x; i < num_cols; i += blockDim.x) { - gpuAtomicAdd(&output_[i], values_[i]); + gpuAtomicAdd(&output[output_offset][i], values[dense_input_offset][i]); } } } @@ -85,7 +83,6 @@ Tensor jagged_index_add_2d_forward_cuda( device_guard.set_index(values.get_device()); auto num_cols = values.size(1); - const int64_t num_input_rows = indices.numel(); const int64_t max_num_blocks = 1024; // Arbitrarily set to this number of now const int64_t max_num_threads = kMaxThreads; @@ -94,6 +91,9 @@ Tensor jagged_index_add_2d_forward_cuda( Tensor output = at::zeros({num_output_rows, num_cols}, values.options()); if (num_blocks > 0) { + // input_offsets has to be contiguous since it is passed to + // binary_search_range which accepts raw pointers + const auto input_offsets_contig = input_offsets.expect_contiguous(); AT_DISPATCH_ALL_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, @@ -109,14 +109,23 @@ Tensor jagged_index_add_2d_forward_cuda( dim3(num_cols), 0, at::cuda::getCurrentCUDAStream()>>>( - output.data_ptr(), - values.data_ptr(), - input_offsets.data_ptr(), - indices.data_ptr(), - output_offsets.data_ptr(), - num_input_rows, - num_dense_input_rows, - num_cols); + output.packed_accessor64< + scalar_t, + 2, + at::RestrictPtrTraits>(), + values.packed_accessor64< + scalar_t, + 2, + at::RestrictPtrTraits>(), + input_offsets_contig->packed_accessor32< + int64_t, + 1, + at::RestrictPtrTraits>(), + indices + .packed_accessor32(), + output_offsets + .packed_accessor32(), + num_dense_input_rows); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_index_select_2d_forward.cu b/fbgemm_gpu/src/jagged_tensor_ops/jagged_index_select_2d_forward.cu index b5a8f5ffc5..618a4cd30f 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_index_select_2d_forward.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_index_select_2d_forward.cu @@ -14,14 +14,14 @@ namespace fbgemm_gpu { template __global__ __launch_bounds__(kMaxThreads) void jagged_index_select_2d_kernel( - scalar_t* output, - const scalar_t* input, - const offset_t* input_offsets, - const index_t* indices, - const offset_t* output_offsets, - const int64_t num_output_rows, - const int64_t num_dense_output_rows, - const int64_t num_cols) { + at::PackedTensorAccessor64 output, + const at::PackedTensorAccessor64 input, + const at::PackedTensorAccessor32 + input_offsets, + const at::PackedTensorAccessor32 indices, + const at::PackedTensorAccessor32 + output_offsets, + const int64_t num_dense_output_rows) { __shared__ int smem[1]; for (offset_t dense_output_offset = blockIdx.x; dense_output_offset < num_dense_output_rows; @@ -29,8 +29,9 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_index_select_2d_kernel( // Binary search // TODO: use multiple threads to do bin search to reduce number of steps if (threadIdx.x == 0) { + const auto num_output_rows = indices.size(0); binary_search_range( - smem, output_offsets, dense_output_offset, num_output_rows); + smem, &output_offsets[0], dense_output_offset, num_output_rows); } __syncthreads(); @@ -46,12 +47,9 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_index_select_2d_kernel( const offset_t input_offset = (index == 0 ? 
0 : input_offsets[index - 1]) + rel_index; - // Shift buffers - scalar_t* output_ = output + dense_output_offset * num_cols; - const scalar_t* input_ = input + input_offset * num_cols; - + const auto num_cols = input.size(1); for (int i = threadIdx.x; i < num_cols; i += blockDim.x) { - output_[i] = input_[i]; + output[dense_output_offset][i] = input[input_offset][i]; } } } @@ -81,7 +79,6 @@ Tensor jagged_index_select_2d_forward_cuda( device_guard.set_index(values.get_device()); auto num_cols = values.size(1); - const int64_t num_output_rows = indices.numel(); const int64_t max_num_blocks = 1024; // Arbitrarily set to this number of now const int64_t max_num_threads = kMaxThreads; @@ -91,6 +88,9 @@ Tensor jagged_index_select_2d_forward_cuda( at::empty({num_dense_output_rows, num_cols}, values.options()); if (num_blocks > 0) { + // output_offsets has to be contiguous since it is passed to + // binary_search_range which accepts raw pointers + const auto output_offsets_contig = output_offsets.expect_contiguous(); AT_DISPATCH_ALL_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, @@ -106,14 +106,23 @@ Tensor jagged_index_select_2d_forward_cuda( dim3(num_cols), 0, at::cuda::getCurrentCUDAStream()>>>( - output.data_ptr(), - values.data_ptr(), - input_offsets.data_ptr(), - indices.data_ptr(), - output_offsets.data_ptr(), - num_output_rows, - num_dense_output_rows, - num_cols); + output.packed_accessor64< + scalar_t, + 2, + at::RestrictPtrTraits>(), + values.packed_accessor64< + scalar_t, + 2, + at::RestrictPtrTraits>(), + input_offsets + .packed_accessor32(), + indices + .packed_accessor32(), + output_offsets_contig->packed_accessor32< + int64_t, + 1, + at::RestrictPtrTraits>(), + num_dense_output_rows); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index 99efbd37fa..1787810554 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -1827,6 +1827,7 @@ def jagged_index_select_2d_ref( else st.just(False) if (gpu_available and TEST_WITH_ROCM) else st.just(True), + check_non_contiguous=st.booleans(), ) @settings(max_examples=20, deadline=None, verbosity=Verbosity.verbose) def test_jagged_index_select_2d( @@ -1838,6 +1839,7 @@ def test_jagged_index_select_2d( index_dtype: torch.dtype, jagged_tensor_dtype: torch.dtype, use_cpu: bool, + check_non_contiguous: bool, ) -> None: device = torch.device("cpu" if use_cpu else "cuda") is_float = jagged_tensor_dtype in [torch.float, torch.half, torch.bfloat16] @@ -1873,6 +1875,10 @@ def test_jagged_index_select_2d( ) values_ref = values.detach().clone() + if check_non_contiguous: + values = values.as_strided(values.shape, (1, values.shape[0])) + values_ref = values_ref.as_strided(values.shape, (1, values.shape[0])) + # Only float tensors can require grad if is_float: values.requires_grad = True @@ -1891,6 +1897,10 @@ def test_jagged_index_select_2d( grad = torch.rand_like(output) grad_ref = grad.detach().clone() + if check_non_contiguous: + grad = grad.as_strided(grad.shape, (1, grad.shape[0])) + grad_ref = grad_ref.as_strided(grad.shape, (1, grad.shape[0])) + output.backward(grad) output_ref.backward(grad_ref) From d307673b41d9cd9d68ae7d9997b1dad87d0ce9c4 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 5 Oct 2023 16:10:46 -0700 Subject: [PATCH 66/94] =?UTF-8?q?Add=20option=20to=20install=20PyTorch=20f?= =?UTF-8?q?rom=20release=20c=E2=80=A6=20(#2065)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Summary: …hannel - For the FBGEMM_GPU release workflows, add the option to install PyTorch from the PyTorch PIP release channel in addition to the test channel Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2065 Reviewed By: spcyppt Differential Revision: D49960302 Pulled By: q10 fbshipit-source-id: 1576ace7b8c24834f7db4fe19ccb0d07faa2fb1d --- .github/scripts/test_torchrec.bash | 2 +- .github/scripts/utils_pip.bash | 24 +++++++++---------- .github/scripts/utils_pytorch.bash | 16 ++++++------- .github/workflows/fbgemm_gpu_cpu_release.yml | 10 ++++++-- .github/workflows/fbgemm_gpu_cuda_release.yml | 10 ++++++-- fbgemm_gpu/README.md | 4 ++-- fbgemm_gpu/docs/BuildInstructions.md | 8 +++---- fbgemm_gpu/docs/InstallationInstructions.md | 16 ++++++------- 8 files changed, 51 insertions(+), 39 deletions(-) diff --git a/.github/scripts/test_torchrec.bash b/.github/scripts/test_torchrec.bash index 9bdb63cbd1..abc67c91cb 100644 --- a/.github/scripts/test_torchrec.bash +++ b/.github/scripts/test_torchrec.bash @@ -32,7 +32,7 @@ usage () { echo "CUDA_VERSION : PyTorch's CUDA version (e.g., 11.6, 11.7)" echo "FBGEMM_WHEEL_PATH : path to FBGEMM_GPU's wheel file" echo "MINICONDA_PREFIX : path to install Miniconda (default: \$HOME/miniconda)" - echo "Example: Python 3.10 + PyTorch nightly (CUDA 11.7), install miniconda at \$HOME/miniconda, using dist/fbgemm_gpu_nightly.whl" + echo "Example: Python 3.10 + PyTorch nightly (CUDA 12.1), install miniconda at \$HOME/miniconda, using dist/fbgemm_gpu_nightly.whl" # shellcheck disable=SC2086 echo " bash $(basename ${BASH_SOURCE[0]}) -v -o torchrec_nightly -p 3.10 -P pytorch-nightly -c 11.7 -w dist/fbgemm_gpu_nightly.whl" } diff --git a/.github/scripts/utils_pip.bash b/.github/scripts/utils_pip.bash index 1a7d66a54a..cd08af2c12 100644 --- a/.github/scripts/utils_pip.bash +++ b/.github/scripts/utils_pip.bash @@ -23,8 +23,8 @@ __extract_pip_arguments () { echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME PACKAGE_VERSION PACKAGE_VARIANT_TYPE [PACKAGE_VARIANT_VERSION]" echo "Example(s):" echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Install the CPU variant a specific version" - echo " ${FUNCNAME[0]} build_env torch latest cpu # Install the CPU variant of the latest stable version" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu test cuda 11.7.1 # Install the variant for CUDA 11.7" + echo " ${FUNCNAME[0]} build_env torch release cpu # Install the CPU variant of the latest stable version" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu test cuda 12.1.0 # Install the variant for CUDA 12.1" echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm 5.3 # Install the variant for ROCM 5.3" return 1 else @@ -48,7 +48,7 @@ __extract_pip_arguments () { local cuda_version="${package_variant_version:-11.8.0}" # shellcheck disable=SC2206 local cuda_version_arr=(${cuda_version//./ }) - # Convert, i.e. cuda 11.7.1 => cu117 + # Convert, i.e. 
cuda 12.1.0 => cu121 export package_variant="cu${cuda_version_arr[0]}${cuda_version_arr[1]}" elif [ "$package_variant_type" == "rocm" ]; then # Extract the ROCM version or default to 5.5.1 @@ -67,7 +67,7 @@ __extract_pip_arguments () { if [ "$package_version" == "nightly" ] || [ "$package_version" == "test" ]; then export pip_package="--pre ${package_name}" export pip_channel="https://download.pytorch.org/whl/${package_version}/${package_variant}/" - elif [ "$package_version" == "latest" ]; then + elif [ "$package_version" == "release" ]; then export pip_package="${package_name}" export pip_channel="https://download.pytorch.org/whl/${package_variant}/" else @@ -85,10 +85,10 @@ install_from_pytorch_pip () { if [ "$package_variant_type" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME PACKAGE_VERSION PACKAGE_VARIANT_TYPE [PACKAGE_VARIANT_VERSION]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Install the CPU variant a specific version" - echo " ${FUNCNAME[0]} build_env torch latest cpu # Install the CPU variant of the latest stable version" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu test cuda 11.7.1 # Install the variant for CUDA 11.7" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm 5.3 # Install the variant for ROCM 5.3" + echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Install the CPU variant for a specific version" + echo " ${FUNCNAME[0]} build_env torch release cpu # Install the CPU variant, latest release version" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu test cuda 12.1.0 # Install the CUDA 12.1 variant, latest test version" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm 5.3 # Install the ROCM 5.3 variant, latest nightly version" return 1 else echo "################################################################################" @@ -138,10 +138,10 @@ download_from_pytorch_pip () { if [ "$package_variant_type" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME PACKAGE_VERSION PACKAGE_VARIANT_TYPE [PACKAGE_VARIANT_VERSION]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Download the CPU variant a specific version" - echo " ${FUNCNAME[0]} build_env torch latest cpu # Download the CPU variant of the latest stable version" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu test cuda 11.7.1 # Download the variant for CUDA 11.7" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm 5.3 # Download the variant for ROCM 5.3" + echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Download the CPU variant for a specific version" + echo " ${FUNCNAME[0]} build_env torch release cpu # Download the CPU variant, latest stable version" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu test cuda 12.1.0 # Download the CUDA 12.1 variant, latest test version" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm 5.3 # Download the ROCM 5.3 variant, latest nightly version" return 1 else echo "################################################################################" diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash index 1fb743b653..f14e97b2c2 100644 --- a/.github/scripts/utils_pytorch.bash +++ b/.github/scripts/utils_pytorch.bash @@ -25,9 +25,9 @@ install_pytorch_conda () { echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTORCH_VERSION [CPU]" echo "Example(s):" echo " ${FUNCNAME[0]} build_env 1.11.0 # Install a specific version" - echo " ${FUNCNAME[0]} build_env latest # Install the latest stable release" - echo " ${FUNCNAME[0]} build_env test # Install the pre-release" - 
echo " ${FUNCNAME[0]} build_env nightly cpu # Install the CPU variant of the nightly" + echo " ${FUNCNAME[0]} build_env release # Install the latest release" + echo " ${FUNCNAME[0]} build_env test # Install the latest pre-release" + echo " ${FUNCNAME[0]} build_env nightly # Install the latest nightly" return 1 else echo "################################################################################" @@ -51,7 +51,7 @@ install_pytorch_conda () { # Set package name and installation channel if [ "$pytorch_version" == "nightly" ] || [ "$pytorch_version" == "test" ]; then local pytorch_channel="pytorch-${pytorch_version}" - elif [ "$pytorch_version" == "latest" ]; then + elif [ "$pytorch_version" == "release" ]; then local pytorch_channel="pytorch" else local pytorch_package="${pytorch_package}==${pytorch_version}" @@ -111,10 +111,10 @@ install_pytorch_pip () { if [ "$pytorch_variant_type" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTORCH_VERSION PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env 1.11.0 cpu # Install the CPU variant a specific version" - echo " ${FUNCNAME[0]} build_env latest cpu # Install the CPU variant of the latest stable version" - echo " ${FUNCNAME[0]} build_env test cuda 11.7.1 # Install the variant for CUDA 11.7" - echo " ${FUNCNAME[0]} build_env nightly rocm 5.3 # Install the variant for ROCM 5.3" + echo " ${FUNCNAME[0]} build_env 1.11.0 cpu # Install the CPU variant for a specific version" + echo " ${FUNCNAME[0]} build_env release cpu # Install the CPU variant, latest release version" + echo " ${FUNCNAME[0]} build_env test cuda 12.1.0 # Install the CUDA 12.1 variant, latest test version" + echo " ${FUNCNAME[0]} build_env nightly rocm 5.3 # Install the ROCM 5.3 variant, latest nightly version" return 1 else echo "################################################################################" diff --git a/.github/workflows/fbgemm_gpu_cpu_release.yml b/.github/workflows/fbgemm_gpu_cpu_release.yml index 87aa54ac75..67480ef3f2 100644 --- a/.github/workflows/fbgemm_gpu_cpu_release.yml +++ b/.github/workflows/fbgemm_gpu_cpu_release.yml @@ -27,6 +27,12 @@ on: type: boolean required: false default: false + pytorch_channel: + description: PyTorch Package Channel + type: choice + required: false + options: [ "test", "release" ] + default: "test" concurrency: # Cancel previous runs in the PR if a new commit is pushed @@ -85,7 +91,7 @@ jobs: run: . $PRELUDE; install_build_tools $BUILD_ENV - name: Install PyTorch-CPU Test - run: . $PRELUDE; install_pytorch_pip $BUILD_ENV test cpu + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_channel || 'test' }} cpu - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV @@ -149,7 +155,7 @@ jobs: run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} - name: Install PyTorch Test - run: . $PRELUDE; install_pytorch_pip $BUILD_ENV test cpu + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_channel || 'test' }} cpu - name: Prepare FBGEMM_GPU Build run: . 
$PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV diff --git a/.github/workflows/fbgemm_gpu_cuda_release.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml index bc1dea236e..92cc0e1e38 100644 --- a/.github/workflows/fbgemm_gpu_cuda_release.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -33,6 +33,12 @@ on: required: false options: [ "11.8.0", "12.1.1" ] default: "12.1.1" + pytorch_channel: + description: PyTorch Package Channel + type: choice + required: false + options: [ "test", "release" ] + default: "test" concurrency: # Cancel previous runs in the PR if a new commit is pushed @@ -94,7 +100,7 @@ jobs: run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }} - name: Install PyTorch Test - run: . $PRELUDE; install_pytorch_pip $BUILD_ENV test cuda ${{ matrix.cuda-version }} + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_channel || 'test' }} cuda ${{ matrix.cuda-version }} - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} @@ -162,7 +168,7 @@ jobs: run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }} - name: Install PyTorch Test - run: . $PRELUDE; install_pytorch_pip $BUILD_ENV test cuda ${{ matrix.cuda-version }} + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_channel || 'test' }} cuda ${{ matrix.cuda-version }} - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV diff --git a/fbgemm_gpu/README.md b/fbgemm_gpu/README.md index cdab48279b..e53545430d 100644 --- a/fbgemm_gpu/README.md +++ b/fbgemm_gpu/README.md @@ -8,8 +8,8 @@ FBGEMM_GPU (FBGEMM GPU Kernels Library) is a collection of high-performance PyTo GPU operator libraries for training and inference. The library provides efficient table batched embedding bag, data layout transformation, and quantization supports. -FBGEMM_GPU is currently tested with CUDA 11.7.1 and 11.8 in CI, and with PyTorch -packages (1.13+) that are built against those CUDA versions. +FBGEMM_GPU is currently tested with cuda 12.1.0 and 11.8 in CI, and with PyTorch +packages (2.1+) that are built against those CUDA versions. Only Intel/AMD CPUs with AVX2 extensions are currently supported. diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md index 04090a1b78..2fef4be9f6 100644 --- a/fbgemm_gpu/docs/BuildInstructions.md +++ b/fbgemm_gpu/docs/BuildInstructions.md @@ -253,13 +253,13 @@ PyTorch for ROCm builds. 
```sh # Install the latest nightly -conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu117/ +conda run -n "${env_name}" pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121/ # Install the latest test (RC) -conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/test/cu117/ +conda run -n "${env_name}" pip install --pre torch --index-url https://download.pytorch.org/whl/test/cu121/ # Install a specific version -conda run -n "${env_name}" pip install torch==2.0.0+cu117 --extra-index-url https://download.pytorch.org/whl/cu117/ +conda run -n "${env_name}" pip install torch==2.1.0+cu121 --index-url https://download.pytorch.org/whl/cu121/ # Install the latest nightly (ROCm 5.3) -conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/rocm5.3/ +conda run -n "${env_name}" pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm5.3/ ``` ### Post-Install Checks diff --git a/fbgemm_gpu/docs/InstallationInstructions.md b/fbgemm_gpu/docs/InstallationInstructions.md index 2799625491..13e1d75480 100644 --- a/fbgemm_gpu/docs/InstallationInstructions.md +++ b/fbgemm_gpu/docs/InstallationInstructions.md @@ -14,20 +14,20 @@ The shortened summary of the installation steps: ```sh # CUDA Nightly -pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu117/ -pip install fbgemm-gpu-nightly +pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121/ +pip install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu121/ # CUDA Release -pip install --pre torch --extra-index-url https://download.pytorch.org/whl/test/cu117/ -pip install fbgemm-gpu +pip install torch --index-url https://download.pytorch.org/whl/cu121/ +pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cu121/ # CPU-only Nightly -pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu/ -pip install fbgemm-gpu-nightly-cpu +pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu/ +pip install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cpu/ # CPU-only Release -pip install --pre torch --extra-index-url https://download.pytorch.org/whl/test/cpu/ -pip install fbgemm-gpu-cpu +pip install torch --index-url https://download.pytorch.org/whl/cpu/ +pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu/ # Test the installation python -c "import torch; import fbgemm_gpu" From c22d85c0577d8c917167b60836c894f2a702bee0 Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Thu, 5 Oct 2023 19:16:10 -0700 Subject: [PATCH 67/94] Update ROCm version on CI pipeline to 5.6 and 5.7 (#2066) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2066 PyTorch no longer releases ROCm 5.5 for nightly. The last nightly release was on 9/21/23. This will cause some tests to fail on CI if the diff relies on torch's new changes after 9/21 (e.g., https://github.com/pytorch/FBGEMM/actions/runs/6398449444/job/17368614311) Pytorch dev infra also supports ROCm 5.6 and 5.7 by default. This diff updates the ROCm version on FBGEMM CI to align with PyTorch. 
Reviewed By: q10 Differential Revision: D49980627 fbshipit-source-id: bda3a858bcd3b129564f9db85024968855dc87d3 --- .github/workflows/fbgemm_gpu_ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index a1817a3761..8760785a8c 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -47,7 +47,7 @@ jobs: ] container-image: [ "ubuntu:20.04" ] python-version: [ "3.8", "3.9", "3.10" ] - rocm-version: [ "5.5.1", "5.6" ] + rocm-version: [ "5.6", "5.7" ] steps: - name: Setup Build Container @@ -116,7 +116,7 @@ jobs: ] # ROCm machines are limited, so we only test against Python 3.10 python-version: [ "3.10" ] - rocm-version: [ "5.5.1", "5.6" ] + rocm-version: [ "5.6", "5.7" ] steps: - name: Setup Build Container From a684a79addb847820b1a37b48e47110c74a0eb43 Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 5 Oct 2023 22:39:02 -0700 Subject: [PATCH 68/94] Fix more MSVC inconsistent DLL linkage (#2064) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2064 Reviewed By: spcyppt Differential Revision: D49978436 Pulled By: q10 fbshipit-source-id: 74ec5bcc121b79774e20e14480a624e35f28f1e7 --- src/EmbeddingSpMDMAvx2.cc | 2 +- src/FbgemmBfloat16ConvertAvx2.cc | 1 + src/FbgemmBfloat16ConvertAvx512.cc | 1 + src/FbgemmFloat16ConvertAvx2.cc | 1 + src/FbgemmFloat16ConvertAvx512.cc | 1 + 5 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/EmbeddingSpMDMAvx2.cc b/src/EmbeddingSpMDMAvx2.cc index d616bfda8a..e6d3ed4a7b 100644 --- a/src/EmbeddingSpMDMAvx2.cc +++ b/src/EmbeddingSpMDMAvx2.cc @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#define FBGEMM_EXPORTS #include #include "RefImplementations.h" #include "fbgemm/FbgemmEmbedding.h" diff --git a/src/FbgemmBfloat16ConvertAvx2.cc b/src/FbgemmBfloat16ConvertAvx2.cc index f1cc785ee2..b044dd4460 100644 --- a/src/FbgemmBfloat16ConvertAvx2.cc +++ b/src/FbgemmBfloat16ConvertAvx2.cc @@ -10,6 +10,7 @@ (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))) #include #endif +#define FBGEMM_EXPORTS #include "fbgemm/FbgemmConvert.h" namespace fbgemm { diff --git a/src/FbgemmBfloat16ConvertAvx512.cc b/src/FbgemmBfloat16ConvertAvx512.cc index 0111bc6e7d..b74d9466f3 100644 --- a/src/FbgemmBfloat16ConvertAvx512.cc +++ b/src/FbgemmBfloat16ConvertAvx512.cc @@ -10,6 +10,7 @@ (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))) #include #endif +#define FBGEMM_EXPORTS #include "fbgemm/FbgemmConvert.h" namespace fbgemm { diff --git a/src/FbgemmFloat16ConvertAvx2.cc b/src/FbgemmFloat16ConvertAvx2.cc index 5778428fc8..3cc1ad6e28 100644 --- a/src/FbgemmFloat16ConvertAvx2.cc +++ b/src/FbgemmFloat16ConvertAvx2.cc @@ -10,6 +10,7 @@ (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))) #include #endif +#define FBGEMM_EXPORTS #include "fbgemm/FbgemmConvert.h" namespace fbgemm { diff --git a/src/FbgemmFloat16ConvertAvx512.cc b/src/FbgemmFloat16ConvertAvx512.cc index 380eaa2713..5039f88240 100644 --- a/src/FbgemmFloat16ConvertAvx512.cc +++ b/src/FbgemmFloat16ConvertAvx512.cc @@ -10,6 +10,7 @@ (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))) #include #endif +#define FBGEMM_EXPORTS #include "fbgemm/FbgemmConvert.h" namespace fbgemm { From ebbda371a25fe3fd7da33ebb43ed96cad981df1c Mon Sep 17 00:00:00 2001 From: William Wen Date: Mon, 9 Oct 2023 15:33:09 -0700 Subject: [PATCH 69/94] Add autogenerated opcheck tests to deeplearning/fbgemm/fbgemm_gpu/test/sparse_ops_test.py 
(#2050) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2050 Reviewed By: zou3519 Differential Revision: D49736391 fbshipit-source-id: c0f7f66cef89365325726805a30cdfe232721af6 --- .github/scripts/fbgemm_gpu_test.bash | 2 +- fbgemm_gpu/test/failures_dict.json | 377 +++++++++++++++++++++++++++ fbgemm_gpu/test/sparse_ops_test.py | 81 +++++- 3 files changed, 458 insertions(+), 2 deletions(-) create mode 100644 fbgemm_gpu/test/failures_dict.json diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 0b7a99334a..83b9ceefb6 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -99,7 +99,7 @@ run_fbgemm_gpu_tests () { echo "[TEST] Installing pytest ..." # shellcheck disable=SC2086 - print_exec conda install ${env_prefix} -y pytest + print_exec conda install ${env_prefix} -y pytest expecttest echo "[TEST] Checking imports ..." (test_python_import_package "${env_name}" fbgemm_gpu) || return 1 diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json new file mode 100644 index 0000000000..7a7e1d40fa --- /dev/null +++ b/fbgemm_gpu/test/failures_dict.json @@ -0,0 +1,377 @@ +{ + "_description": "This is a dict containing failures for tests autogenerated by generate_opcheck_tests. For more details, please see https://docs.google.com/document/d/1Pj5HRZvdOq3xpFpbEjUZp2hBovhy7Wnxw14m6lF2154/edit", + "_version": 1, + "data": { + "fbgemm::asynchronous_complete_cumsum": {}, + "fbgemm::asynchronous_exclusive_cumsum": {}, + "fbgemm::asynchronous_inclusive_cumsum": {}, + "fbgemm::batch_index_select_dim0": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_batch_index_select_dim0": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_batch_index_select_dim0": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_autograd_registration__test_batch_index_select_dim0": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_batch_index_select_dim0": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::block_bucketize_sparse_features": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_block_bucketize_sparse_features": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_dynamic__test_block_bucketize_sparse_features_long_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_dynamic__test_block_bucketize_sparse_features_with_variable_batch_sizes": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_block_bucketize_sparse_features": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_block_bucketize_sparse_features_long_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_block_bucketize_sparse_features_with_variable_batch_sizes": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_block_bucketize_sparse_features": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_block_bucketize_sparse_features_long_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_block_bucketize_sparse_features_with_variable_batch_sizes": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::bottom_k_per_row": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_bottom_unique_k_per_row": { + "comment": "", + "status": "xfail" + }, + 
"SparseOpsTest.test_aot_dispatch_static__test_bottom_unique_k_per_row": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_bottom_unique_k_per_row": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::bucketize_sparse_features": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_bucketize_sparse_features": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_bucketize_sparse_features": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_bucketize_sparse_features": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::cat_reorder_batched_ad_indices": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_cat_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_cat_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_cat_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::generic_histogram_binning_calibration_by_feature": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_generic_histogram_binning_calibration_by_feature": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_dynamic__test_generic_histogram_binning_calibration_by_feature_cpu_gpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_generic_histogram_binning_calibration_by_feature": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_generic_histogram_binning_calibration_by_feature_cpu_gpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_generic_histogram_binning_calibration_by_feature": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_generic_histogram_binning_calibration_by_feature_cpu_gpu": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::group_index_select_dim0": { + "SparseOpsTest.test_autograd_registration__test_group_index_select_dim0": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_group_index_select_dim0": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::histogram_binning_calibration": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_histogram_binning_calibration": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_histogram_binning_calibration": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_histogram_binning_calibration": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::histogram_binning_calibration_by_feature": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_histogram_binning_calibration_by_feature": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_histogram_binning_calibration_by_feature": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_histogram_binning_calibration_by_feature": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::invert_permute": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_invert_permute": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_invert_permute": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_invert_permute": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::offsets_range": {}, + "fbgemm::pack_segments": {}, + 
"fbgemm::permute102_baddbmm_permute102": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_permute102_baddbmm_permute102": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_permute102_baddbmm_permute102": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_permute102_baddbmm_permute102": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::permute_1D_sparse_data": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_permute_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_permute_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_schema__test_permute_indices": { + "comment": "flaky", + "status": "skip" + } + }, + "fbgemm::permute_2D_sparse_data": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_embeddings": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_indices_with_repeats": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_permute_embeddings": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_permute_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_permute_indices_with_repeats": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_permute_embeddings": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_permute_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_permute_indices_with_repeats": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_schema__test_permute_indices": { + "comment": "flaky", + "status": "skip" + } + }, + "fbgemm::permute_sequence_embeddings": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_embeddings": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_permute_embeddings": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_permute_embeddings": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::reorder_batched_ad_indices": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_reorder_batched_ad_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_dynamic__test_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_reorder_batched_ad_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::reorder_batched_ad_lengths": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_cat_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_dynamic__test_reorder_batched_ad_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_dynamic__test_reorder_batched_ad_indices_cpu": 
{ + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_dynamic__test_reorder_batched_ad_lengths": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_dynamic__test_reorder_batched_ad_lengths_cpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_cat_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_lengths": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_lengths_cpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_cat_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_reorder_batched_ad_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_reorder_batched_ad_lengths": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_reorder_batched_ad_lengths_cpu": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::segment_sum_csr": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_segment_sum_csr": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_segment_sum_csr": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_faketensor__test_segment_sum_csr": { + "comment": "", + "status": "xfail" + } + } + } +} diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index 349bc45c1c..b4c1143157 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -11,6 +11,7 @@ import functools import itertools import logging +import os import random import unittest from itertools import accumulate @@ -19,7 +20,11 @@ import hypothesis.strategies as st import numpy as np import torch -from hypothesis import given, settings, Verbosity +from hypothesis import given, HealthCheck, settings, Verbosity + +from torch._utils_internal import get_file_path_2 +from torch.testing._internal.optests import generate_opcheck_tests + try: # pyre-ignore[21] @@ -33,6 +38,13 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:index_select_ops") from fbgemm_gpu.test.test_utils import gpu_available, gpu_unavailable, skipIfRocm +suppressed_list: List[HealthCheck] = ( + # pyre-fixme[16]: Module `HealthCheck` has no attribute `differing_executors`. + [HealthCheck.differing_executors] + if getattr(HealthCheck, "differing_executors", False) + else [] +) + def unbucketize_indices_value( bucketized_indices: torch.Tensor, @@ -90,6 +102,7 @@ def permute_scripted( class SparseOpsTest(unittest.TestCase): @staticmethod + @settings(suppress_health_check=suppressed_list) def permute_indices_ref_( lengths: torch.Tensor, indices: torch.Tensor, @@ -2387,5 +2400,71 @@ def validate( ) +failures_dict_path: str = get_file_path_2( + "", os.path.dirname(__file__), "failures_dict.json" +) + +# e.g. 
"test_faketensor__test_cumsum": [unittest.expectedFailure] +# Please avoid putting tests here, you should put operator-specific +# skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json +# pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. +additional_decorators: Dict[str, List[Callable]] = { + "test_aot_dispatch_dynamic__test_index_select_dim0": [unittest.skip("hangs")], + "test_aot_dispatch_static__test_index_select_dim0": [unittest.skip("hangs")], + "test_faketensor__test_index_select_dim0": [unittest.skip("hangs")], + "test_autograd_registration__test_index_select_dim0": [unittest.skip("hangs")], + "test_schema__test_index_select_dim0": [unittest.skip("hangs")], + "test_aot_dispatch_dynamic__test_pack_segments": [ + unittest.skip("ASAN heap buffer overflow") + ], + "test_aot_dispatch_static__test_pack_segments": [ + unittest.skip("ASAN heap buffer overflow") + ], + "test_faketensor__test_pack_segments": [unittest.skip("ASAN heap buffer overflow")], + "test_autograd_registration__test_pack_segments": [ + unittest.skip("ASAN heap buffer overflow") + ], + "test_schema__test_pack_segments": [unittest.skip("ASAN heap buffer overflow")], + "test_aot_dispatch_static__test_group_index_select_dim0": [ + unittest.skip("CUDA memory error") + ], + "test_aot_dispatch_dynamic__test_group_index_select_dim0": [ + unittest.skip("CUDA memory error") + ], + "test_aot_dispatch_dynamic__test_pack_segments_smaller_max_len": [ + unittest.skip("RuntimeError: opcheck can only test operators without overloads") + ], + "test_aot_dispatch_static__test_pack_segments_smaller_max_len": [ + unittest.skip("RuntimeError: opcheck can only test operators without overloads") + ], + "test_faketensor__test_pack_segments_smaller_max_len": [ + unittest.skip("RuntimeError: opcheck can only test operators without overloads") + ], + "test_autograd_registration__test_pack_segments_smaller_max_len": [ + unittest.skip("RuntimeError: opcheck can only test operators without overloads") + ], + "test_schema__test_pack_segments_smaller_max_len": [ + unittest.skip("RuntimeError: opcheck can only test operators without overloads") + ], +} + +# only generate tests on nightly pytorch (current release version is 2.1) +if torch.__version__ >= "2.2.*": + generate_opcheck_tests( + SparseOpsTest, + ["fb", "fbgemm"], + failures_dict_path, + # pyre-fixme[6]: For 4th argument expected `List[typing.Callable[..., + # typing.Any]]` but got `Dict[str, List[typing.Callable[..., typing.Any]]]`. + additional_decorators, + [ + "test_schema", + "test_autograd_registration", + "test_faketensor", + "test_aot_dispatch_static", + "test_aot_dispatch_dynamic", + ], + ) + if __name__ == "__main__": unittest.main() From 3146ef5b128d19deceb05b53ac30f8d94372dbd3 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Mon, 9 Oct 2023 20:13:33 -0700 Subject: [PATCH 70/94] Improve perf for L=0 cases (#2046) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2046 **Replace `memset` with vector store for the L=0 case (L = pooling factor)** `memset` causes extra byte writes which results in lower performance. The compiler seems to not be aware that the memory is aligned with the cache line when using `memset`. **Add a short circuit path for writing zeros when every bag in a feature has L=0** TBE v2 does an expensive explicit register-shared memory spill before performing look up. For L=0, these registers can be bypassed because there is no look up to be performed. We bypass the register spill if all Ls are zeros. 
Reviewed By: jasonjk-park Differential Revision: D49605180 fbshipit-source-id: 66d5886a28bb8c75aa446f08ac1b165d6c298de6 --- ...edding_forward_split_kernel_v2_template.cu | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu b/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu index 3aa66c3778..f9c798ad5f 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu @@ -240,8 +240,9 @@ __noinline__ __device__ void process_all_indices_small_Ls( const uint32_t num_offsets = smem[params_offset + SAVED_PARAMS::P_num_offsets]; const uint32_t total_load_D = smem[params_offset + SAVED_PARAMS::P_total_load_D]; // Write zeros to the sample that L = 0 + Vec4StepT<1, emb_t> accumulator; for (uint32_t i = 0; i < num_offsets; ++i) { - memset(output + i * total_load_D, 0, sizeof(output_vec_t)); + accumulator.store(output + i * total_load_D); } } return; @@ -299,7 +300,8 @@ __noinline__ __device__ void process_all_indices_small_Ls( auto * __restrict__ const output = *reinterpret_cast(&smem[params_offset + SAVED_PARAMS::P_outputs]); const auto total_load_D = static_cast(smem[params_offset + SAVED_PARAMS::P_total_load_D]); if (process_d) { - memset(output + write_idx + threadIdx.x, 0, sizeof(output_vec_t)); + Vec4StepT<1, emb_t> accumulator; + accumulator.store(output + write_idx + threadIdx.x); } write_idx += total_load_D; @@ -746,13 +748,34 @@ __global__ void split_embedding_codegen_forward_{{ wdesc }}_v2_kernel( return; } - bool is_small_L; - if (threadIdx.x == 0) { - // Use the small-L optimization if average L <= 8 - is_small_L = (offsets[(t + 1) * B] - offsets[t * B]) <= (static_cast(B) * 8); + const auto total_L = offsets[(t + 1) * B] - offsets[t * B]; + const auto is_zero_total_L = total_L == 0; + + // Short circuit for all zeros + if (is_zero_total_L) { + const uint32_t D_start = D_offsets[t] / VEC_WIDTH; + const uint32_t load_D = (D_offsets[t + 1] / VEC_WIDTH) - D_start; + const uint32_t num_warps_per_row = DIV_ROUND_UP(load_D, kWarpSize); + if (table_warp_id >= num_warps_per_row * B) { + return; + } + const uint32_t load_d = (table_warp_id % num_warps_per_row) * kWarpSize; + if (load_d + threadIdx.x < load_D) { + const uint32_t b = table_warp_id / num_warps_per_row; + const uint32_t total_load_D = D_offsets[T] / VEC_WIDTH; + + output_vec_t* output_ptr = reinterpret_cast(output) + + D_start + b * total_load_D + load_d + threadIdx.x; + + // Write zeros to output + Vec4StepT<1, emb_t> accumulator; + accumulator.store(output_ptr); + } + return; } - is_small_L = shfl_sync(is_small_L, 0); + // Use the small-L optimization if average L <= 8 + const auto is_small_L = total_L <= (static_cast(B) * 8); const uint32_t num_warps_for_small_L = DIV_ROUND_UP(B, NUM_OFFSETS_PER_WARP); // Early exit for small-L to avoid D_offsets reads @@ -879,7 +902,8 @@ __global__ void split_embedding_codegen_forward_{{ wdesc }}_v2_kernel( if (L == 0) { if (load_d + threadIdx.x < load_D) { // Write zeros to output - memset(output_ptr, 0, sizeof(output_vec_t)); + Vec4StepT<1, emb_t> accumulator; + accumulator.store(output_ptr); } } else { From 85a91d545b33cd2777c41c0cf06d926b713efe05 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Mon, 9 Oct 2023 20:26:00 -0700 Subject: [PATCH 71/94] Enable subwarp only for unweighted (#2051) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2051 Disable the subwarp 
optimization for weighted TBE - The cause of the NE issue has to be investigated. - This does not affect the overall E2E performance much since the majority of TBEs in the model is unweighted Reviewed By: jasonjk-park Differential Revision: D49159119 fbshipit-source-id: 805639d94d4ce7b3be8f275db4dfd0ecc95a539a --- .../codegen/embedding_forward_split_kernel_v2_template.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu b/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu index f9c798ad5f..61b941e4d1 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu @@ -940,6 +940,7 @@ __global__ void split_embedding_codegen_forward_{{ wdesc }}_v2_kernel( // Tail warp // STEP_MASK computation assumes STEP = 4 + {% if not weighted %} if (load_D - load_d < kWarpSize) { const auto tail_warp_size = load_D % kWarpSize; if (tail_warp_size <= 8) { @@ -955,6 +956,9 @@ __global__ void split_embedding_codegen_forward_{{ wdesc }}_v2_kernel( else { INVOKE_PROCESS_ALL_INDICES(large_Ls, 32, 0xf) } + {% else %} + INVOKE_PROCESS_ALL_INDICES(large_Ls, 32, 0xf) + {% endif %} #undef INVOKE_PROCESS_ALL_INDICES_HELPER #undef INVOKE_PROCESS_ALL_INDICES From 1e194b70f51db666d2105538b1ec9491ff3f742b Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Tue, 10 Oct 2023 11:22:02 -0700 Subject: [PATCH 72/94] Fix non-contiguous tensor problem in keyed_jagged_index_select_dim1 (#2061) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2061 Before this diff, `keyed_jagged_index_select_dim1` kernels take raw pointers as arguments. This requires the input tensors to be contiguous. However, the `keyed_jagged_index_select_dim1` operator did not make sure that the tensors are contiguous before extracting and passing the raw pointers to the kernels causing the correctness issue. This diff replaces the raw pointer arguments with PyTorch's `PackedTensorAccessor` which handles non-contiguous tensor accesses automatically. For some tensors that their raw pointers are still being used, the operator makes sure that the tensors are contiguous before using them. 
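As a minimal illustration of why the accessor matters (a standalone sketch, not the operator changed in this diff; copy_strided_kernel and copy_strided are hypothetical names), a `PackedTensorAccessor64` carries sizes and strides into the kernel, so a non-contiguous view such as `values[1::2]` is read correctly, whereas indexing off a raw `data_ptr<float>()` would assume unit stride and read the wrong elements:

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>

// out[i] = in[i]; the accessor applies in's stride, so `in` may be a
// non-contiguous view and the reads still land on the right elements.
__global__ void copy_strided_kernel(
    at::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> out,
    const at::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> in) {
  const int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < in.size(0)) {
    out[i] = in[i];
  }
}

// Host-side launch; assumes out is contiguous and out.numel() == in.numel().
void copy_strided(at::Tensor out, at::Tensor in) {
  constexpr int kThreads = 256;
  const int blocks = static_cast<int>((in.numel() + kThreads - 1) / kThreads);
  copy_strided_kernel<<<blocks, kThreads, 0, at::cuda::getCurrentCUDAStream()>>>(
      out.packed_accessor64<float, 1, at::RestrictPtrTraits>(),
      in.packed_accessor64<float, 1, at::RestrictPtrTraits>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

The test change below exercises exactly this case: it allocates the values (and gradient) tensors at twice the needed size and slices every other element to force a non-contiguous layout.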
Reviewed By: jasonjk-park, venkatrsrinivas Differential Revision: D49939713 fbshipit-source-id: e941379eeda65fc998be6f506b0e467fe74a48a5 --- .../keyed_jagged_index_select_dim1.cu | 174 +++++++++++------- fbgemm_gpu/test/jagged_tensor_ops_test.py | 28 ++- 2 files changed, 132 insertions(+), 70 deletions(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu b/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu index 89948c418a..c588e4ef3a 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu @@ -19,13 +19,12 @@ template < int NUM_THREADS_PER_BLOCK, int MAX_ENTRIES_PER_BLOCK> __global__ void index_select_scalar_cumsum_kernel( - scalar_t* output, - acc_t* output_cumsum, - const scalar_t* __restrict__ input, - const index_t* __restrict__ indices, + at::PackedTensorAccessor32 output, + at::PackedTensorAccessor32 output_cumsum, + const at::PackedTensorAccessor32 input, + const at::PackedTensorAccessor32 indices, const int num_batches, const int input_batch_size, - const int output_batch_size, const int last_block_num_entries, int* block_flags, acc_t* block_sums) { @@ -33,6 +32,7 @@ __global__ void index_select_scalar_cumsum_kernel( __shared__ typename BlockScan::TempStorage bs_temp_storage; __shared__ acc_t smem[MAX_ENTRIES_PER_BLOCK]; const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const int output_batch_size = indices.size(0); const int bid = tid / output_batch_size; const int num_entries_per_block = blockIdx.x == gridDim.x - 1 ? last_block_num_entries @@ -73,26 +73,30 @@ template < typename weight_t, bool has_weights> __global__ void keyed_jagged_index_select_dim1_kernel( - scalar_t* output, - weight_t* output_weights, - const scalar_t* input, - const weight_t* weights, - const offset_t* input_offsets, - const index_t* indices, - const offset_t* output_offsets, + at::PackedTensorAccessor64 output, + at::PackedTensorAccessor64 + output_weights, + const at::PackedTensorAccessor64 input, + const at::PackedTensorAccessor64 + weights, + const at::PackedTensorAccessor32 + input_offsets, + const at::PackedTensorAccessor32 indices, + const at::PackedTensorAccessor32 + output_offsets, const int num_batches, - const int input_batch_size, - const int output_batch_size, - const int64_t num_outputs) { + const int input_batch_size) { const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const int output_batch_size = indices.size(0); + const int64_t num_outputs = output.size(0); if (tid < num_outputs) { // Each thread searches index position int index_pos; binary_search_range( &index_pos, - output_offsets, - (offset_t)tid, + &output_offsets[0], + static_cast(tid), num_batches * output_batch_size); const offset_t rel_index = @@ -104,7 +108,7 @@ __global__ void keyed_jagged_index_select_dim1_kernel( const offset_t input_offset = (index == 0 && bid == 0 ? 
0 - : input_offsets[bid * input_batch_size + index - 1]) + + : input_offsets[bid * input_batch_size + index]) + rel_index; // Store data @@ -117,24 +121,26 @@ __global__ void keyed_jagged_index_select_dim1_kernel( template __global__ void keyed_jagged_index_add_dim1_kernel( - scalar_t* output, - const scalar_t* input, - const offset_t* input_offsets, - const index_t* indices, - const offset_t* output_offsets, + at::PackedTensorAccessor64 output, + const at::PackedTensorAccessor64 input, + const at::PackedTensorAccessor32 + input_offsets, + const at::PackedTensorAccessor32 indices, + const at::PackedTensorAccessor32 + output_offsets, const int num_batches, - const int input_batch_size, - const int output_batch_size, - const int64_t num_inputs) { + const int output_batch_size) { const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const int input_batch_size = indices.size(0); + const int64_t num_inputs = input.size(0); if (tid < num_inputs) { // Each thread searches index position int index_pos; binary_search_range( &index_pos, - input_offsets, - (offset_t)tid, + &input_offsets[0], + static_cast(tid), num_batches * input_batch_size); const offset_t rel_index = @@ -146,7 +152,7 @@ __global__ void keyed_jagged_index_add_dim1_kernel( const offset_t output_offset = (index == 0 && bid == 0 ? 0 - : output_offsets[bid * output_batch_size + index - 1]) + + : output_offsets[bid * output_batch_size + index]) + rel_index; // Store data @@ -230,13 +236,24 @@ class KeyedJaggedIndexSelectDim1GPUOp MAX_CUMSUM_ENTRIES_PER_BLOCK, 0, at::cuda::getCurrentCUDAStream()>>>( - output_lengths.data_ptr(), - output_offsets.data_ptr(), - lengths.data_ptr(), - indices.data_ptr(), + output_lengths.packed_accessor32< + length_t, + 1, + at::RestrictPtrTraits>(), + output_offsets.packed_accessor32< + offset_t, + 1, + at::RestrictPtrTraits>(), + lengths.packed_accessor32< + length_t, + 1, + at::RestrictPtrTraits>(), + indices.packed_accessor32< + index_t, + 1, + at::RestrictPtrTraits>(), num_batches, batch_size, - indices.numel(), num_output_lengths - MAX_CUMSUM_ENTRIES_PER_BLOCK * (grid_size - 1), @@ -259,27 +276,32 @@ class KeyedJaggedIndexSelectDim1GPUOp } grid_size = cuda_calc_xblock_count(num_outputs, kMaxThreads); + // output_offsets has to be contiguous because it is passed to + // binary_search_range which takes raw pointers as arguments + const auto output_offsets_contig = output_offsets.expect_contiguous(); + if (grid_size != 0) { -#define LAUNCH_KERNEL(WEIGHTED, WEIGHT_TYPE, OUTPUT_WEIGHTS, WEIGHTS) \ - { \ - keyed_jagged_index_select_dim1_kernel< \ - value_t, \ - index_t, \ - offset_t, \ - WEIGHT_TYPE, \ - WEIGHTED> \ - <<>>( \ - output.data_ptr(), \ - OUTPUT_WEIGHTS, \ - values.data_ptr(), \ - WEIGHTS, \ - offsets.data_ptr() + 1, \ - indices.data_ptr(), \ - output_offsets.data_ptr(), \ - num_batches, \ - batch_size, \ - indices.numel(), \ - num_outputs); \ +#define LAUNCH_KERNEL(WEIGHTED, WEIGHT_TYPE, OUTPUT_WEIGHTS, WEIGHTS) \ + { \ + keyed_jagged_index_select_dim1_kernel< \ + value_t, \ + index_t, \ + offset_t, \ + WEIGHT_TYPE, \ + WEIGHTED> \ + <<>>( \ + output.packed_accessor64(), \ + OUTPUT_WEIGHTS \ + .packed_accessor64(), \ + values.packed_accessor64(), \ + WEIGHTS \ + .packed_accessor64(), \ + offsets.packed_accessor32(), \ + indices.packed_accessor32(), \ + output_offsets_contig \ + ->packed_accessor32(), \ + num_batches, \ + batch_size); \ } AT_DISPATCH_ALL_TYPES_AND2( at::ScalarType::Half, @@ -306,13 +328,16 @@ class KeyedJaggedIndexSelectDim1GPUOp LAUNCH_KERNEL( true, weight_t, - 
output_weights.data_ptr(), - weights.value().data_ptr()) + output_weights, + weights.value()) }); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - LAUNCH_KERNEL(false, scalar_t, nullptr, nullptr) + // has_weights = false, passing output and input as + // dummy tensors for weights + LAUNCH_KERNEL(false, scalar_t, output, values) + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); }); @@ -360,6 +385,9 @@ class KeyedJaggedIndexSelectDim1GPUOp Tensor grad_input = at::zeros({num_outputs}, grad.options()); auto grid_size = cuda_calc_xblock_count(grad.numel(), kMaxThreads); + // grad_offsetshas to be contiguous because it is passed to + // binary_search_range which takes raw pointers as arguments + const auto grad_offsets_contig = grad_offsets.expect_contiguous(); if (grid_size != 0) { AT_DISPATCH_ALL_TYPES_AND2( @@ -382,16 +410,28 @@ class KeyedJaggedIndexSelectDim1GPUOp kMaxThreads, 0, at::cuda::getCurrentCUDAStream()>>>( - grad_input.data_ptr(), - grad.data_ptr(), - grad_offsets.data_ptr(), - indices.data_ptr(), - output_offsets.data_ptr() + - 1, // shift it to make it inclusive cumsum + grad_input.packed_accessor64< + scalar_t, + 1, + at::RestrictPtrTraits>(), + grad.packed_accessor64< + scalar_t, + 1, + at::RestrictPtrTraits>(), + grad_offsets_contig->packed_accessor32< + offset_t, + 1, + at::RestrictPtrTraits>(), + indices.packed_accessor32< + index_t, + 1, + at::RestrictPtrTraits>(), + output_offsets.packed_accessor32< + offset_t, + 1, + at::RestrictPtrTraits>(), num_batches, - indices.numel(), - output_batch_size, - grad.numel()); + output_batch_size); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index 1787810554..e202268f67 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -2118,6 +2118,7 @@ def test_masked_select_jagged_1d( ] # Disable torch.bfloat16 due to large error bound ), has_weights=st.booleans(), + check_non_contiguous=st.booleans(), ) @settings(max_examples=20, deadline=None) def test_keyed_jagged_index_select_dim1( @@ -2129,6 +2130,7 @@ def test_keyed_jagged_index_select_dim1( index_dtype: torch.dtype, jagged_tensor_dtype: torch.dtype, has_weights: bool, + check_non_contiguous: bool, ) -> None: is_float = jagged_tensor_dtype in [torch.float, torch.half, torch.bfloat16] lengths = torch.randint( @@ -2148,20 +2150,31 @@ def test_keyed_jagged_index_select_dim1( dtype=index_dtype, device="cuda", ) + + # If check_non_contiguous=True, create a tensor that is twice as big + # and then select only odd indices to make it non contiguous + values_numel = int(offsets[-1].item()) + values_numel = values_numel * 2 if check_non_contiguous else values_numel + if is_float: values = torch.rand( - int(offsets[-1].item()), + values_numel, dtype=jagged_tensor_dtype, device="cuda", ) else: values = torch.randint( 2**16, - (int(offsets[-1].item()),), + (values_numel,), dtype=jagged_tensor_dtype, device="cuda", ) values_ref = values.detach().clone() + + if check_non_contiguous: + values = values[1::2] + values_ref = values_ref[1::2] + if has_weights: weights = torch.rand( int(offsets[-1].item()), @@ -2215,9 +2228,18 @@ def test_keyed_jagged_index_select_dim1( if not is_float: return - grad = torch.rand_like(output) + # If check_non_contiguous=True, create a tensor that is twice as big + # and then select only odd indices to make it non contiguous + grad_numel = output.numel() + grad_numel = grad_numel * 2 if check_non_contiguous 
else grad_numel + + grad = torch.rand(grad_numel, dtype=output.dtype, device=output.device) grad_ref = grad.detach().clone() + if check_non_contiguous: + grad = grad[1::2] + grad_ref = grad_ref[1::2] + output.backward(grad) output_ref.backward(grad_ref) From 1605d82da64770a441c7b256deab559b680d8ecf Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 11 Oct 2023 02:35:50 -0700 Subject: [PATCH 73/94] Split up split_embeddings_cache_cuda.cu (#1881) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1881 - Split up split_embeddings_cache_cuda.cu Reviewed By: sryap, spcyppt Differential Revision: D47491160 fbshipit-source-id: 9c025f06dbc003fe8734d64ae5a102b1a37fd5a5 --- fbgemm_gpu/CMakeLists.txt | 8 +- .../split_embeddings_cache_cuda.cuh | 13 + .../src/split_embeddings_cache/common.cuh | 90 + .../src/split_embeddings_cache/lfu_cache.cu | 749 ++++ .../linearize_cache_indices.cu | 282 ++ .../split_embeddings_cache/lru_cache_find.cu | 242 ++ .../lru_cache_populate.cu | 371 ++ .../lru_cache_populate_byte.cu | 702 ++++ .../src/split_embeddings_cache/lxu_cache.cu | 527 +++ .../reset_weight_momentum.cu | 323 ++ fbgemm_gpu/src/split_embeddings_cache_cuda.cu | 3201 ----------------- 11 files changed, 3306 insertions(+), 3202 deletions(-) create mode 100644 fbgemm_gpu/src/split_embeddings_cache/common.cuh create mode 100644 fbgemm_gpu/src/split_embeddings_cache/lfu_cache.cu create mode 100644 fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu create mode 100644 fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu create mode 100644 fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu create mode 100644 fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu create mode 100644 fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu create mode 100644 fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu delete mode 100644 fbgemm_gpu/src/split_embeddings_cache_cuda.cu diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 04c2f051b9..39fb050114 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -675,7 +675,13 @@ if(NOT FBGEMM_CPU_ONLY) src/sparse_ops/sparse_reorder_batched_ad.cu src/sparse_ops/sparse_segment_sum_csr.cu src/sparse_ops/sparse_zipf.cu - src/split_embeddings_cache_cuda.cu + src/split_embeddings_cache/lfu_cache.cu + src/split_embeddings_cache/lru_cache_find.cu + src/split_embeddings_cache/lru_cache_populate.cu + src/split_embeddings_cache/lru_cache_populate_byte.cu + src/split_embeddings_cache/lxu_cache.cu + src/split_embeddings_cache/linearize_cache_indices.cu + src/split_embeddings_cache/reset_weight_momentum.cu src/split_embeddings_utils.cu) set_source_files_properties(${fbgemm_gpu_sources_static_gpu} diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh index be71c24baf..800f238f5e 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh @@ -13,6 +13,19 @@ ///@defgroup table-batched-embed-cuda CUDA Operators /// The following are CUDA Operators +namespace fbgemm_gpu { + +enum uvm_cache_stats_index { + num_calls = 0, + num_requested_indices = 1, + num_unique_indices = 2, + num_unique_misses = 3, + num_conflict_unique_misses = 4, + num_conflict_misses = 5, +}; + +} // namespace fbgemm_gpu + ///@ingroup table-batched-embed-cuda /// Deduplicate indices. 
std::tuple> diff --git a/fbgemm_gpu/src/split_embeddings_cache/common.cuh b/fbgemm_gpu/src/split_embeddings_cache/common.cuh new file mode 100644 index 0000000000..d6e1eb6be3 --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/common.cuh @@ -0,0 +1,90 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// clang-format off +#include "fbgemm_gpu/cub_namespace_prefix.cuh" +#include +#include +#include +#include +#include "fbgemm_gpu/cub_namespace_postfix.cuh" +// clang-format on + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fbgemm_gpu/dispatch_macros.h" +#include "fbgemm_gpu/embedding_common.h" +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh" +#include "fbgemm_gpu/fbgemm_tensor_accessor.h" +#include "fbgemm_gpu/ops_utils.h" +#include "fbgemm_gpu/sparse_ops_utils.h" +#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh" +#include "fbgemm_gpu/split_embeddings_utils.cuh" + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +namespace { + +constexpr size_t kCacheMaxThreads = 512; +constexpr int32_t kCacheLocationMissing = -1; +constexpr int64_t kCacheStateInvalid = -1; + +// // TODO: do we care about 64-bit indices? Currently we just ignore. +// __host__ DEVICE_INLINE uint32_t cache_slot(int32_t h_in, int32_t C) { +// // MurmorHash3 32-bit mixing function. +// uint32_t h = (uint32_t)h_in; +// h ^= h >> 16; +// h *= 0x85ebca6b; +// h ^= h >> 13; +// h *= 0xc2b2ae35; +// h ^= h >> 16; +// // +// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ +// return ((uint64_t)h * (uint64_t)C) >> 32; +// } + +__host__ DEVICE_INLINE uint32_t +cache_slot(const int64_t h_in, const int32_t C) { + // MurmurHash3 64-bit mixing function. + uint64_t h = (uint64_t)h_in; + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + h ^= h >> 33; + + return h % (uint32_t)C; +} + +// Experiments showed that performance of lru/lxu_cache_find_uncached_kernel is +// not sensitive to grid size as long as the number thread blocks per SM is not +// too small nor too big. +constexpr int MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS = 16; + +int get_max_thread_blocks_for_cache_kernels_() { + return get_device_sm_cnt_() * MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS; +} + +} // namespace diff --git a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache.cu b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache.cu new file mode 100644 index 0000000000..f2080dc5d3 --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache.cu @@ -0,0 +1,749 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "common.cuh" + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +namespace { + +constexpr int32_t kCacheSetBits = 24; +constexpr int32_t kLFUCounterBits = 40; +static_assert(kCacheSetBits + kLFUCounterBits == 8 * sizeof(int64_t), ""); + +template +__global__ __launch_bounds__(kMaxThreads) void lfu_update_counts_kernel( + const pta::PackedTensorAccessor32 + unique_indices, + const int32_t* __restrict__ N_unique, + const pta::PackedTensorAccessor32 + unique_indices_count, + pta::PackedTensorAccessor64 lfu_state) { + CUDA_KERNEL_LOOP(n, *N_unique) { + const auto idx = unique_indices[n]; + lfu_state[idx] += unique_indices_count[n]; + } +} + +void lfu_update_counts_cuda( + Tensor unique_indices, + Tensor unique_indices_length, + Tensor unique_indices_count, + Tensor lfu_state) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + unique_indices, unique_indices_length, unique_indices_count, lfu_state); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(unique_indices.get_device()); + + const int32_t N = unique_indices.size(0); + AT_DISPATCH_INDEX_TYPES( + unique_indices.scalar_type(), "lfu_update_counts_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lfu_update_counts_kernel"; +#endif + lfu_update_counts_kernel<<< + std::min( + div_round_up(N, kMaxThreads), + get_max_thread_blocks_for_cache_kernels_()), + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), + unique_indices_length.data_ptr(), + MAKE_PTA_WITH_NAME(func_name, unique_indices_count, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +} + +template +__global__ __launch_bounds__(kMaxThreads) void lfu_cache_find_uncached_kernel( + const pta::PackedTensorAccessor32 + unique_indices, + const int32_t* __restrict__ N_unique, + int64_t max_indices, + const pta::PackedTensorAccessor32 + lxu_cache_state, + uint64_t* __restrict__ cache_sets, + const pta::PackedTensorAccessor64 + lfu_state) { + const int32_t C = lxu_cache_state.size(0); + + for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; + n += gridDim.x * blockDim.y) { + const int64_t idx = unique_indices[n]; + if (idx == max_indices) { + // cache_sets are initialized with sentinel values in + // lfu_cache_find_uncached_cuda + continue; + } + const uint32_t cache_set = cache_slot(idx, C); + + const auto slot = threadIdx.x; + const bool found = ::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx; + +#ifdef __HIP_PLATFORM_HCC__ + if (!__any_sync(0xFFFFFFFFFFFFFFFF, found)) { +#else + if (!__any_sync(0xFFFFFFFF, found)) { +#endif + if (threadIdx.x == 0) { + // sort so the highest LFUs come first in the segment. 
+ // assume lfu_state[idx] <= 2^40 - 1 and cache_set < 2^24 -1 + cache_sets[n] = + ((static_cast(cache_set) << kLFUCounterBits)) | + ((static_cast(1) << kLFUCounterBits) - 1 - + lfu_state[idx]); + } + } + } +} + +std::pair lfu_cache_find_uncached_cuda( + Tensor unique_indices, + Tensor unique_indices_length, + int64_t max_indices, + Tensor lxu_cache_state, + Tensor lfu_state) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + unique_indices, unique_indices_length, lxu_cache_state, lfu_state); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(unique_indices.get_device()); + + auto cache_sets = full_like( + unique_indices, + static_cast( + static_cast(lxu_cache_state.size(0)) << kLFUCounterBits), + unique_indices.options().dtype(at::kLong)); + const int32_t N = unique_indices.numel(); + auto sorted_cache_sets = empty_like(cache_sets); + auto cache_set_sorted_unique_indices = empty_like(unique_indices); + + AT_DISPATCH_INDEX_TYPES( + unique_indices.scalar_type(), "lfu_cache_find_uncached_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lfu_cache_find_uncached_kernel"; +#endif + // Find uncached indices + lfu_cache_find_uncached_kernel<<< + std::min( + div_round_up(N, kMaxThreads / kWarpSize), + get_max_thread_blocks_for_cache_kernels_()), + dim3(kWarpSize, kMaxThreads / kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), + unique_indices_length.data_ptr(), + max_indices, + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + (uint64_t*)cache_sets.data_ptr(), + MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + // Sort the cache sets and ids + size_t temp_storage_bytes = 0; + AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( + nullptr, + temp_storage_bytes, + (uint64_t*)cache_sets.data_ptr(), + (uint64_t*)sorted_cache_sets.data_ptr(), + unique_indices.data_ptr(), + cache_set_sorted_unique_indices.data_ptr(), + N, + 0, + int(log2(float(lxu_cache_state.size(0) + 1)) + 1) + kLFUCounterBits, + at::cuda::getCurrentCUDAStream(), + false)); + auto temp_storage = at::empty( + {static_cast(temp_storage_bytes)}, + unique_indices.options().dtype(at::kByte)); + AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( + temp_storage.data_ptr(), + temp_storage_bytes, + (uint64_t*)cache_sets.data_ptr(), + (uint64_t*)sorted_cache_sets.data_ptr(), + unique_indices.data_ptr(), + cache_set_sorted_unique_indices.data_ptr(), + N, + 0, + int(log2(float(lxu_cache_state.size(0) + 1)) + 1) + kLFUCounterBits, + at::cuda::getCurrentCUDAStream(), + false)); + }); + return {sorted_cache_sets, cache_set_sorted_unique_indices}; +} + +template +__global__ __launch_bounds__(kCacheMaxThreads) void lfu_cache_insert_kernel( + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 + cache_hash_size_cumsum, + const pta::PackedTensorAccessor64 + cache_index_table_map, + const pta::PackedTensorAccessor32 + weights_offsets, + const pta::PackedTensorAccessor32 + D_offsets, + const uint64_t* __restrict__ sorted_cache_sets, + const pta::PackedTensorAccessor32 + cache_set_sorted_indices, + const int32_t* __restrict__ N_unique, + pta::PackedTensorAccessor32 + lxu_cache_state, + pta::PackedTensorAccessor64 + lxu_cache_weights, + const pta::PackedTensorAccessor64 + lfu_state, + bool stochastic_rounding, + at::PhiloxCudaState stochastic_rounding_philox_args) { + const int32_t C = lxu_cache_state.size(0); + for (int32_t n = 
blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; + n += gridDim.x * blockDim.y) { + // check if this warp is responsible for this whole segment. + const bool segment_start = + (n == 0 || + (sorted_cache_sets[n - 1] >> kLFUCounterBits) != + (sorted_cache_sets[n] >> kLFUCounterBits)); + + if (!segment_start) { + // don't have *warp* divergence since we launch full warps in blockDim.x, + // so we can just exit this warp entirely. + continue; + } + const uint32_t cache_set = (sorted_cache_sets[n] >> kLFUCounterBits); + if (cache_set == C) { + // ignore the already-existing elements + continue; + } + + int32_t SL = 1; + while (n + SL < *N_unique && + (sorted_cache_sets[n + SL] >> kLFUCounterBits) == cache_set) { + SL += 1; + } + + // now, we need to insert the (unique!) values in indices[n:n + SL] into + // our slots. + const int32_t slot = threadIdx.x; + const int64_t current_idx = lxu_cache_state[cache_set][slot]; + const int64_t current_lfu_cost = + (current_idx != static_cast(kCacheStateInvalid)) + ? lfu_state[current_idx] + : -1; + int64_t costs[1] = {current_lfu_cost}; + int32_t slots[1] = {slot}; + + BitonicSort>::sort(costs, slots); + const int32_t sorted_slot = slots[0]; + const int64_t sorted_lfu_cost = costs[0]; + + for (int32_t l = 0; l < min(SL, kWarpSize); ++l) { + const int32_t insert_slot = shfl_sync(sorted_slot, l); + const int64_t insert_current_lfu_cost = shfl_sync(sorted_lfu_cost, l); + const int64_t insert_idx = cache_set_sorted_indices[n + l]; + const int64_t insert_lfu_cost = lfu_state[insert_idx]; + + if (insert_current_lfu_cost > insert_lfu_cost) { + // don't insert. + // all subsequent `current_lfu_cost` values are greater, and all + // subsequent `insert_lfu_cost` values are smaller, so we can exit + // early here. + break; + } + const int32_t t_insert = cache_index_table_map[insert_idx]; + const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; + const int64_t weights_offset_insert = weights_offsets[t_insert]; + const int32_t D_start_insert = D_offsets[t_insert]; + const int32_t D_end_insert = D_offsets[t_insert + 1]; + const int32_t D_insert = D_end_insert - D_start_insert; + + // not empty + if (insert_current_lfu_cost != -1) { + // ensure that threadIdx.x is the only thread reading/writing to + // lxu_cache_state + int64_t current_idx = + threadIdx.x == 0 ? 
lxu_cache_state[cache_set][insert_slot] : 0; + current_idx = shfl_sync(current_idx, 0); + const int32_t t_current = cache_index_table_map[current_idx]; + const int64_t idx_current = + current_idx - cache_hash_size_cumsum[t_current]; + const int64_t weights_offset_current = weights_offsets[t_current]; + const int32_t D_start_current = D_offsets[t_current]; + const int32_t D_end_current = D_offsets[t_current + 1]; + const int32_t D_current = D_end_current - D_start_current; + + int32_t D_emb = D_current; + if constexpr (std::is_same_v) { + D_emb += kINT8QparamsBytes; + } + auto weight_row = WeightRow( + &weights[weights_offset_current + idx_current * D_emb + 0], + &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0], + D_current, + nullptr); + + weight_row.set_stochastic_rounding( + stochastic_rounding, + stochastic_rounding_philox_args, + (blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + + threadIdx.x) * + kWarpSize + + l); + + weight_row.warp_evict(D_current, blockDim.x, threadIdx.x); + } + + // insert into cache + int32_t D_emb = D_insert; + if constexpr (std::is_same_v) { + D_emb += kINT8QparamsBytes; + } + + auto weight_row_cache = WeightRow( + &weights[weights_offset_insert + idx_insert * D_emb + 0], + &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0], + D_insert, + nullptr); + + auto weight_row_emb = WeightRow( + &weights[weights_offset_insert + idx_insert * D_emb + 0], + nullptr, + D_insert, + nullptr); + + weight_row_emb.warp_copy_to( + weight_row_cache, D_insert, blockDim.x, threadIdx.x); + + if (threadIdx.x == 0) { + lxu_cache_state[cache_set][insert_slot] = insert_idx; + } + } + } +} + +void lfu_cache_insert_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor D_offsets, + Tensor sorted_cache_sets, + Tensor cache_set_sorted_unique_indices, + Tensor unique_indices_length, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + Tensor lfu_state, + bool stochastic_rounding) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + D_offsets, + sorted_cache_sets, + cache_set_sorted_unique_indices, + unique_indices_length, + lxu_cache_state, + lxu_cache_weights, + lfu_state); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + const int32_t N = cache_set_sorted_unique_indices.numel(); + + DISPATCH_EMB_CACHE_TYPES( + weights.scalar_type(), + lxu_cache_weights.scalar_type(), + "lfu_cache_insert_kernel_2", + ([&] { + at::PhiloxCudaState rng_engine_inputs; + if (stochastic_rounding && !std::is_same::value) { + auto gen = at::cuda::detail::getDefaultCUDAGenerator(); + std::lock_guard lock(gen.mutex()); + rng_engine_inputs = at::check_generator(gen) + ->philox_cuda_state(4); + } + +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lfu_cache_insert_kernel"; +#endif + + lfu_cache_insert_kernel + <<>>( + MAKE_PTA_WITH_NAME(func_name, weights, emb_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), + (uint64_t*)sorted_cache_sets.data_ptr(), + MAKE_PTA_WITH_NAME( + func_name, cache_set_sorted_unique_indices, int64_t, 1, 32), + unique_indices_length.data_ptr(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + 
MAKE_PTA_WITH_NAME( + func_name, lxu_cache_weights, cache_t, 2, 64), + MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64), + stochastic_rounding, + rng_engine_inputs); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + })); +} + +} // namespace + +DLL_PUBLIC void lfu_cache_populate_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor D_offsets, + Tensor linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + Tensor lfu_state, + bool stochastic_rounding) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + D_offsets, + linear_cache_indices, + lxu_cache_state, + lxu_cache_weights, + lfu_state); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + TORCH_CHECK( + linear_cache_indices.numel() < std::numeric_limits::max()); + if (linear_cache_indices.numel() == 0) { + // nothing to do + return; + } + + // get unqiue indices + Tensor unique_indices; + Tensor unique_indices_length; + c10::optional unique_indices_count; + std::tie(unique_indices, unique_indices_length, unique_indices_count) = + get_unique_indices_cuda( + linear_cache_indices, total_cache_hash_size, true); + + // update lfu counts + lfu_update_counts_cuda( + unique_indices, unique_indices_length, *unique_indices_count, lfu_state); + + // find uncached indices + auto cache_sets_and_unique_indices = lfu_cache_find_uncached_cuda( + unique_indices, + unique_indices_length, + total_cache_hash_size, + lxu_cache_state, + lfu_state); + const auto sorted_cache_sets = cache_sets_and_unique_indices.first; + const auto cache_set_sorted_unique_indices = + cache_sets_and_unique_indices.second; + + // insert caching weights + lfu_cache_insert_cuda( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + D_offsets, + sorted_cache_sets, + cache_set_sorted_unique_indices, + unique_indices_length, + lxu_cache_state, + lxu_cache_weights, + lfu_state, + stochastic_rounding); +} + +namespace { + +// In `lfu_cache_insert_kernel`, we use `emb_t` and `cache_t` for the +// high-precision cache implementation, where we can have {FP32, FP16, INT8} +// for embedding precision (data types), and {FP32, FP16} for cache precision +// (data types). +// +// In `lfu_cache_insert_byte_kernel`, we only use uint8_t for the both embedding +// and cache data type (conforming to the inference TBE kernel logics). +// - We pass in `weights_tys` to denote the real data types for the embeddings: +// {FP32, FP16, INT8, INT4, INT2}. For example, FP32 is 4 byte element in the +// byte tensor, and INT4 is half byte element in the byte tensor. +// - We only assume that the embedding and cache have the same precisions (the +// real "precision" is determined by `weights_tys` although the data types are +// uint8_t only). Basically no "high-precision cache" support for now. +// - The insert/evict of embedding row from the cache are done in a byte-by-byte +// manner. 
+template +__global__ +__launch_bounds__(kCacheMaxThreads) void lfu_cache_insert_byte_kernel( + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 + cache_hash_size_cumsum, + const pta::PackedTensorAccessor64 + cache_index_table_map, + const pta::PackedTensorAccessor32 + weights_offsets, + const pta::PackedTensorAccessor32 + weights_tys, + const pta::PackedTensorAccessor32 + D_offsets, + const uint64_t* __restrict__ sorted_cache_sets, + const pta::PackedTensorAccessor32 + cache_set_sorted_indices, + const int32_t* __restrict__ N_unique, + pta::PackedTensorAccessor32 + lxu_cache_state, + pta::PackedTensorAccessor64 + lxu_cache_weights, + const pta::PackedTensorAccessor64 + lfu_state, + const int64_t row_alignment) { + const int32_t C = lxu_cache_state.size(0); + for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; + n += gridDim.x * blockDim.y) { + // check if this warp is responsible for this whole segment. + const bool segment_start = + (n == 0 || + (sorted_cache_sets[n - 1] >> kLFUCounterBits) != + (sorted_cache_sets[n] >> kLFUCounterBits)); + + if (!segment_start) { + // don't have *warp* divergence since we launch full warps in blockDim.x, + // so we can just exit this warp entirely. + continue; + } + const uint32_t cache_set = (sorted_cache_sets[n] >> kLFUCounterBits); + if (cache_set == C) { + // ignore the already-existing elements + continue; + } + + int32_t SL = 1; + while (n + SL < *N_unique && + (sorted_cache_sets[n + SL] >> kLFUCounterBits) == cache_set) { + SL += 1; + } + + // now, we need to insert the (unique!) values in indices[n:n + SL] into + // our slots. + const int32_t slot = threadIdx.x; + const int64_t current_idx = lxu_cache_state[cache_set][slot]; + const int64_t current_lfu_cost = + (current_idx != static_cast(kCacheStateInvalid)) + ? lfu_state[current_idx] + : -1; + int64_t costs[1] = {current_lfu_cost}; + int32_t slots[1] = {slot}; + + BitonicSort>::sort(costs, slots); + const int32_t sorted_slot = slots[0]; + const int64_t sorted_lfu_cost = costs[0]; + + for (int32_t l = 0; l < min(SL, kWarpSize); ++l) { + const int32_t insert_slot = shfl_sync(sorted_slot, l); + const int64_t insert_current_lfu_cost = shfl_sync(sorted_lfu_cost, l); + const index_t insert_idx = cache_set_sorted_indices[n + l]; + const int64_t insert_lfu_cost = lfu_state[insert_idx]; + + if (insert_current_lfu_cost > insert_lfu_cost) { + // don't insert. + // all subsequent `current_lfu_cost` values are greater, and all + // subsequent `insert_lfu_cost` values are smaller, so we can exit + // early here. + break; + } + const int32_t t_insert = cache_index_table_map[insert_idx]; + const SparseType weight_ty_insert = + static_cast(weights_tys[t_insert]); + const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; + const int64_t weights_offset_insert = weights_offsets[t_insert]; + const int32_t D_start_insert = D_offsets[t_insert]; + const int32_t D_end_insert = D_offsets[t_insert + 1]; + const int32_t D_insert = D_end_insert - D_start_insert; + + const int32_t D_insert_bytes = nbit::padded_row_size_in_bytes( + D_insert, weight_ty_insert, row_alignment); + + // insert into cache. Note that nbit::padded_row_size_in_bytes pad each + // row with row_alignment (16 bytes on GPUs) So each row will be multiple + // of 16 bytes (uint4 = 32bit x 4 = 16 bytes). 
+ auto row = reinterpret_cast( + &weights[weights_offset_insert + idx_insert * D_insert_bytes + 0]); + auto cache_row = reinterpret_cast( + &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0]); + for (int32_t d = threadIdx.x; d * sizeof(uint4) < D_insert_bytes; + d += blockDim.x) { + cache_row[d] = row[d]; + } + if (threadIdx.x == 0) { + lxu_cache_state[cache_set][insert_slot] = insert_idx; + } + } + } +} + +void lfu_cache_insert_byte_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor sorted_cache_sets, + Tensor cache_set_sorted_unique_indices, + Tensor unique_indices_length, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + Tensor lfu_state, + int64_t row_alignment) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + weights_tys, + D_offsets, + sorted_cache_sets, + cache_set_sorted_unique_indices, + unique_indices_length, + lxu_cache_state, + lxu_cache_weights, + lfu_state); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + const int32_t N = cache_set_sorted_unique_indices.numel(); + + AT_DISPATCH_INDEX_TYPES( + cache_set_sorted_unique_indices.scalar_type(), + "lfu_cache_insert_byte_cuda", + [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lfu_cache_insert_byte_kernel"; +#endif + lfu_cache_insert_byte_kernel<<< + std::min( + div_round_up(N, kCacheMaxThreads / kWarpSize), + get_max_thread_blocks_for_cache_kernels_()), + dim3(kWarpSize, kCacheMaxThreads / kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, weights, uint8_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, weights_tys, uint8_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), + (uint64_t*)sorted_cache_sets.data_ptr(), + MAKE_PTA_WITH_NAME( + func_name, cache_set_sorted_unique_indices, index_t, 1, 32), + unique_indices_length.data_ptr(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_weights, uint8_t, 2, 64), + MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64), + row_alignment); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +} + +} // namespace + +DLL_PUBLIC void lfu_cache_populate_byte_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + Tensor lfu_state, + int64_t row_alignment) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + weights_tys, + D_offsets, + linear_cache_indices, + lxu_cache_state, + lxu_cache_weights, + lfu_state); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + TORCH_CHECK( + linear_cache_indices.numel() < std::numeric_limits::max()); + if (linear_cache_indices.numel() == 0) { + // nothing to do + return; + } + + // get unqiue indices + Tensor unique_indices; + Tensor unique_indices_length; + c10::optional unique_indices_count; + std::tie(unique_indices, unique_indices_length, 
unique_indices_count) = + get_unique_indices_cuda( + linear_cache_indices, total_cache_hash_size, true); + + // update lfu counts + lfu_update_counts_cuda( + unique_indices, unique_indices_length, *unique_indices_count, lfu_state); + + // find uncached indices + const auto cache_sets_and_unique_indices = lfu_cache_find_uncached_cuda( + unique_indices, + unique_indices_length, + total_cache_hash_size, + lxu_cache_state, + lfu_state); + const auto sorted_cache_sets = cache_sets_and_unique_indices.first; + const auto cache_set_sorted_unique_indices = + cache_sets_and_unique_indices.second; + + // insert caching weights + lfu_cache_insert_byte_cuda( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + weights_tys, + D_offsets, + sorted_cache_sets, + cache_set_sorted_unique_indices, + unique_indices_length, + lxu_cache_state, + lxu_cache_weights, + lfu_state, + row_alignment); +} diff --git a/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu b/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu new file mode 100644 index 0000000000..088ba930e5 --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu @@ -0,0 +1,282 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "common.cuh" + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +namespace { + +template +__global__ __launch_bounds__(kMaxThreads) void linearize_cache_indices_kernel( + const pta::PackedTensorAccessor32 + cache_hash_size_cumsum, + const pta::PackedTensorAccessor32 + indices, + const pta::PackedTensorAccessor32 + table_offsets, + pta::PackedTensorAccessor32 + linear_cache_indices) { + const index_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= indices.size(0)) { + return; + } + + // Perform binary search. + int left = 0; + int right = table_offsets.size(0); + while (left != right) { + const int middle = + left + (right - left) / 2; // Avoid overflow in midpoint calculation + if (table_offsets[middle] <= index) { + left = middle + 1; + } else { + right = middle; + } + } + const int table_index = left; + + const auto max_offset = + ::__ldg(&cache_hash_size_cumsum[cache_hash_size_cumsum.size(0) - 1]); + const auto curr_offset = ::__ldg(&cache_hash_size_cumsum[table_index]); + if (curr_offset >= 0 && indices[index] >= 0) { + linear_cache_indices[index] = indices[index] + curr_offset; + } else { + // Either table index is wrong, or index value is negative (due to pruning): + // set it to invalid value. 
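As a concrete illustration of this mapping (made-up sizes):

// Suppose cache_hash_size_cumsum = [0, 100, 300, 350] (three tables, 350
// cached rows in total). Row 7 of table 1 linearizes to 100 + 7 = 107, while
// a pruned index (indices[index] < 0) or a table with a negative cumsum entry
// falls through to max_offset = 350, which the lookup/populate kernels treat
// as an always-miss sentinel.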
+ linear_cache_indices[index] = max_offset; + } +} + +} // namespace + +DLL_PUBLIC Tensor linearize_cache_indices_cuda( + Tensor cache_hash_size_cumsum, + Tensor indices, + Tensor offsets) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + cache_hash_size_cumsum, indices, offsets); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(cache_hash_size_cumsum.get_device()); + + const auto T = cache_hash_size_cumsum.size(0) - 1; + TORCH_CHECK(T > 0); + // offsets = [B x T + 1] + const auto B = (offsets.size(0) - 1) / T; + TORCH_CHECK(B >= 0); + + auto linear_cache_indices = at::empty_like(indices); + const auto num_indices = indices.numel(); + if (B == 0 || num_indices == 0) { + return linear_cache_indices; + } + + auto table_offsets = offsets.slice(0, B, B * T, B); + + AT_DISPATCH_INDEX_TYPES( + indices.scalar_type(), "linearize_cache_indices_kernel", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "linearize_cache_indices_kernel"; +#endif + linearize_cache_indices_kernel<<< + div_round_up(num_indices, kMaxThreads), + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, table_offsets, index_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, linear_cache_indices, index_t, 1, 32)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + return linear_cache_indices; +} + +namespace { + +template +__global__ +__launch_bounds__(kMaxThreads) void linearize_cache_indices_from_row_idx_kernel( + const pta::PackedTensorAccessor32 + cache_hash_size_cumsum, + const pta::PackedTensorAccessor32 + update_table_indices, + const pta::PackedTensorAccessor32 + update_row_indices, + pta::PackedTensorAccessor32 + linear_cache_indices) { + const index_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= update_row_indices.size(0)) { + return; + } + const int table_index = update_table_indices[index]; + + const auto max_offset = + ::__ldg(&cache_hash_size_cumsum[cache_hash_size_cumsum.size(0) - 1]); + const auto curr_offset = ::__ldg(&cache_hash_size_cumsum[table_index]); + if (curr_offset >= 0 && update_row_indices[index] >= 0) { + linear_cache_indices[index] = update_row_indices[index] + curr_offset; + } else { + // Either table index is wrong, or index value is negative (due to pruning): + // set it to invalid value. 
+ linear_cache_indices[index] = max_offset; + } +} + +} // namespace + +DLL_PUBLIC Tensor linearize_cache_indices_from_row_idx_cuda( + Tensor cache_hash_size_cumsum, + Tensor update_table_indices, + Tensor update_row_indices) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + cache_hash_size_cumsum, update_table_indices, update_row_indices); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(cache_hash_size_cumsum.get_device()); + + const auto T = cache_hash_size_cumsum.size(0) - 1; + TORCH_CHECK(T > 0); + + auto linear_cache_indices = at::empty_like(update_row_indices); + const auto num_indices = update_row_indices.numel(); + if (num_indices == 0) { + return linear_cache_indices; + } + + AT_DISPATCH_INDEX_TYPES( + update_row_indices.scalar_type(), + "linearize_cache_indices_from_row_idx_kernel", + [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "linearize_cache_indices_from_row_idx_kernel"; +#endif + linearize_cache_indices_from_row_idx_kernel<<< + div_round_up(num_indices, kMaxThreads), + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, update_table_indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, update_row_indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, linear_cache_indices, index_t, 1, 32)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + return linear_cache_indices; +} + +DLL_PUBLIC std::tuple> +get_unique_indices_cuda( + Tensor linear_indices, + int64_t max_indices, + bool compute_count) { + TENSOR_ON_CUDA_GPU(linear_indices); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(linear_indices.get_device()); + + TORCH_CHECK(linear_indices.numel() < std::numeric_limits::max()); + const int32_t N = linear_indices.numel(); + auto sorted_indices = at::empty_like(linear_indices); + auto unique_indices = at::empty_like(linear_indices); + auto unique_indices_length = + at::empty({1}, linear_indices.options().dtype(at::kInt)); + c10::optional unique_indices_count = c10::nullopt; + if (compute_count) { + unique_indices_count = at::empty( + {linear_indices.numel()}, linear_indices.options().dtype(at::kInt)); + } + AT_DISPATCH_INDEX_TYPES( + linear_indices.scalar_type(), "get_unique_indices_cuda", [&] { + // sort indices + size_t temp_storage_bytes_0 = 0; + AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortKeys( + nullptr, + temp_storage_bytes_0, + linear_indices.data_ptr(), + sorted_indices.data_ptr(), + N, + 0, + int(log2(float(max_indices + 1)) + 1), + at::cuda::getCurrentCUDAStream(), + false)); + auto temp_storage_0 = at::empty( + {static_cast(temp_storage_bytes_0)}, + linear_indices.options().dtype(at::kByte)); + AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortKeys( + temp_storage_0.data_ptr(), + temp_storage_bytes_0, + linear_indices.data_ptr(), + sorted_indices.data_ptr(), + N, + 0, + int(log2(float(max_indices + 1)) + 1), + at::cuda::getCurrentCUDAStream(), + false)); + // get unique indices + if (compute_count) { + size_t temp_storage_bytes_1 = 0; + AT_CUDA_CHECK( + FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRunLengthEncode::Encode( + nullptr, + temp_storage_bytes_1, + sorted_indices.data_ptr(), + unique_indices.data_ptr(), + unique_indices_count->data_ptr(), + unique_indices_length.data_ptr(), + N, + at::cuda::getCurrentCUDAStream(), + false)); + auto temp_storage_1 = at::empty( + {static_cast(temp_storage_bytes_1)}, + linear_indices.options().dtype(at::kByte)); + AT_CUDA_CHECK( + 
FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRunLengthEncode::Encode( + temp_storage_1.data_ptr(), + temp_storage_bytes_1, + sorted_indices.data_ptr(), + unique_indices.data_ptr(), + unique_indices_count->data_ptr(), + unique_indices_length.data_ptr(), + N, + at::cuda::getCurrentCUDAStream(), + false)); + } else { + size_t temp_storage_bytes_1 = 0; + AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceSelect::Unique( + nullptr, + temp_storage_bytes_1, + sorted_indices.data_ptr(), + unique_indices.data_ptr(), + unique_indices_length.data_ptr(), + N, + at::cuda::getCurrentCUDAStream(), + false)); + auto temp_storage_1 = at::empty( + {static_cast(temp_storage_bytes_1)}, + linear_indices.options().dtype(at::kByte)); + AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceSelect::Unique( + temp_storage_1.data_ptr(), + temp_storage_bytes_1, + sorted_indices.data_ptr(), + unique_indices.data_ptr(), + unique_indices_length.data_ptr(), + N, + at::cuda::getCurrentCUDAStream(), + false)); + } + }); + return std::make_tuple( + unique_indices, unique_indices_length, unique_indices_count); +} diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu new file mode 100644 index 0000000000..95e2639464 --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu @@ -0,0 +1,242 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "common.cuh" + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +namespace { + +__global__ __launch_bounds__(kMaxThreads) void emulate_cache_miss_kernel( + pta::PackedTensorAccessor32 + lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + pta::PackedTensorAccessor32 + uvm_cache_stats) { + const int32_t N = lxu_cache_locations.size(0); + int64_t n_enforced_misses = 0; + CUDA_KERNEL_LOOP(n, N) { + if ((n & 0x00FF) < enforced_misses_per_256) { + if (lxu_cache_locations[n] >= 0) { + n_enforced_misses++; + } + lxu_cache_locations[n] = kCacheLocationMissing; + } + } + if (gather_cache_stats && n_enforced_misses > 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_conflict_misses], + n_enforced_misses); + } +} + +} // namespace + +DLL_PUBLIC Tensor emulate_cache_miss( + Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + Tensor uvm_cache_stats) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + lxu_cache_locations, uvm_cache_stats); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(lxu_cache_locations.get_device()); + + const auto N = lxu_cache_locations.numel(); + if (N == 0) { + // nothing to do + return lxu_cache_locations; + } + + const dim3 blocks(std::min( + div_round_up(N, kMaxThreads), + get_max_thread_blocks_for_cache_kernels_())); + +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "emulate_cache_miss_kernel"; +#endif + + emulate_cache_miss_kernel<<< + blocks, + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), + enforced_misses_per_256, + gather_cache_stats, + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return lxu_cache_locations; +} + +namespace { +template +__global__ __launch_bounds__(kMaxThreads) void lru_cache_find_uncached_kernel( + const 
pta::PackedTensorAccessor32 + unique_indices, + const int32_t* __restrict__ N_unique, + int64_t max_indices, + const pta::PackedTensorAccessor32 + lxu_cache_state, + pta::PackedTensorAccessor32 cache_sets, + int64_t time_stamp, + pta::PackedTensorAccessor32 lru_state, + const bool gather_cache_stats, + pta::PackedTensorAccessor32 + uvm_cache_stats, + const bool lock_cache_line, + pta::PackedTensorAccessor32 + lxu_cache_locking_counter) { + if (gather_cache_stats) { + if (blockIdx.x == 0 && threadIdx.x == 0 && threadIdx.y == 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_calls], 1); // N_called. + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_requested_indices], + unique_indices.size(0)); // N_requested_indices. + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_unique_indices], + *N_unique); // N_unique_indices. + } + } + + const int32_t C = lxu_cache_state.size(0); + int32_t n_misses = 0; + + for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; + n += gridDim.x * blockDim.y) { + int64_t idx = unique_indices[n]; + if (idx == max_indices) { + // cache_sets are initialized with sentinel values in + // lru_cache_find_uncached_cuda + continue; + } + int32_t cache_set = cache_slot(idx, C); + + const auto slot = threadIdx.x; + const bool found = ::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx; + if (found) { + // mark it as recently accessed so we don't evict. + lru_state[cache_set][slot] = time_stamp; + if (lock_cache_line) { + lxu_cache_locking_counter[cache_set][slot] += 1; + } + } + +#ifdef __HIP_PLATFORM_HCC__ + if (!__any_sync(0xFFFFFFFFFFFFFFFF, found)) { +#else + if (!__any_sync(0xFFFFFFFF, found)) { +#endif + if (threadIdx.x == 0) { + cache_sets[n] = cache_set; + n_misses++; + } + } + } + if (gather_cache_stats && threadIdx.x == 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_unique_misses], + n_misses); // N_unique_misses. 
+ } +} + +} // namespace + +DLL_PUBLIC std::pair lru_cache_find_uncached_cuda( + Tensor unique_indices, + Tensor unique_indices_length, + int64_t max_indices, + Tensor lxu_cache_state, + int64_t time_stamp, + Tensor lru_state, + bool gather_cache_stats, + Tensor uvm_cache_stats, + bool lock_cache_line, + Tensor lxu_cache_locking_counter) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + unique_indices, + unique_indices_length, + lxu_cache_state, + lru_state, + uvm_cache_stats, + lxu_cache_locking_counter); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(unique_indices.get_device()); + + // Fill with sentinel value + auto cache_sets = full_like( + unique_indices, + lxu_cache_state.size(0), + unique_indices.options().dtype(at::kInt)); + const int32_t N = unique_indices.numel(); + auto sorted_cache_sets = empty_like(cache_sets); + auto cache_set_sorted_unique_indices = empty_like(unique_indices); + + AT_DISPATCH_INDEX_TYPES( + unique_indices.scalar_type(), "lru_cache_find_uncached_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lru_cache_find_uncached_kernel"; +#endif + // Find uncached indices + lru_cache_find_uncached_kernel<<< + std::min( + div_round_up(N, kMaxThreads / kWarpSize), + get_max_thread_blocks_for_cache_kernels_()), + dim3(kWarpSize, kMaxThreads / kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), + unique_indices_length.data_ptr(), + max_indices, + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, cache_sets, int32_t, 1, 32), + time_stamp, + MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), + gather_cache_stats, + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), + lock_cache_line, + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_locking_counter, int32_t, 2, 32)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + // Sort the cache sets and ids + size_t temp_storage_bytes = 0; + AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( + nullptr, + temp_storage_bytes, + cache_sets.data_ptr(), + sorted_cache_sets.data_ptr(), + unique_indices.data_ptr(), + cache_set_sorted_unique_indices.data_ptr(), + N, + 0, + int(log2(float(lxu_cache_state.size(0) + 1)) + 1), + at::cuda::getCurrentCUDAStream(), + false)); + auto temp_storage = at::empty( + {static_cast(temp_storage_bytes)}, + unique_indices.options().dtype(at::kByte)); + AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( + temp_storage.data_ptr(), + temp_storage_bytes, + cache_sets.data_ptr(), + sorted_cache_sets.data_ptr(), + unique_indices.data_ptr(), + cache_set_sorted_unique_indices.data_ptr(), + N, + 0, + int(log2(float(lxu_cache_state.size(0) + 1)) + 1), + at::cuda::getCurrentCUDAStream(), + false)); + }); + return {sorted_cache_sets, cache_set_sorted_unique_indices}; +} diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu new file mode 100644 index 0000000000..13896890ae --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu @@ -0,0 +1,371 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "common.cuh" + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +namespace { + +template +__global__ __launch_bounds__(kMaxThreads) void lru_cache_insert_kernel( + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 + cache_hash_size_cumsum, + const pta::PackedTensorAccessor64 + cache_index_table_map, + const pta::PackedTensorAccessor32 + weights_offsets, + const pta::PackedTensorAccessor32 + D_offsets, + const pta::PackedTensorAccessor32 + sorted_cache_sets, + const pta::PackedTensorAccessor32 + cache_set_sorted_indices, + const int32_t* __restrict__ N_unique, + pta::PackedTensorAccessor32 + lxu_cache_state, + pta::PackedTensorAccessor64 + lxu_cache_weights, + const int64_t time_stamp, + pta::PackedTensorAccessor32 lru_state, + const bool stochastic_rounding, + at::PhiloxCudaState stochastic_rounding_philox_args, + const bool gather_cache_stats, + pta::PackedTensorAccessor32 + uvm_cache_stats, + const bool lock_cache_line, + pta::PackedTensorAccessor32 + lxu_cache_locking_counter) { + const int32_t C = lxu_cache_state.size(0); + int32_t n_conflict_misses = 0; + for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; + n += gridDim.x * blockDim.y) { + // check if this warp is responsible for this whole segment. + const bool segment_start = + (n == 0 || sorted_cache_sets[n - 1] != sorted_cache_sets[n]); + + if (!segment_start) { + // don't have *warp* divergence since we launch full warps in blockDim.x, + // so we can just exit this warp entirely. + continue; + } + const int32_t cache_set = sorted_cache_sets[n]; + if (cache_set == C) { + // ignore the already-existing elements + continue; + } + + int32_t SL = 1; + while (n + SL < *N_unique && sorted_cache_sets[n + SL] == cache_set) { + SL += 1; + } + int32_t n_inserted = 0; // also used as index to insert + + // now, we need to insert the (unique!) values in indices[n:n + SL] into + // our slots. + const int32_t slot = threadIdx.x; + const int64_t slot_time = lru_state[cache_set][slot]; + int64_t costs[1] = {slot_time}; + int32_t slots[1] = {slot}; + + BitonicSort>::sort(costs, slots); + const int32_t sorted_slot = slots[0]; + const int64_t sorted_lru_cost = costs[0]; + + for (int32_t l = 0; l < min(SL, kWarpSize); ++l) { + const int32_t insert_slot = shfl_sync(sorted_slot, l); + if (lock_cache_line) { + auto count = lxu_cache_locking_counter[cache_set][insert_slot]; + if (count > 0) { + continue; // cache slot is in use + } + } + const int64_t insert_current_lru_cost = shfl_sync(sorted_lru_cost, l); + if (insert_current_lru_cost == time_stamp) { + break; + } + const int64_t insert_idx = cache_set_sorted_indices[n + n_inserted]; + const int32_t t_insert = cache_index_table_map[insert_idx]; + const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; + const int64_t weights_offset_insert = weights_offsets[t_insert]; + const int32_t D_start_insert = D_offsets[t_insert]; + const int32_t D_end_insert = D_offsets[t_insert + 1]; + const int32_t D_insert = D_end_insert - D_start_insert; + + // ensure that threadIdx.x is the only thread reading/writing to + // lxu_cache_state + int64_t current_idx = + threadIdx.x == 0 ? 
lxu_cache_state[cache_set][insert_slot] : 0; + current_idx = shfl_sync(current_idx, 0); + + // not empty + if (current_idx != static_cast(kCacheStateInvalid)) { + // evict from slot to backing storage + const int32_t t_current = cache_index_table_map[current_idx]; + const int64_t idx_current = + current_idx - cache_hash_size_cumsum[t_current]; + const int64_t weights_offset_current = weights_offsets[t_current]; + const int32_t D_start_current = D_offsets[t_current]; + const int32_t D_end_current = D_offsets[t_current + 1]; + const int32_t D_current = D_end_current - D_start_current; + int32_t D_emb = D_current; + if constexpr (std::is_same_v) { + D_emb += kINT8QparamsBytes; + } + + auto weight_row = WeightRow( + &weights[weights_offset_current + idx_current * D_emb + 0], + &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0], + D_current, + nullptr); + + weight_row.set_stochastic_rounding( + stochastic_rounding, + stochastic_rounding_philox_args, + (blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + + threadIdx.x) * + kWarpSize + + l); + + weight_row.warp_evict(D_current, blockDim.x, threadIdx.x); + } + + int32_t D_emb = D_insert; + if constexpr (std::is_same_v) { + D_emb += kINT8QparamsBytes; + } + + auto weight_row_cache = WeightRow( + &weights[weights_offset_insert + idx_insert * D_emb + 0], + &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0], + D_insert, + nullptr); + + auto weight_row_emb = WeightRow( + &weights[weights_offset_insert + idx_insert * D_emb + 0], + nullptr, + D_insert, + nullptr); + + weight_row_emb.warp_copy_to( + weight_row_cache, D_insert, blockDim.x, threadIdx.x); + + if (threadIdx.x == 0) { + lxu_cache_state[cache_set][insert_slot] = insert_idx; + lru_state[cache_set][insert_slot] = time_stamp; + if (lock_cache_line) { + lxu_cache_locking_counter[cache_set][insert_slot] += 1; + } + } + + n_inserted++; + } + n_conflict_misses += (SL - n_inserted); + } + if (gather_cache_stats && n_conflict_misses > 0 && threadIdx.x == 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_conflict_unique_misses], + n_conflict_misses); + } +} + +void lru_cache_insert_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor D_offsets, + Tensor sorted_cache_sets, + Tensor cache_set_sorted_unique_indices, + Tensor unique_indices_length, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + const int64_t time_stamp, + Tensor lru_state, + const bool stochastic_rounding, + bool gather_cache_stats, + Tensor uvm_cache_stats, + bool lock_cache_line, + Tensor lxu_cache_locking_counter) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + D_offsets, + sorted_cache_sets, + cache_set_sorted_unique_indices, + unique_indices_length, + lxu_cache_state, + lxu_cache_weights, + lru_state, + uvm_cache_stats, + lxu_cache_locking_counter); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + const int32_t N = cache_set_sorted_unique_indices.numel(); + + DISPATCH_EMB_CACHE_TYPES( + weights.scalar_type(), + lxu_cache_weights.scalar_type(), + "lru_cache_insert_kernel_2", + ([&] { + at::PhiloxCudaState rng_engine_inputs; + if (stochastic_rounding && !std::is_same::value) { + auto gen = at::cuda::detail::getDefaultCUDAGenerator(); + std::lock_guard lock(gen.mutex()); + rng_engine_inputs = at::check_generator(gen) + ->philox_cuda_state(4); + } + + // During concurrent prefetch, cache 
lines are locked and we use less + // SMs for some of the prefetch kernels (e.g. insert) + // since it is not SM bound. It leaves SMs for main stream to overlap + constexpr int ALL_TO_PREFETCH_SM_RATIO = 8; + + auto grid_size = lock_cache_line + ? div_round_up(get_device_sm_cnt_(), ALL_TO_PREFETCH_SM_RATIO) + : div_round_up(N, kMaxThreads / kWarpSize); + +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lru_cache_insert_kernel"; +#endif + lru_cache_insert_kernel + <<>>( + MAKE_PTA_WITH_NAME(func_name, weights, emb_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, sorted_cache_sets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_set_sorted_unique_indices, int64_t, 1, 32), + unique_indices_length.data_ptr(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_weights, cache_t, 2, 64), + time_stamp, + MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), + stochastic_rounding, + rng_engine_inputs, + gather_cache_stats, + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), + lock_cache_line, + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_locking_counter, int32_t, 2, 32)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + })); +} + +} // namespace + +DLL_PUBLIC void lru_cache_populate_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + const int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor D_offsets, + Tensor linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + const int64_t time_stamp, + Tensor lru_state, + const bool stochastic_rounding, + bool gather_cache_stats, + c10::optional uvm_cache_stats, + bool lock_cache_line, + c10::optional lxu_cache_locking_counter) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + D_offsets, + linear_cache_indices, + lxu_cache_state, + lxu_cache_weights, + lru_state); + + Tensor uvm_cache_stats_ = at::empty({0}, weights.options().dtype(at::kInt)); + if (gather_cache_stats) { + TORCH_CHECK(uvm_cache_stats.has_value()); + uvm_cache_stats_ = uvm_cache_stats.value(); + TENSOR_ON_CUDA_GPU(uvm_cache_stats_); + } + + Tensor lxu_cache_locking_counter_ = + at::empty({0, 0}, lxu_cache_state.options().dtype(at::kInt)); + if (lock_cache_line) { + TORCH_CHECK(lxu_cache_locking_counter.has_value()); + lxu_cache_locking_counter_ = lxu_cache_locking_counter.value(); + TENSOR_ON_CUDA_GPU(lxu_cache_locking_counter_); + } + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + TORCH_CHECK( + linear_cache_indices.numel() < std::numeric_limits::max()); + if (linear_cache_indices.numel() == 0) { + // nothing to do + return; + } + + // Get unqiue indices + Tensor unique_indices; + Tensor unique_indices_length; + c10::optional unique_indices_count; + std::tie(unique_indices, unique_indices_length, unique_indices_count) = + get_unique_indices_cuda( + linear_cache_indices, total_cache_hash_size, false); + + auto cache_sets_and_unique_indices = lru_cache_find_uncached_cuda( + unique_indices, + unique_indices_length, + total_cache_hash_size, + lxu_cache_state, + time_stamp, + lru_state, + gather_cache_stats, + uvm_cache_stats_, + lock_cache_line, + 
lxu_cache_locking_counter_); + auto sorted_cache_sets = cache_sets_and_unique_indices.first; + auto cache_set_sorted_unique_indices = cache_sets_and_unique_indices.second; + + // insert caching weights + lru_cache_insert_cuda( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + D_offsets, + sorted_cache_sets, + cache_set_sorted_unique_indices, + unique_indices_length, + lxu_cache_state, + lxu_cache_weights, + time_stamp, + lru_state, + stochastic_rounding, + gather_cache_stats, + uvm_cache_stats_, + lock_cache_line, + lxu_cache_locking_counter_); +} diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu new file mode 100644 index 0000000000..2f580d5a03 --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu @@ -0,0 +1,702 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "common.cuh" + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +namespace { + +template +__global__ +__launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_find_uncached_kernel( + const pta::PackedTensorAccessor32 + linear_cache_indices, + pta::PackedTensorAccessor32 cache_sets, + const int64_t max_indices, + const pta::PackedTensorAccessor32 + lxu_cache_state, + const int64_t time_stamp, + pta::PackedTensorAccessor32 lru_state, + const bool gather_cache_stats, + pta::PackedTensorAccessor32 + uvm_cache_stats, + pta::PackedTensorAccessor32 + lxu_cache_miss_timestamp) { + const int32_t N = linear_cache_indices.size(0); + const int32_t C = lxu_cache_state.size(0); + + if (gather_cache_stats) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_calls], 1); // N_called. + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_requested_indices], + N); // N_requested_indices. + } + } + + CUDA_KERNEL_LOOP(n, N) { + int64_t idx = linear_cache_indices[n]; + if (idx == max_indices) { + // Invalid or pruned row: set it to sentinel value. + // 32-way uses C as the sentinel value to reduce the maximum value during + // radix sort to make it faster but for direct_mapped we use -1 + cache_sets[n] = -1; + continue; + } + int32_t cache_set = cache_slot(idx, C); + + const bool found = ::__ldg((&lxu_cache_state[cache_set][0])) == idx; + if (found) { + // After all threads run, timestamp will be current timestamp + // if any idx was hit + // +1 because AMD doesn't have atomicMax for signed long so we should + // initialize lxu_cache_miss_timestamp with 0 vs. -1. + lru_state[cache_set][0] = time_stamp; + cache_sets[n] = -1; // sentinel value + } else { + // There is no atomicMax for int64_t... +#ifdef __HIP_PLATFORM_HCC__ + auto addr = reinterpret_cast( + &lxu_cache_miss_timestamp[cache_set][0]); + auto val = static_cast(time_stamp + 1); + auto old = static_cast(atomicMax(addr, val)); +#else + auto addr = reinterpret_cast( + &lxu_cache_miss_timestamp[cache_set][0]); + auto val = static_cast(time_stamp + 1); + auto old = static_cast(atomicMax(addr, val)); +#endif + + if (old < time_stamp + 1) { + // This is the lucky thread that gets to insert its idx in the cache + // slot. 
So the number of elements in cache_sets array that has the + // value of cache_set is 1 at maximum + cache_sets[n] = cache_set; + } else { + // Otherwise (too late to get this set) + // set it to sentinel value. + cache_sets[n] = -1; + } + } + } +} + +Tensor direct_mapped_lru_cache_find_uncached_cuda( + Tensor linear_cache_indices, + int64_t max_indices, + Tensor lxu_cache_state, + int64_t time_stamp, + Tensor lru_state, + Tensor lxu_cache_miss_timestamp, + bool gather_cache_stats, + Tensor uvm_cache_stats) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + linear_cache_indices, + lxu_cache_state, + lru_state, + lxu_cache_miss_timestamp); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(linear_cache_indices.get_device()); + + const int32_t N = linear_cache_indices.numel(); + + auto cache_sets = empty_like( + linear_cache_indices, linear_cache_indices.options().dtype(at::kInt)); + + AT_DISPATCH_INDEX_TYPES( + linear_cache_indices.scalar_type(), + "direct_mapped_lru_cache_find_uncached_cuda", + [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "direct_mapped_lru_cache_find_uncached_kernel"; +#endif + // Find uncached indices + direct_mapped_lru_cache_find_uncached_kernel<<< + std::min( + div_round_up(N, kMaxThreads), + get_max_thread_blocks_for_cache_kernels_()), + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, cache_sets, int32_t, 1, 32), + max_indices, + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + time_stamp, + MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), + gather_cache_stats, + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_miss_timestamp, int64_t, 2, 32)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + + return cache_sets; +} + +template +__global__ __launch_bounds__(kMaxThreads) void lru_cache_insert_byte_kernel( + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 + cache_hash_size_cumsum, + const pta::PackedTensorAccessor64 + cache_index_table_map, + const pta::PackedTensorAccessor32 + weights_offsets, + const pta::PackedTensorAccessor32 + weights_tys, + const pta::PackedTensorAccessor32 + D_offsets, + const pta::PackedTensorAccessor32 + sorted_cache_sets, + const pta::PackedTensorAccessor32 + cache_set_sorted_indices, + const int32_t* __restrict__ N_unique, + pta::PackedTensorAccessor32 + lxu_cache_state, + pta::PackedTensorAccessor64 + lxu_cache_weights, + int64_t time_stamp, + pta::PackedTensorAccessor32 lru_state, + const bool gather_cache_stats, + pta::PackedTensorAccessor32 + uvm_cache_stats, + const int64_t row_alignment) { + const int32_t C = lxu_cache_state.size(0); + int64_t n_conflict_misses = 0; + for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; + n += gridDim.x * blockDim.y) { + // check if this warp is responsible for this whole segment. + const bool segment_start = + (n == 0 || sorted_cache_sets[n - 1] != sorted_cache_sets[n]); + + if (!segment_start) { + // don't have *warp* divergence since we launch full warps in blockDim.x, + // so we can just exit this warp entirely. + continue; + } + const int32_t cache_set = sorted_cache_sets[n]; + if (cache_set == C) { + // ignore the already-existing elements + continue; + } + + int32_t SL = 1; + while (n + SL < *N_unique && sorted_cache_sets[n + SL] == cache_set) { + SL += 1; + } + int64_t n_inserted = 0; + + // now, we need to insert the (unique!) 
values in indices[n:n + SL] into + // our slots. + const int32_t slot = threadIdx.x; + const int64_t slot_time = lru_state[cache_set][slot]; + int64_t costs[1] = {slot_time}; + int32_t slots[1] = {slot}; + + BitonicSort>::sort(costs, slots); + const int32_t sorted_slot = slots[0]; + const int64_t sorted_lru_cost = costs[0]; + + for (int32_t l = 0; l < min(SL, kWarpSize); ++l) { + const int32_t insert_slot = shfl_sync(sorted_slot, l); + const int64_t insert_current_lru_cost = shfl_sync(sorted_lru_cost, l); + if (insert_current_lru_cost == time_stamp) { + break; + } + index_t insert_idx = cache_set_sorted_indices[n + l]; + const int32_t t_insert = cache_index_table_map[insert_idx]; + SparseType weight_ty_insert = + static_cast(weights_tys[t_insert]); + const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; + const int64_t weights_offset_insert = weights_offsets[t_insert]; + const int32_t D_start_insert = D_offsets[t_insert]; + const int32_t D_end_insert = D_offsets[t_insert + 1]; + const int32_t D_insert = D_end_insert - D_start_insert; + + const int32_t D_insert_bytes = nbit::padded_row_size_in_bytes( + D_insert, weight_ty_insert, row_alignment); + + // insert into cache. Note that nbit::padded_row_size_in_bytes pad each + // row with row_alignment (16 bytes on GPUs) So each row will be multiple + // of 16 bytes (uint4 = 32bit x 4 = 16 bytes). + auto row = reinterpret_cast( + &weights[weights_offset_insert + idx_insert * D_insert_bytes + 0]); + auto cache_row = reinterpret_cast( + &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0]); + for (int32_t d = threadIdx.x; d * sizeof(uint4) < D_insert_bytes; + d += blockDim.x) { + cache_row[d] = row[d]; + } + + if (threadIdx.x == 0) { + lxu_cache_state[cache_set][insert_slot] = insert_idx; + lru_state[cache_set][insert_slot] = time_stamp; + } + n_inserted++; + } + n_conflict_misses += (SL - n_inserted); + } + if (gather_cache_stats && n_conflict_misses > 0 && threadIdx.x == 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_conflict_unique_misses], + n_conflict_misses); + } +} + +template +__global__ +__launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_insert_byte_kernel( + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 + cache_hash_size_cumsum, + const pta::PackedTensorAccessor64 + cache_index_table_map, + const pta::PackedTensorAccessor32 + weights_offsets, + const pta::PackedTensorAccessor32 + weights_tys, + const pta::PackedTensorAccessor32 + D_offsets, + pta::PackedTensorAccessor32 + lxu_cache_state, + pta::PackedTensorAccessor64 + lxu_cache_weights, + int64_t time_stamp, + pta::PackedTensorAccessor32 lru_state, + const pta::PackedTensorAccessor32 + linear_cache_indices, + pta::PackedTensorAccessor32 + lxu_cache_miss_timestamp, + pta::PackedTensorAccessor32 cache_sets, + const bool gather_cache_stats, + pta::PackedTensorAccessor32 + uvm_cache_stats, + const int64_t row_alignment) { + const int32_t N = cache_sets.size(0); + + // one warp for each set (multiple times) + // (no divergence for each control branch) + for (int32_t pos = blockIdx.x * blockDim.y + threadIdx.y; pos < N; + pos += gridDim.x * blockDim.y) { + auto cache_set = cache_sets[pos]; + + if (cache_set == -1) { + // Cache hit, index invalid (e.g., pruned), or too late to grab this set. 
+ continue; + } + + if (lru_state[cache_set][0] == time_stamp) { + // we have a missing index but + // current cache row is a hit + // so abort unnecessary insert + continue; + } + + // no need to check because cache_sets[pos] != -1 only when it was the + // first one to set the buffer time_stamp + // if (lxu_cache_miss_timestamp[cache_set][0] != time_stamp) { + // continue; + // } + + if (gather_cache_stats && threadIdx.x == 0) { + // We are using this slot for a slightly different purpose. + // In 32 way: + // UVM traffic for insert + // = # of inserted rows + // = # of unique misses - # of unique misses that were not inserted + // = uvm_cache_stats_index::num_unique_misses + // - uvm_cache_stats_index::num_conflict_unique_misses + // In Direct Mapped (here): + // UVM traffic for insert + // = # of inserted rows + // = uvm_cache_stats_index::num_conflict_unique_misses + // (just store here directly) + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_conflict_unique_misses], + 1); + } + + // insert the index in the buffer into our only slot + const int32_t insert_slot = 0; + + int64_t insert_idx = linear_cache_indices[pos]; + const int32_t t_insert = cache_index_table_map[insert_idx]; + SparseType weight_ty_insert = + static_cast(weights_tys[t_insert]); + const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; + const int64_t weights_offset_insert = weights_offsets[t_insert]; + const int32_t D_start_insert = D_offsets[t_insert]; + const int32_t D_end_insert = D_offsets[t_insert + 1]; + const int32_t D_insert = D_end_insert - D_start_insert; + const int32_t D_insert_bytes = nbit::padded_row_size_in_bytes( + D_insert, weight_ty_insert, row_alignment); + + // insert into cache. Note that nbit::padded_row_size_in_bytes pad each + // row with row_alignment (16 bytes on GPUs) So each row will be multiple + // of 16 bytes (uint4 = 32bit x 4 = 16 bytes). 
+ auto row = reinterpret_cast( + &weights[weights_offset_insert + idx_insert * D_insert_bytes + 0]); + auto cache_row = reinterpret_cast(&lxu_cache_weights[cache_set][0]); + for (int32_t d = threadIdx.x; d * sizeof(uint4) < D_insert_bytes; + d += blockDim.x) { + cache_row[d] = row[d]; + } + + if (threadIdx.x == 0) { + lxu_cache_state[cache_set][insert_slot] = insert_idx; + lru_state[cache_set][insert_slot] = time_stamp; + } + } +} + +void lru_cache_insert_byte_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor sorted_cache_sets, + Tensor cache_set_sorted_unique_indices, + Tensor unique_indices_length, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + int64_t time_stamp, + Tensor lru_state, + bool gather_cache_stats, + Tensor uvm_cache_stats, + int64_t row_alignment) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + weights_tys, + D_offsets, + sorted_cache_sets, + cache_set_sorted_unique_indices, + unique_indices_length, + lxu_cache_state, + lxu_cache_weights, + lru_state, + uvm_cache_stats); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + const int32_t N = cache_set_sorted_unique_indices.numel(); + + AT_DISPATCH_INDEX_TYPES( + cache_set_sorted_unique_indices.scalar_type(), + "lru_cache_insert_byte_cuda", + [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lru_cache_insert_byte_kernel"; +#endif + lru_cache_insert_byte_kernel<<< + std::min( + div_round_up(N, kMaxThreads / kWarpSize), + get_max_thread_blocks_for_cache_kernels_()), + dim3(kWarpSize, kMaxThreads / kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, weights, uint8_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, weights_tys, uint8_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, sorted_cache_sets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_set_sorted_unique_indices, index_t, 1, 32), + unique_indices_length.data_ptr(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_weights, uint8_t, 2, 64), + time_stamp, + MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), + gather_cache_stats, + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), + row_alignment); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +} + +void direct_mapped_lru_cache_insert_byte_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + int64_t time_stamp, + Tensor lru_state, + Tensor linear_cache_indices, + Tensor lxu_cache_miss_timestamp, + Tensor cache_sets, + bool gather_cache_stats, + Tensor uvm_cache_stats, + int64_t row_alignment) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + weights_tys, + D_offsets, + lxu_cache_state, + lxu_cache_weights, + lru_state, + linear_cache_indices, + lxu_cache_miss_timestamp); + + at::cuda::OptionalCUDAGuard device_guard; + 
device_guard.set_index(weights.get_device()); + + const int32_t N = cache_sets.size(0); + + AT_DISPATCH_INDEX_TYPES( + linear_cache_indices.scalar_type(), + "direct_mapped_lru_cache_insert_byte_cuda", + [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "direct_mapped_lru_cache_insert_byte_kernel"; +#endif + direct_mapped_lru_cache_insert_byte_kernel<<< + std::min( + div_round_up(N, kMaxThreads / kWarpSize), + get_max_thread_blocks_for_cache_kernels_()), + dim3(kWarpSize, kMaxThreads / kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, weights, uint8_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, weights_tys, uint8_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_weights, uint8_t, 2, 64), + time_stamp, + MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_miss_timestamp, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, cache_sets, int32_t, 1, 32), + gather_cache_stats, + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), + row_alignment); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +} + +} // namespace + +DLL_PUBLIC void lru_cache_populate_byte_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + int64_t time_stamp, + Tensor lru_state, + int64_t row_alignment, + bool gather_cache_stats, + c10::optional uvm_cache_stats) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + weights_tys, + D_offsets, + linear_cache_indices, + lxu_cache_state, + lxu_cache_weights, + lru_state); + + Tensor uvm_cache_stats_ = at::empty({0}, weights.options().dtype(at::kInt)); + if (gather_cache_stats) { + TORCH_CHECK(uvm_cache_stats.has_value()); + uvm_cache_stats_ = uvm_cache_stats.value(); + TENSOR_ON_CUDA_GPU(uvm_cache_stats_); + } + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + TORCH_CHECK( + linear_cache_indices.numel() < std::numeric_limits::max()); + if (linear_cache_indices.numel() == 0) { + // nothing to do + return; + } + + // Get unqiue indices + Tensor unique_indices; + Tensor unique_indices_length; + c10::optional unique_indices_count; + std::tie(unique_indices, unique_indices_length, unique_indices_count) = + get_unique_indices_cuda( + linear_cache_indices, total_cache_hash_size, false); + + // Find uncached indices + Tensor lxu_cache_locking_counter = + at::empty({0, 0}, lxu_cache_state.options().dtype(at::kInt)); + auto cache_sets_and_unique_indices = lru_cache_find_uncached_cuda( + unique_indices, + unique_indices_length, + total_cache_hash_size, + lxu_cache_state, + time_stamp, + lru_state, + gather_cache_stats, + uvm_cache_stats_, + false, // lock_cache_line + lxu_cache_locking_counter); + auto sorted_cache_sets = cache_sets_and_unique_indices.first; + auto cache_set_sorted_unique_indices = cache_sets_and_unique_indices.second; + + // insert 
caching weights + lru_cache_insert_byte_cuda( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + weights_tys, + D_offsets, + sorted_cache_sets, + cache_set_sorted_unique_indices, + unique_indices_length, + lxu_cache_state, + lxu_cache_weights, + time_stamp, + lru_state, + gather_cache_stats, + uvm_cache_stats_, + row_alignment); +} + +DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + int64_t time_stamp, + Tensor lru_state, + Tensor lxu_cache_miss_timestamp, + int64_t row_alignment, + bool gather_cache_stats, + c10::optional uvm_cache_stats) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + weights_tys, + D_offsets, + linear_cache_indices, + lxu_cache_state, + lxu_cache_weights, + lru_state, + lxu_cache_miss_timestamp); + + if (gather_cache_stats) { + TORCH_CHECK(uvm_cache_stats.has_value()); + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + uvm_cache_stats, lxu_cache_weights); + } + auto uvm_cache_stats_ = uvm_cache_stats.value_or( + at::empty({0}, weights.options().dtype(at::kInt))); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + TORCH_CHECK( + linear_cache_indices.numel() < std::numeric_limits::max()); + if (linear_cache_indices.numel() == 0) { + // nothing to do + return; + } + + /* + populate_byte normal flow: + (1) get_unique (sort, dedup) + (2) find_uncached + (3) sort by set_idx + (4) insert rows + + merged kernels flow: + (1) find_uncached + No need for sorting. + Each hit idx will just update the timestamp in lru_state. + Only one of miss indices will atomically set miss_timestamp, + and have cache_sets[pos] = set + where pos is the position of that idx + in the linear_cache_indices array + After this, for each set, we either have + (a) lru_state timestamp is recent (hit) => no need to insert row + (b) lru_state timestamp is not recent (no hit) + (b-1) miss_timestamp is recent + => insert row for idx = linear_cache_indices[pos] + (b-2) insert_timestamp_buffer is not recent + => no need to insert since there was no miss idx this time + (2) insert rows + Use buffer info to insert rows as the above logic. + */ + + auto cache_sets = direct_mapped_lru_cache_find_uncached_cuda( + linear_cache_indices, + total_cache_hash_size, + lxu_cache_state, + time_stamp, + lru_state, + lxu_cache_miss_timestamp, + gather_cache_stats, + uvm_cache_stats_); + + // insert caching weights + direct_mapped_lru_cache_insert_byte_cuda( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + weights_tys, + D_offsets, + lxu_cache_state, + lxu_cache_weights, + time_stamp, + lru_state, + linear_cache_indices, + lxu_cache_miss_timestamp, + cache_sets, + gather_cache_stats, + uvm_cache_stats_, + row_alignment); +} diff --git a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu new file mode 100644 index 0000000000..445ad68c9d --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu @@ -0,0 +1,527 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "common.cuh"
+
+using Tensor = at::Tensor;
+using namespace fbgemm_gpu;
+
+DLL_PUBLIC int64_t host_lxu_cache_slot(int64_t h_in, int64_t C) {
+  return static_cast<int64_t>(cache_slot(h_in, static_cast<int32_t>(C)));
+}
+
+namespace {
+
+template <typename emb_t, typename cache_t>
+__global__ __launch_bounds__(kMaxThreads) void lxu_cache_flush_kernel(
+    pta::PackedTensorAccessor64<emb_t, 1, at::RestrictPtrTraits> weights,
+    const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits>
+        cache_hash_size_cumsum,
+    const pta::PackedTensorAccessor64<int32_t, 1, at::RestrictPtrTraits>
+        cache_index_table_map,
+    const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits>
+        weights_offsets,
+    const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
+        D_offsets,
+    const pta::PackedTensorAccessor32<int64_t, 2, at::RestrictPtrTraits>
+        lxu_cache_state,
+    pta::PackedTensorAccessor64<cache_t, 2, at::RestrictPtrTraits>
+        lxu_cache_weights,
+    bool stochastic_rounding,
+    at::PhiloxCudaState stochastic_rounding_philox_args) {
+  const int32_t B = lxu_cache_weights.size(0);
+  const int32_t b = blockIdx.x * blockDim.y + threadIdx.y;
+  if (b >= B) {
+    return;
+  }
+  const int32_t slot = b % kWarpSize;
+  const int32_t cache_set = b / kWarpSize;
+  const int64_t current_idx = lxu_cache_state[cache_set][slot];
+  if (current_idx != static_cast<int64_t>(kCacheStateInvalid)) {
+    // evict from slot to backing storage
+    const int32_t t_current = cache_index_table_map[current_idx];
+    const int64_t idx_current = current_idx - cache_hash_size_cumsum[t_current];
+    const int64_t weights_offset_current = weights_offsets[t_current];
+    const int32_t D_start_current = D_offsets[t_current];
+    const int32_t D_end_current = D_offsets[t_current + 1];
+    const int32_t D_current = D_end_current - D_start_current;
+
+    int32_t D_emb = D_current;
+    if constexpr (std::is_same_v<emb_t, uint8_t>) {
+      D_emb += kINT8QparamsBytes;
+    }
+    auto weight_row = WeightRow<emb_t, cache_t, at::acc_type<cache_t, true>>(
+        &weights[weights_offset_current + idx_current * D_emb + 0],
+        &lxu_cache_weights[b][0],
+        D_current,
+        nullptr);
+
+    weight_row.set_stochastic_rounding(
+        stochastic_rounding,
+        stochastic_rounding_philox_args,
+        blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x +
+            threadIdx.x);
+
+    float2 qparams;
+    if (std::is_same<emb_t, uint8_t>::value) {
+      qparams =
+          thrust_find_qparams<cache_t>(&lxu_cache_weights[b][0], D_current);
+      if (threadIdx.x == 0) {
+        weight_row.store_qparams(qparams);
+      }
+    }
+    for (int32_t d = threadIdx.x; d * 4 < D_current; d += blockDim.x) {
+      Vec4T<at::acc_type<cache_t, true>> cache_weights_vec =
+          weight_row.load(d * 4, qparams);
+      weight_row.evict(cache_weights_vec, d * 4, qparams);
+    }
+  }
+}
+
+} // namespace
+
+DLL_PUBLIC void lxu_cache_flush_cuda(
+    Tensor uvm_weights,
+    Tensor cache_hash_size_cumsum,
+    Tensor cache_index_table_map,
+    Tensor weights_offsets,
+    Tensor D_offsets,
+    int64_t total_D,
+    Tensor lxu_cache_state,
+    Tensor lxu_cache_weights,
+    bool stochastic_rounding) {
+  TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
+      uvm_weights,
+      cache_hash_size_cumsum,
+      cache_index_table_map,
+      weights_offsets,
+      D_offsets,
+      lxu_cache_state,
+      lxu_cache_weights);
+
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(lxu_cache_weights.get_device());
+
+  const int32_t T = D_offsets.numel() - 1;
+  const int32_t S = lxu_cache_weights.size(0);
+  const int32_t tx = std::min<int32_t>(total_D / 4 / T, kMaxThreads);
+  const dim3 threads(tx, kMaxThreads / tx);
+  const dim3 blocks(div_round_up(S, kMaxThreads / tx));
+
+  DISPATCH_EMB_CACHE_TYPES(
+      uvm_weights.scalar_type(),
+      lxu_cache_weights.scalar_type(),
+      "lxu_cache_flush_kernel_2",
+      ([&] {
+        at::PhiloxCudaState rng_engine_inputs;
+        if (stochastic_rounding && !std::is_same<emb_t, float>::value) {
+          auto gen = at::cuda::detail::getDefaultCUDAGenerator();
+          std::lock_guard<std::mutex> lock(gen.mutex());
+          rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)
+                                  ->philox_cuda_state(4);
+        }
+#ifdef FBGEMM_GPU_MEMCHECK
+        const char* func_name = "lxu_cache_flush_kernel";
+#endif
+        lxu_cache_flush_kernel
+            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+                MAKE_PTA_WITH_NAME(func_name, uvm_weights, emb_t, 1, 64),
+                MAKE_PTA_WITH_NAME(
+                    func_name, cache_hash_size_cumsum, int64_t, 1, 32),
+                MAKE_PTA_WITH_NAME(
+                    func_name, cache_index_table_map, int32_t, 1, 64),
+                MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32),
+                MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32),
+                MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32),
+                MAKE_PTA_WITH_NAME(
+                    func_name, lxu_cache_weights, cache_t, 2, 64),
+                stochastic_rounding,
+                rng_engine_inputs);
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+      }));
+}
+
+namespace {
+
+// count the number of times that a cache_slot appears in lxu_cache_locations
+// we actually only care about whether the number is 0 or > 0.
+__global__ __launch_bounds__(kMaxThreads) void lxu_cache_locations_count_kernel(
+    pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
+        lxu_cache_locations,
+    pta::PackedTensorAccessor32<int32_t, 2, at::RestrictPtrTraits> count,
+    FixedDivisor fd) {
+  const int32_t N = lxu_cache_locations.size(0);
+  CUDA_KERNEL_LOOP(n, N) {
+    if (lxu_cache_locations[n] >= 0) {
+      int32_t cache_set;
+      int32_t slot;
+      fd.DivMod(lxu_cache_locations[n], &cache_set, &slot);
+      atomicAdd(&count[cache_set][slot], 1);
+    }
+  }
+}
+
+// if a cache_slot is in lxu_cache_locations (count > 0),
+// decrement the counter of that cache_slot.
+__global__
+__launch_bounds__(kMaxThreads) void lxu_cache_locking_counter_decrement_kernel(
+    pta::PackedTensorAccessor32<int32_t, 2, at::RestrictPtrTraits>
+        lxu_cache_locking_counter,
+    pta::PackedTensorAccessor32<int32_t, 2, at::RestrictPtrTraits> count) {
+  const int32_t C = lxu_cache_locking_counter.size(0);
+  for (int32_t i = blockIdx.x * blockDim.y + threadIdx.y; i < C;
+       i += gridDim.x * blockDim.y) {
+    const auto j = threadIdx.x;
+    if (count[i][j] > 0) {
+      lxu_cache_locking_counter[i][j] -= 1;
+    }
+  }
+}
+
+} // namespace
+
+// for any cache_slot in lxu_cache_locations,
+// decrement the counter of that cache_slot.
+// duplicate cache_slot only decrement once.
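+//
+// Worked example (assuming kWarpSize == 32): for
+// lxu_cache_locations = [65, 65, 7, -1], the count kernel records
+// count[2][1] == 2 (since 65 == 2 * 32 + 1) and count[0][7] == 1, and skips
+// the miss (-1); the decrement kernel then lowers lxu_cache_locking_counter
+// at [2][1] and [0][7] by exactly one each, so the duplicated location 65
+// releases its lock only once.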
+void lxu_cache_locking_counter_decrement_cuda( + at::Tensor lxu_cache_locking_counter, + at::Tensor lxu_cache_locations) { + TENSOR_ON_CUDA_GPU(lxu_cache_locking_counter); + TENSOR_ON_CUDA_GPU(lxu_cache_locations); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(lxu_cache_locations.get_device()); + + const auto N = lxu_cache_locations.numel(); + if (N == 0) { + return; + } + + auto count = at::zeros_like(lxu_cache_locking_counter); + const int32_t C = lxu_cache_locking_counter.size(0); + TORCH_CHECK(lxu_cache_locking_counter.size(1) == kWarpSize); + auto fd = FixedDivisor(kWarpSize); + + const dim3 blocks(std::min( + div_round_up(N, kMaxThreads), + get_max_thread_blocks_for_cache_kernels_())); + +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lxu_cache_locations_count_kernel"; +#endif + + lxu_cache_locations_count_kernel<<< + blocks, + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, count, int32_t, 2, 32), + fd); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name2 = "lxu_cache_locking_counter_decrement_kernel"; +#endif + + lxu_cache_locking_counter_decrement_kernel<<< + std::min( + div_round_up(C, kMaxThreads / kWarpSize), + get_max_thread_blocks_for_cache_kernels_()), + dim3(kWarpSize, kMaxThreads / kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name2, lxu_cache_locking_counter, int32_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name2, count, int32_t, 2, 32)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +namespace { + +template +__global__ __launch_bounds__(kMaxThreads) void lxu_cache_lookup_kernel( + const pta::PackedTensorAccessor32 + linear_cache_indices, + const pta::PackedTensorAccessor32 + lxu_cache_state, + int64_t invalid_index, + pta::PackedTensorAccessor32 + lxu_cache_locations, + const bool gather_cache_stats, + pta::PackedTensorAccessor32 + uvm_cache_stats) { + const int32_t C = lxu_cache_state.size(0); + const int32_t N = linear_cache_indices.size(0); + const int32_t n0 = + blockIdx.x * blockDim.y * blockDim.x + threadIdx.y * blockDim.x; + if (n0 >= N) { + return; + } + + int32_t cache_location = kCacheLocationMissing; + int32_t n_indices = 0; + int32_t n_hits = 0; + const auto slot = threadIdx.x; + for (int i = 0; i < blockDim.x; ++i) { + int32_t n = n0 + i; + if (n >= N) { + continue; + } + const int64_t idx = linear_cache_indices[n]; + if (idx == invalid_index) { + continue; + } + const int32_t cache_set = cache_slot(idx, C); + n_indices++; + const bool found = + (::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx); +#ifdef __HIP_PLATFORM_HCC__ + // FIXME: __ballot_sync with mask isn't supported by HIP yet. + // See https://fburl.com/fvy7j0lq for the similar context. + // assert false here with https://fburl.com/pfm7enw2 + assert(false); + const auto bitmap = __ballot(found); + if (bitmap) { + const auto way = __ffsll(bitmap) - 1; +#else + const auto bitmap = __ballot_sync(0xFFFFFFFF, found); + if (bitmap) { + // LSB == 1 hence we need to subtract one to get lane ID. 
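+ // For example, if only lane 5 of the warp sees its slot match idx, then
+ // bitmap == 0x20, __ffs(bitmap) == 6, and way == 5, so the hit below
+ // resolves to cache_location = cache_set * kWarpSize + 5.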
+ const auto way = __ffs(bitmap) - 1; +#endif + if (i == threadIdx.x) { + cache_location = cache_set * kWarpSize + way; + } + n_hits++; + } + } + + const int32_t n = n0 + threadIdx.x; + if (n < N) { + lxu_cache_locations[n] = cache_location; + } + if (gather_cache_stats && threadIdx.x == 0 && n_indices > n_hits) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_conflict_misses], + (n_indices - n_hits)); + } +} + +template +__global__ +__launch_bounds__(kMaxThreads) void direct_mapped_lxu_cache_lookup_kernel( + const pta::PackedTensorAccessor32 + linear_cache_indices, + const pta::PackedTensorAccessor32 + lxu_cache_state, + int64_t invalid_index, + pta::PackedTensorAccessor32 + lxu_cache_locations, + const bool gather_cache_stats, + pta::PackedTensorAccessor32 + uvm_cache_stats) { + const int32_t C = lxu_cache_state.size(0); + const int32_t N = linear_cache_indices.size(0); + + int32_t n_indices = 0; + int32_t n_hits = 0; + + CUDA_KERNEL_LOOP(n, N) { + int32_t cache_location = kCacheLocationMissing; + const auto slot = 0; + + const int64_t idx = linear_cache_indices[n]; + if (idx == invalid_index) { + continue; + } + + const int32_t cache_set = cache_slot(idx, C); + n_indices++; + const bool found = + (::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx); + if (found) { + cache_location = cache_set; + n_hits++; + } + lxu_cache_locations[n] = cache_location; + } + + if (gather_cache_stats) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp; + + const int32_t conflict_miss = n_indices - n_hits; + const int32_t conflict_miss_sum = BlockReduce(temp).Sum(conflict_miss); + + if (threadIdx.x == 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_conflict_misses], + conflict_miss_sum); + } + } +} + +} // namespace + +DLL_PUBLIC Tensor lxu_cache_lookup_cuda( + Tensor linear_cache_indices, + Tensor lxu_cache_state, + int64_t invalid_index, + bool gather_cache_stats, + c10::optional uvm_cache_stats) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + linear_cache_indices, lxu_cache_state); + Tensor uvm_cache_stats_ = + at::empty({0}, linear_cache_indices.options().dtype(at::kInt)); + if (gather_cache_stats) { + TORCH_CHECK(uvm_cache_stats.has_value()); + uvm_cache_stats_ = uvm_cache_stats.value(); + } + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(linear_cache_indices.get_device()); + + const auto N = linear_cache_indices.numel(); + auto lxu_cache_locations = empty_like( + linear_cache_indices, linear_cache_indices.options().dtype(at::kInt)); + if (linear_cache_indices.numel() == 0) { + // nothing to do + return lxu_cache_locations; + } + + const dim3 threads(kWarpSize, kMaxThreads / kWarpSize); + const dim3 blocks(div_round_up(N, kMaxThreads)); + + AT_DISPATCH_INDEX_TYPES( + linear_cache_indices.scalar_type(), "lxu_cache_lookup_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lxu_cache_lookup_kernel"; +#endif + lxu_cache_lookup_kernel<<< + blocks, + threads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + invalid_index, + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), + gather_cache_stats, + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats_, int32_t, 1, 32)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + + return lxu_cache_locations; +} + +DLL_PUBLIC Tensor direct_mapped_lxu_cache_lookup_cuda( + Tensor linear_cache_indices, + Tensor 
lxu_cache_state, + int64_t invalid_index, + bool gather_cache_stats, + c10::optional uvm_cache_stats) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + linear_cache_indices, lxu_cache_state); + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(uvm_cache_stats, lxu_cache_state); + + if (gather_cache_stats) { + TORCH_CHECK(uvm_cache_stats.has_value()); + } + auto uvm_cache_stats_ = uvm_cache_stats.value_or( + at::empty({0}, linear_cache_indices.options().dtype(at::kInt))); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(linear_cache_indices.get_device()); + + const auto N = linear_cache_indices.numel(); + auto lxu_cache_locations = empty_like( + linear_cache_indices, linear_cache_indices.options().dtype(at::kInt)); + if (linear_cache_indices.numel() == 0) { + // nothing to do + return lxu_cache_locations; + } + + const dim3 blocks(div_round_up(N, kMaxThreads)); + + AT_DISPATCH_INDEX_TYPES( + linear_cache_indices.scalar_type(), + "direct_mapped_lxu_cache_lookup_cuda", + [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "direct_mapped_lxu_cache_lookup_kernel"; +#endif + direct_mapped_lxu_cache_lookup_kernel<<< + blocks, + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + invalid_index, + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), + gather_cache_stats, + MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats_, int32_t, 1, 32)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + + return lxu_cache_locations; +} + +namespace { + +__global__ +__launch_bounds__(kMaxThreads) void lxu_cache_locations_update_kernel( + pta::PackedTensorAccessor32 + lxu_cache_locations, + const pta::PackedTensorAccessor32 + lxu_cache_locations_new) { + const int32_t N = lxu_cache_locations.size(0); + CUDA_KERNEL_LOOP(n, N) { + if (lxu_cache_locations[n] == kCacheLocationMissing && + lxu_cache_locations_new[n] >= 0) { + lxu_cache_locations[n] = lxu_cache_locations_new[n]; + } + } +} + +} // namespace + +DLL_PUBLIC void lxu_cache_locations_update_cuda( + Tensor lxu_cache_locations, + Tensor lxu_cache_locations_new) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + lxu_cache_locations, lxu_cache_locations_new); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(lxu_cache_locations.get_device()); + + const auto N = lxu_cache_locations.numel(); + + if (N == 0) { + return; + } + + const dim3 blocks(std::min( + div_round_up(N, kMaxThreads), + get_max_thread_blocks_for_cache_kernels_())); + +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lxu_cache_locations_update_kernel"; +#endif + + lxu_cache_locations_update_kernel<<< + blocks, + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations_new, int32_t, 1, 32)); + + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return; +} diff --git a/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu b/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu new file mode 100644 index 0000000000..104bf140e1 --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu @@ -0,0 +1,323 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "common.cuh" + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +namespace { + +int get_sm_count_() { + cudaDeviceProp* deviceProp = + at::cuda::getDeviceProperties(c10::cuda::current_device()); + return deviceProp->multiProcessorCount; +} + +__global__ __launch_bounds__(kMaxThreads) void get_cache_indices_kernel( + int32_t blocks_per_table, + const pta::PackedTensorAccessor32 + cache_hash_size_cumsum, + const pta::PackedTensorAccessor32 + pruned_indices, + const pta::PackedTensorAccessor32 + pruned_indices_offsets, + const pta::PackedTensorAccessor32 + logical_table_ids, + const pta::PackedTensorAccessor32 + buffer_ids, + pta::PackedTensorAccessor32 + linear_cache_indices) { + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + + const int32_t t_i = blockIdx.x / blocks_per_table; + const int32_t threads_per_table = blocks_per_table * blockDim.x; + const int32_t idx_table = index % threads_per_table; + const int32_t logical_id = logical_table_ids[t_i]; + const int32_t buffer_id = buffer_ids[t_i]; + + const int64_t num_indices = + pruned_indices_offsets[buffer_id + 1] - pruned_indices_offsets[buffer_id]; + + if (num_indices <= 0) { + return; + } + + const int64_t indices_per_thread = + div_round_up(num_indices, threads_per_table); + const int64_t start = idx_table * indices_per_thread; + const int64_t end = min(start + indices_per_thread, num_indices); + + if (start >= num_indices) { + return; + } + + const int64_t pruned_indices_offset = pruned_indices_offsets[buffer_id]; + const int64_t* pruned_indices_table = &pruned_indices[pruned_indices_offset]; + int64_t* linear_cache_indices_table = + &linear_cache_indices[pruned_indices_offset]; + + const auto max_offset = + ::__ldg(&cache_hash_size_cumsum[cache_hash_size_cumsum.size(0) - 1]); + const auto curr_offset = ::__ldg(&cache_hash_size_cumsum[logical_id]); + + for (int64_t i = start; i < end; i++) { + if (curr_offset >= 0) { + linear_cache_indices_table[i] = curr_offset + pruned_indices_table[i]; + } else { + linear_cache_indices_table[i] = max_offset; + } + } +} + +template +__global__ __launch_bounds__(kMaxThreads) void reset_weight_momentum_kernel( + int32_t blocks_per_table, + pta::PackedTensorAccessor64 dev_weights, + pta::PackedTensorAccessor64 uvm_weights, + pta::PackedTensorAccessor64 + lxu_cache_weights, + const pta::PackedTensorAccessor32 + weights_placements, + const pta::PackedTensorAccessor32 + weights_offsets, + pta::PackedTensorAccessor64< + at::acc_type, + 1, + at::RestrictPtrTraits> momentum1_dev, + pta::PackedTensorAccessor64< + at::acc_type, + 1, + at::RestrictPtrTraits> momentum1_uvm, + const pta::PackedTensorAccessor32 + momentum1_placements, + const pta::PackedTensorAccessor32 + momentum1_offsets, + const pta::PackedTensorAccessor32 + D_offsets, + const pta::PackedTensorAccessor32 + pruned_indices, + const pta::PackedTensorAccessor32 + pruned_indices_offsets, + const pta::PackedTensorAccessor32 + logical_table_ids, + const pta::PackedTensorAccessor32 + buffer_ids, + const pta::PackedTensorAccessor32 + lxu_cache_locations) { + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + + const int32_t t_i = blockIdx.x / blocks_per_table; + const int32_t buffer_id = buffer_ids[t_i]; + const int64_t num_indices = + pruned_indices_offsets[buffer_id + 1] - pruned_indices_offsets[buffer_id]; + + if (num_indices <= 0) { + return; + } + + const int32_t logical_id = logical_table_ids[t_i]; + int32_t D = D_offsets[logical_id + 1] - D_offsets[logical_id]; + const int32_t chunk4s_per_row = 
D / 4; + const int64_t total_chunk4s_per_table = num_indices * chunk4s_per_row; + + const int32_t threads_per_table = blocks_per_table * blockDim.x; + const int64_t chunk4s_per_thread = + div_round_up(total_chunk4s_per_table, threads_per_table); + const int32_t idx_table = index % threads_per_table; + const int64_t start = idx_table * chunk4s_per_thread; + const int64_t end = min(start + chunk4s_per_thread, total_chunk4s_per_table); + + if (start >= total_chunk4s_per_table) { + return; + } + + int32_t D_emb = D; + if constexpr (std::is_same_v) { + D_emb += kINT8QparamsBytes; + } + + at::acc_type* __restrict__ momentum1; + const auto momentum1_placement = + static_cast(momentum1_placements[logical_id]); + int64_t momentum1_offset = momentum1_offsets[logical_id]; + if (momentum1_placement == PlacementType::DEVICE) { + momentum1 = &momentum1_dev[momentum1_offset]; + } else { + momentum1 = &momentum1_uvm[momentum1_offset]; + } + + emb_t* __restrict__ weights{nullptr}; + cache_t* __restrict__ cache_weights{nullptr}; + const auto weights_placement = + static_cast(weights_placements[logical_id]); + int64_t weights_offset = weights_offsets[logical_id]; + + const int64_t pruned_indices_offset = pruned_indices_offsets[buffer_id]; + const int64_t* pruned_indices_table = &pruned_indices[pruned_indices_offset]; + + for (int64_t i = start; i < end; i++) { + int64_t idx = i / chunk4s_per_row; + int64_t pruned_index = pruned_indices_table[idx]; + + if (weights_placement == PlacementType::DEVICE) { + weights = &dev_weights[weights_offset + pruned_index * D_emb]; + } else { + weights = &uvm_weights[weights_offset + pruned_index * D_emb]; + } + if (weights_placement == PlacementType::MANAGED_CACHING) { + int32_t cache_idx = lxu_cache_locations[pruned_indices_offset + idx]; + if (cache_idx != kCacheLocationMissing) { + cache_weights = &lxu_cache_weights[cache_idx][0]; + } + } + + auto weight_row_template = + WeightRow>( + weights, cache_weights, D, nullptr); + + // reset momentum1 + const int32_t d = (i % chunk4s_per_row) * 4; + if (d == 0) { + momentum1[pruned_index] = 0; + } + + // reset weight + float2 qparams_new = {1.0, 0.0}; // scaler=1.0, and offset=0.0, for int8. 
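+ // Each loop iteration i owns one 4-element chunk of a pruned row:
+ // idx = i / chunk4s_per_row selects the row and d = (i % chunk4s_per_row) * 4
+ // selects the column. The thread owning chunk 0 (d == 0) has already zeroed
+ // momentum1 above, and the zero-initialized weight_new below clears the
+ // weights for this chunk (qparams_new is only consulted for the int8 path).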
+ Vec4T> weight_new; // 0 weight + weight_row_template.store( + weight_new, + d, + qparams_new); // qparams_new not used if type is not int8 + } +} + +} // namespace + +DLL_PUBLIC void reset_weight_momentum_cuda( + Tensor dev_weights, + Tensor uvm_weights, + Tensor lxu_cache_weights, + Tensor weights_placements, + Tensor weights_offsets, + Tensor momentum1_dev, + Tensor momentum1_uvm, + Tensor momentum1_placements, + Tensor momentum1_offsets, + Tensor D_offsets, + Tensor pruned_indices, + Tensor pruned_indices_offsets, + Tensor logical_table_ids, + Tensor buffer_ids, + Tensor cache_hash_size_cumsum, + Tensor lxu_cache_state, + int64_t total_cache_hash_size) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + dev_weights, + uvm_weights, + lxu_cache_weights, + weights_placements, + weights_offsets, + momentum1_dev, + momentum1_uvm, + momentum1_placements, + momentum1_offsets, + D_offsets, + pruned_indices, + pruned_indices_offsets, + logical_table_ids, + buffer_ids, + cache_hash_size_cumsum, + lxu_cache_state); + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(dev_weights.get_device()); + + const int64_t num_pruned_indices = pruned_indices.size(0); + const int32_t num_pruned_tables = buffer_ids.size(0); + const int32_t blocks_per_table = get_sm_count_(); + + auto lxu_cache_locations = + at::zeros({num_pruned_indices}, pruned_indices.options().dtype(at::kInt)); + lxu_cache_locations.fill_(kCacheLocationMissing); + + if (total_cache_hash_size > 0) { + // Get corresponding cache indices of pruned indices + auto linear_cache_indices = at::zeros( + {num_pruned_indices}, pruned_indices.options().dtype(at::kLong)); + +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "get_cache_indices_kernel"; +#endif + + get_cache_indices_kernel<<< + num_pruned_tables * blocks_per_table, + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + blocks_per_table, + MAKE_PTA_WITH_NAME(func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, pruned_indices, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, pruned_indices_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, logical_table_ids, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, buffer_ids, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, int64_t, 1, 32)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + // Look up cache locations + Tensor uvm_cache_stats = + at::empty({0}, lxu_cache_weights.options().dtype(at::kInt)); + lxu_cache_locations = lxu_cache_lookup_cuda( + linear_cache_indices, + lxu_cache_state, + total_cache_hash_size, + false, // gather_cache_stats + uvm_cache_stats); + } + + // Reset weight and momentum of pruned rows + DISPATCH_EMB_CACHE_TYPES( + dev_weights.scalar_type(), + lxu_cache_weights.scalar_type(), + "reset_weight_momentum_kernel", + ([&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name2 = "get_cache_indices_kernel"; +#endif + reset_weight_momentum_kernel + <<>>( + blocks_per_table, + MAKE_PTA_WITH_NAME(func_name2, dev_weights, emb_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name2, uvm_weights, emb_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name2, lxu_cache_weights, cache_t, 2, 64), + MAKE_PTA_WITH_NAME( + func_name2, weights_placements, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name2, weights_offsets, int64_t, 1, 32), + MAKE_PTA_ACC_WITH_NAME( + func_name2, momentum1_dev, cache_t, 1, 64), + MAKE_PTA_ACC_WITH_NAME( + func_name2, momentum1_uvm, cache_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name2, momentum1_placements, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name2, 
momentum1_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name2, D_offsets, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name2, pruned_indices, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name2, pruned_indices_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name2, logical_table_ids, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name2, buffer_ids, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name2, lxu_cache_locations, int32_t, 1, 32)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + })); +} diff --git a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu deleted file mode 100644 index ce33c5fcd7..0000000000 --- a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu +++ /dev/null @@ -1,3201 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// clang-format off -#include "fbgemm_gpu/cub_namespace_prefix.cuh" -#include -#include -#include -#include -#include "fbgemm_gpu/cub_namespace_postfix.cuh" -// clang-format on - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "fbgemm_gpu/dispatch_macros.h" -#include "fbgemm_gpu/embedding_common.h" -#include "fbgemm_gpu/fbgemm_cuda_utils.cuh" -#include "fbgemm_gpu/fbgemm_tensor_accessor.h" -#include "fbgemm_gpu/ops_utils.h" -#include "fbgemm_gpu/sparse_ops_utils.h" -#include "fbgemm_gpu/split_embeddings_utils.cuh" - -constexpr size_t kCacheMaxThreads = 512; - -using Tensor = at::Tensor; - -using namespace fbgemm_gpu; - -namespace { - -// // TODO: do we care about 64-bit indices? Currently we just ignore. -// __host__ DEVICE_INLINE uint32_t cache_slot(int32_t h_in, int32_t C) { -// // MurmorHash3 32-bit mixing function. -// uint32_t h = (uint32_t)h_in; -// h ^= h >> 16; -// h *= 0x85ebca6b; -// h ^= h >> 13; -// h *= 0xc2b2ae35; -// h ^= h >> 16; -// // -// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ -// return ((uint64_t)h * (uint64_t)C) >> 32; -// } - -__host__ DEVICE_INLINE uint32_t -cache_slot(const int64_t h_in, const int32_t C) { - // MurmurHash3 64-bit mixing function. - uint64_t h = (uint64_t)h_in; - h ^= h >> 33; - h *= 0xff51afd7ed558ccd; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53; - h ^= h >> 33; - - return h % (uint32_t)C; -} - -enum uvm_cache_stats_index { - num_calls = 0, - num_requested_indices = 1, - num_unique_indices = 2, - num_unique_misses = 3, - num_conflict_unique_misses = 4, - num_conflict_misses = 5, -}; - -// Experiments showed that performance of lru/lxu_cache_find_uncached_kernel is -// not sensitive to grid size as long as the number thread blocks per SM is not -// too small nor too big. 
-constexpr int MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS = 16; - -int get_max_thread_blocks_for_cache_kernels_() { - return get_device_sm_cnt_() * MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS; -} - -} // namespace - -DLL_PUBLIC int64_t host_lxu_cache_slot(int64_t h_in, int64_t C) { - return static_cast(cache_slot(h_in, static_cast(C))); -} - -namespace { - -constexpr int32_t kCacheLocationMissing = -1; -constexpr int64_t kCacheStateInvalid = -1; - -template -__global__ __launch_bounds__(kMaxThreads) void lxu_cache_flush_kernel( - pta::PackedTensorAccessor64 weights, - const pta::PackedTensorAccessor32 - cache_hash_size_cumsum, - const pta::PackedTensorAccessor64 - cache_index_table_map, - const pta::PackedTensorAccessor32 - weights_offsets, - const pta::PackedTensorAccessor32 - D_offsets, - const pta::PackedTensorAccessor32 - lxu_cache_state, - pta::PackedTensorAccessor64 - lxu_cache_weights, - bool stochastic_rounding, - at::PhiloxCudaState stochastic_rounding_philox_args) { - const int32_t B = lxu_cache_weights.size(0); - const int32_t b = blockIdx.x * blockDim.y + threadIdx.y; - if (b >= B) { - return; - } - const int32_t slot = b % kWarpSize; - const int32_t cache_set = b / kWarpSize; - const int64_t current_idx = lxu_cache_state[cache_set][slot]; - if (current_idx != static_cast(kCacheStateInvalid)) { - // evict from slot to backing storage - const int32_t t_current = cache_index_table_map[current_idx]; - const int64_t idx_current = current_idx - cache_hash_size_cumsum[t_current]; - const int64_t weights_offset_current = weights_offsets[t_current]; - const int32_t D_start_current = D_offsets[t_current]; - const int32_t D_end_current = D_offsets[t_current + 1]; - const int32_t D_current = D_end_current - D_start_current; - - int32_t D_emb = D_current; - if constexpr (std::is_same_v) { - D_emb += kINT8QparamsBytes; - } - auto weight_row = WeightRow>( - &weights[weights_offset_current + idx_current * D_emb + 0], - &lxu_cache_weights[b][0], - D_current, - nullptr); - - weight_row.set_stochastic_rounding( - stochastic_rounding, - stochastic_rounding_philox_args, - blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + - threadIdx.x); - - float2 qparams; - if (std::is_same::value) { - qparams = - thrust_find_qparams(&lxu_cache_weights[b][0], D_current); - if (threadIdx.x == 0) { - weight_row.store_qparams(qparams); - } - } - for (int32_t d = threadIdx.x; d * 4 < D_current; d += blockDim.x) { - Vec4T> cache_weights_vec = - weight_row.load(d * 4, qparams); - weight_row.evict(cache_weights_vec, d * 4, qparams); - } - } -} - -} // namespace - -DLL_PUBLIC void lxu_cache_flush_cuda( - Tensor uvm_weights, - Tensor cache_hash_size_cumsum, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor D_offsets, - int64_t total_D, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - bool stochastic_rounding) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - uvm_weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - D_offsets, - lxu_cache_state, - lxu_cache_weights); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(lxu_cache_weights.get_device()); - - const int32_t T = D_offsets.numel() - 1; - const int32_t S = lxu_cache_weights.size(0); - const int32_t tx = std::min(total_D / 4 / T, kMaxThreads); - const dim3 threads(tx, kMaxThreads / tx); - const dim3 blocks(div_round_up(S, kMaxThreads / tx)); - - DISPATCH_EMB_CACHE_TYPES( - uvm_weights.scalar_type(), - lxu_cache_weights.scalar_type(), - "lxu_cache_flush_kernel_2", - ([&] { - 
at::PhiloxCudaState rng_engine_inputs; - if (stochastic_rounding && std::is_same::value) { - auto gen = at::cuda::detail::getDefaultCUDAGenerator(); - std::lock_guard lock(gen.mutex()); - rng_engine_inputs = at::check_generator(gen) - ->philox_cuda_state(4); - } -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lxu_cache_flush_kernel"; -#endif - lxu_cache_flush_kernel - <<>>( - MAKE_PTA_WITH_NAME(func_name, uvm_weights, emb_t, 1, 64), - MAKE_PTA_WITH_NAME( - func_name, cache_hash_size_cumsum, int64_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, cache_index_table_map, int32_t, 1, 64), - MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - MAKE_PTA_WITH_NAME( - func_name, lxu_cache_weights, cache_t, 2, 64), - stochastic_rounding, - rng_engine_inputs); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - })); -} - -namespace { - -template -__global__ __launch_bounds__(kMaxThreads) void linearize_cache_indices_kernel( - const pta::PackedTensorAccessor32 - cache_hash_size_cumsum, - const pta::PackedTensorAccessor32 - indices, - const pta::PackedTensorAccessor32 - table_offsets, - pta::PackedTensorAccessor32 - linear_cache_indices) { - const index_t index = blockIdx.x * blockDim.x + threadIdx.x; - if (index >= indices.size(0)) { - return; - } - - // Perform binary search. - int left = 0; - int right = table_offsets.size(0); - while (left != right) { - const int middle = - left + (right - left) / 2; // Avoid overflow in midpoint calculation - if (table_offsets[middle] <= index) { - left = middle + 1; - } else { - right = middle; - } - } - const int table_index = left; - - const auto max_offset = - ::__ldg(&cache_hash_size_cumsum[cache_hash_size_cumsum.size(0) - 1]); - const auto curr_offset = ::__ldg(&cache_hash_size_cumsum[table_index]); - if (curr_offset >= 0 && indices[index] >= 0) { - linear_cache_indices[index] = indices[index] + curr_offset; - } else { - // Either table index is wrong, or index value is negative (due to pruning): - // set it to invalid value. 
- linear_cache_indices[index] = max_offset; - } -} - -} // namespace - -DLL_PUBLIC Tensor linearize_cache_indices_cuda( - Tensor cache_hash_size_cumsum, - Tensor indices, - Tensor offsets) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - cache_hash_size_cumsum, indices, offsets); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(cache_hash_size_cumsum.get_device()); - - const auto T = cache_hash_size_cumsum.size(0) - 1; - TORCH_CHECK(T > 0); - // offsets = [B x T + 1] - const auto B = (offsets.size(0) - 1) / T; - TORCH_CHECK(B >= 0); - - auto linear_cache_indices = at::empty_like(indices); - const auto num_indices = indices.numel(); - if (B == 0 || num_indices == 0) { - return linear_cache_indices; - } - - auto table_offsets = offsets.slice(0, B, B * T, B); - - AT_DISPATCH_INDEX_TYPES( - indices.scalar_type(), "linearize_cache_indices_kernel", [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "linearize_cache_indices_kernel"; -#endif - linearize_cache_indices_kernel<<< - div_round_up(num_indices, kMaxThreads), - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME( - func_name, cache_hash_size_cumsum, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, table_offsets, index_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, linear_cache_indices, index_t, 1, 32)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - return linear_cache_indices; -} - -namespace { - -template -__global__ -__launch_bounds__(kMaxThreads) void linearize_cache_indices_from_row_idx_kernel( - const pta::PackedTensorAccessor32 - cache_hash_size_cumsum, - const pta::PackedTensorAccessor32 - update_table_indices, - const pta::PackedTensorAccessor32 - update_row_indices, - pta::PackedTensorAccessor32 - linear_cache_indices) { - const index_t index = blockIdx.x * blockDim.x + threadIdx.x; - if (index >= update_row_indices.size(0)) { - return; - } - const int table_index = update_table_indices[index]; - - const auto max_offset = - ::__ldg(&cache_hash_size_cumsum[cache_hash_size_cumsum.size(0) - 1]); - const auto curr_offset = ::__ldg(&cache_hash_size_cumsum[table_index]); - if (curr_offset >= 0 && update_row_indices[index] >= 0) { - linear_cache_indices[index] = update_row_indices[index] + curr_offset; - } else { - // Either table index is wrong, or index value is negative (due to pruning): - // set it to invalid value. 
- linear_cache_indices[index] = max_offset; - } -} - -} // namespace - -DLL_PUBLIC Tensor linearize_cache_indices_from_row_idx_cuda( - Tensor cache_hash_size_cumsum, - Tensor update_table_indices, - Tensor update_row_indices) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - cache_hash_size_cumsum, update_table_indices, update_row_indices); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(cache_hash_size_cumsum.get_device()); - - const auto T = cache_hash_size_cumsum.size(0) - 1; - TORCH_CHECK(T > 0); - - auto linear_cache_indices = at::empty_like(update_row_indices); - const auto num_indices = update_row_indices.numel(); - if (num_indices == 0) { - return linear_cache_indices; - } - - AT_DISPATCH_INDEX_TYPES( - update_row_indices.scalar_type(), - "linearize_cache_indices_from_row_idx_kernel", - [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "linearize_cache_indices_from_row_idx_kernel"; -#endif - linearize_cache_indices_from_row_idx_kernel<<< - div_round_up(num_indices, kMaxThreads), - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME( - func_name, cache_hash_size_cumsum, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, update_table_indices, index_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, update_row_indices, index_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, linear_cache_indices, index_t, 1, 32)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - return linear_cache_indices; -} - -DLL_PUBLIC std::tuple> -get_unique_indices_cuda( - Tensor linear_indices, - int64_t max_indices, - bool compute_count) { - TENSOR_ON_CUDA_GPU(linear_indices); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(linear_indices.get_device()); - - TORCH_CHECK(linear_indices.numel() < std::numeric_limits::max()); - const int32_t N = linear_indices.numel(); - auto sorted_indices = at::empty_like(linear_indices); - auto unique_indices = at::empty_like(linear_indices); - auto unique_indices_length = - at::empty({1}, linear_indices.options().dtype(at::kInt)); - c10::optional unique_indices_count = c10::nullopt; - if (compute_count) { - unique_indices_count = at::empty( - {linear_indices.numel()}, linear_indices.options().dtype(at::kInt)); - } - AT_DISPATCH_INDEX_TYPES( - linear_indices.scalar_type(), "get_unique_indices_cuda", [&] { - // sort indices - size_t temp_storage_bytes_0 = 0; - AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortKeys( - nullptr, - temp_storage_bytes_0, - linear_indices.data_ptr(), - sorted_indices.data_ptr(), - N, - 0, - int(log2(float(max_indices + 1)) + 1), - at::cuda::getCurrentCUDAStream(), - false)); - auto temp_storage_0 = at::empty( - {static_cast(temp_storage_bytes_0)}, - linear_indices.options().dtype(at::kByte)); - AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortKeys( - temp_storage_0.data_ptr(), - temp_storage_bytes_0, - linear_indices.data_ptr(), - sorted_indices.data_ptr(), - N, - 0, - int(log2(float(max_indices + 1)) + 1), - at::cuda::getCurrentCUDAStream(), - false)); - // get unique indices - if (compute_count) { - size_t temp_storage_bytes_1 = 0; - AT_CUDA_CHECK( - FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRunLengthEncode::Encode( - nullptr, - temp_storage_bytes_1, - sorted_indices.data_ptr(), - unique_indices.data_ptr(), - unique_indices_count->data_ptr(), - unique_indices_length.data_ptr(), - N, - at::cuda::getCurrentCUDAStream(), - false)); - auto temp_storage_1 = at::empty( - {static_cast(temp_storage_bytes_1)}, - linear_indices.options().dtype(at::kByte)); - AT_CUDA_CHECK( - 
FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRunLengthEncode::Encode( - temp_storage_1.data_ptr(), - temp_storage_bytes_1, - sorted_indices.data_ptr(), - unique_indices.data_ptr(), - unique_indices_count->data_ptr(), - unique_indices_length.data_ptr(), - N, - at::cuda::getCurrentCUDAStream(), - false)); - } else { - size_t temp_storage_bytes_1 = 0; - AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceSelect::Unique( - nullptr, - temp_storage_bytes_1, - sorted_indices.data_ptr(), - unique_indices.data_ptr(), - unique_indices_length.data_ptr(), - N, - at::cuda::getCurrentCUDAStream(), - false)); - auto temp_storage_1 = at::empty( - {static_cast(temp_storage_bytes_1)}, - linear_indices.options().dtype(at::kByte)); - AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceSelect::Unique( - temp_storage_1.data_ptr(), - temp_storage_bytes_1, - sorted_indices.data_ptr(), - unique_indices.data_ptr(), - unique_indices_length.data_ptr(), - N, - at::cuda::getCurrentCUDAStream(), - false)); - } - }); - return std::make_tuple( - unique_indices, unique_indices_length, unique_indices_count); -} - -namespace { - -__global__ __launch_bounds__(kMaxThreads) void emulate_cache_miss_kernel( - pta::PackedTensorAccessor32 - lxu_cache_locations, - const int64_t enforced_misses_per_256, - const bool gather_cache_stats, - pta::PackedTensorAccessor32 - uvm_cache_stats) { - const int32_t N = lxu_cache_locations.size(0); - int64_t n_enforced_misses = 0; - CUDA_KERNEL_LOOP(n, N) { - if ((n & 0x00FF) < enforced_misses_per_256) { - if (lxu_cache_locations[n] >= 0) { - n_enforced_misses++; - } - lxu_cache_locations[n] = kCacheLocationMissing; - } - } - if (gather_cache_stats && n_enforced_misses > 0) { - atomicAdd( - &uvm_cache_stats[uvm_cache_stats_index::num_conflict_misses], - n_enforced_misses); - } -} -} // namespace - -DLL_PUBLIC Tensor emulate_cache_miss( - Tensor lxu_cache_locations, - const int64_t enforced_misses_per_256, - const bool gather_cache_stats, - Tensor uvm_cache_stats) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - lxu_cache_locations, uvm_cache_stats); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(lxu_cache_locations.get_device()); - - const auto N = lxu_cache_locations.numel(); - if (N == 0) { - // nothing to do - return lxu_cache_locations; - } - - const dim3 blocks(std::min( - div_round_up(N, kMaxThreads), - get_max_thread_blocks_for_cache_kernels_())); - -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "emulate_cache_miss_kernel"; -#endif - - emulate_cache_miss_kernel<<< - blocks, - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), - enforced_misses_per_256, - gather_cache_stats, - MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return lxu_cache_locations; -} - -namespace { -// count the number of times that a cache_slot appears in lxu_cache_locations -// we actually only care about whether the number is 0 or > 0. 
-__global__ __launch_bounds__(kMaxThreads) void lxu_cache_locations_count_kernel( - pta::PackedTensorAccessor32 - lxu_cache_locations, - pta::PackedTensorAccessor32 count, - FixedDivisor fd) { - const int32_t N = lxu_cache_locations.size(0); - CUDA_KERNEL_LOOP(n, N) { - if (lxu_cache_locations[n] >= 0) { - int32_t cache_set; - int32_t slot; - fd.DivMod(lxu_cache_locations[n], &cache_set, &slot); - atomicAdd(&count[cache_set][slot], 1); - } - } -} - -// if a cache_slot is in lxu_cache_locations (count > 0), -// decrement the counter of that cache_slot. -__global__ -__launch_bounds__(kMaxThreads) void lxu_cache_locking_counter_decrement_kernel( - pta::PackedTensorAccessor32 - lxu_cache_locking_counter, - pta::PackedTensorAccessor32 count) { - const int32_t C = lxu_cache_locking_counter.size(0); - for (int32_t i = blockIdx.x * blockDim.y + threadIdx.y; i < C; - i += gridDim.x * blockDim.y) { - const auto j = threadIdx.x; - if (count[i][j] > 0) { - lxu_cache_locking_counter[i][j] -= 1; - } - } -} -} // namespace - -// for any cache_slot in lxu_cache_locations, -// decrement the counter of that cache_slot. -// duplicate cache_slot only decrement once. -void lxu_cache_locking_counter_decrement_cuda( - at::Tensor lxu_cache_locking_counter, - at::Tensor lxu_cache_locations) { - TENSOR_ON_CUDA_GPU(lxu_cache_locking_counter); - TENSOR_ON_CUDA_GPU(lxu_cache_locations); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(lxu_cache_locations.get_device()); - - const auto N = lxu_cache_locations.numel(); - if (N == 0) { - return; - } - - auto count = at::zeros_like(lxu_cache_locking_counter); - const int32_t C = lxu_cache_locking_counter.size(0); - TORCH_CHECK(lxu_cache_locking_counter.size(1) == kWarpSize); - auto fd = FixedDivisor(kWarpSize); - - const dim3 blocks(std::min( - div_round_up(N, kMaxThreads), - get_max_thread_blocks_for_cache_kernels_())); - -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lxu_cache_locations_count_kernel"; -#endif - - lxu_cache_locations_count_kernel<<< - blocks, - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, count, int32_t, 2, 32), - fd); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name2 = "lxu_cache_locking_counter_decrement_kernel"; -#endif - - lxu_cache_locking_counter_decrement_kernel<<< - std::min( - div_round_up(C, kMaxThreads / kWarpSize), - get_max_thread_blocks_for_cache_kernels_()), - dim3(kWarpSize, kMaxThreads / kWarpSize), - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name2, lxu_cache_locking_counter, int32_t, 2, 32), - MAKE_PTA_WITH_NAME(func_name2, count, int32_t, 2, 32)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); -} - -namespace { -template -__global__ __launch_bounds__(kMaxThreads) void lru_cache_find_uncached_kernel( - const pta::PackedTensorAccessor32 - unique_indices, - const int32_t* __restrict__ N_unique, - int64_t max_indices, - const pta::PackedTensorAccessor32 - lxu_cache_state, - pta::PackedTensorAccessor32 cache_sets, - int64_t time_stamp, - pta::PackedTensorAccessor32 lru_state, - const bool gather_cache_stats, - pta::PackedTensorAccessor32 - uvm_cache_stats, - const bool lock_cache_line, - pta::PackedTensorAccessor32 - lxu_cache_locking_counter) { - if (gather_cache_stats) { - if (blockIdx.x == 0 && threadIdx.x == 0 && threadIdx.y == 0) { - atomicAdd( - &uvm_cache_stats[uvm_cache_stats_index::num_calls], 1); // N_called. 
- atomicAdd( - &uvm_cache_stats[uvm_cache_stats_index::num_requested_indices], - unique_indices.size(0)); // N_requested_indices. - atomicAdd( - &uvm_cache_stats[uvm_cache_stats_index::num_unique_indices], - *N_unique); // N_unique_indices. - } - } - - const int32_t C = lxu_cache_state.size(0); - int32_t n_misses = 0; - - for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; - n += gridDim.x * blockDim.y) { - int64_t idx = unique_indices[n]; - if (idx == max_indices) { - // cache_sets are initialized with sentinel values in - // lru_cache_find_uncached_cuda - continue; - } - int32_t cache_set = cache_slot(idx, C); - - const auto slot = threadIdx.x; - const bool found = ::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx; - if (found) { - // mark it as recently accessed so we don't evict. - lru_state[cache_set][slot] = time_stamp; - if (lock_cache_line) { - lxu_cache_locking_counter[cache_set][slot] += 1; - } - } - -#ifdef __HIP_PLATFORM_HCC__ - if (!__any_sync(0xFFFFFFFFFFFFFFFF, found)) { -#else - if (!__any_sync(0xFFFFFFFF, found)) { -#endif - if (threadIdx.x == 0) { - cache_sets[n] = cache_set; - n_misses++; - } - } - } - if (gather_cache_stats && threadIdx.x == 0) { - atomicAdd( - &uvm_cache_stats[uvm_cache_stats_index::num_unique_misses], - n_misses); // N_unique_misses. - } -} - -template -__global__ -__launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_find_uncached_kernel( - const pta::PackedTensorAccessor32 - linear_cache_indices, - pta::PackedTensorAccessor32 cache_sets, - const int64_t max_indices, - const pta::PackedTensorAccessor32 - lxu_cache_state, - const int64_t time_stamp, - pta::PackedTensorAccessor32 lru_state, - const bool gather_cache_stats, - pta::PackedTensorAccessor32 - uvm_cache_stats, - pta::PackedTensorAccessor32 - lxu_cache_miss_timestamp) { - const int32_t N = linear_cache_indices.size(0); - const int32_t C = lxu_cache_state.size(0); - - if (gather_cache_stats) { - if (blockIdx.x == 0 && threadIdx.x == 0) { - atomicAdd( - &uvm_cache_stats[uvm_cache_stats_index::num_calls], 1); // N_called. - atomicAdd( - &uvm_cache_stats[uvm_cache_stats_index::num_requested_indices], - N); // N_requested_indices. - } - } - - CUDA_KERNEL_LOOP(n, N) { - int64_t idx = linear_cache_indices[n]; - if (idx == max_indices) { - // Invalid or pruned row: set it to sentinel value. - // 32-way uses C as the sentinel value to reduce the maximum value during - // radix sort to make it faster but for direct_mapped we use -1 - cache_sets[n] = -1; - continue; - } - int32_t cache_set = cache_slot(idx, C); - - const bool found = ::__ldg((&lxu_cache_state[cache_set][0])) == idx; - if (found) { - // After all threads run, timestamp will be current timestamp - // if any idx was hit - // +1 because AMD doesn't have atomicMax for signed long so we should - // initialize lxu_cache_miss_timestamp with 0 vs. -1. - lru_state[cache_set][0] = time_stamp; - cache_sets[n] = -1; // sentinel value - } else { - // There is no atomicMax for int64_t... -#ifdef __HIP_PLATFORM_HCC__ - auto addr = reinterpret_cast( - &lxu_cache_miss_timestamp[cache_set][0]); - auto val = static_cast(time_stamp + 1); - auto old = static_cast(atomicMax(addr, val)); -#else - auto addr = reinterpret_cast( - &lxu_cache_miss_timestamp[cache_set][0]); - auto val = static_cast(time_stamp + 1); - auto old = static_cast(atomicMax(addr, val)); -#endif - - if (old < time_stamp + 1) { - // This is the lucky thread that gets to insert its idx in the cache - // slot. 
So the number of elements in cache_sets array that has the - // value of cache_set is 1 at maximum - cache_sets[n] = cache_set; - } else { - // Otherwise (too late to get this set) - // set it to sentinel value. - cache_sets[n] = -1; - } - } - } -} -} // namespace - -DLL_PUBLIC std::pair lru_cache_find_uncached_cuda( - Tensor unique_indices, - Tensor unique_indices_length, - int64_t max_indices, - Tensor lxu_cache_state, - int64_t time_stamp, - Tensor lru_state, - bool gather_cache_stats, - Tensor uvm_cache_stats, - bool lock_cache_line, - Tensor lxu_cache_locking_counter) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - unique_indices, - unique_indices_length, - lxu_cache_state, - lru_state, - uvm_cache_stats, - lxu_cache_locking_counter); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(unique_indices.get_device()); - - // Fill with sentinel value - auto cache_sets = full_like( - unique_indices, - lxu_cache_state.size(0), - unique_indices.options().dtype(at::kInt)); - const int32_t N = unique_indices.numel(); - auto sorted_cache_sets = empty_like(cache_sets); - auto cache_set_sorted_unique_indices = empty_like(unique_indices); - - AT_DISPATCH_INDEX_TYPES( - unique_indices.scalar_type(), "lru_cache_find_uncached_cuda", [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lru_cache_find_uncached_kernel"; -#endif - // Find uncached indices - lru_cache_find_uncached_kernel<<< - std::min( - div_round_up(N, kMaxThreads / kWarpSize), - get_max_thread_blocks_for_cache_kernels_()), - dim3(kWarpSize, kMaxThreads / kWarpSize), - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), - unique_indices_length.data_ptr(), - max_indices, - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - MAKE_PTA_WITH_NAME(func_name, cache_sets, int32_t, 1, 32), - time_stamp, - MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), - gather_cache_stats, - MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), - lock_cache_line, - MAKE_PTA_WITH_NAME( - func_name, lxu_cache_locking_counter, int32_t, 2, 32)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - // Sort the cache sets and ids - size_t temp_storage_bytes = 0; - AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( - nullptr, - temp_storage_bytes, - cache_sets.data_ptr(), - sorted_cache_sets.data_ptr(), - unique_indices.data_ptr(), - cache_set_sorted_unique_indices.data_ptr(), - N, - 0, - int(log2(float(lxu_cache_state.size(0) + 1)) + 1), - at::cuda::getCurrentCUDAStream(), - false)); - auto temp_storage = at::empty( - {static_cast(temp_storage_bytes)}, - unique_indices.options().dtype(at::kByte)); - AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( - temp_storage.data_ptr(), - temp_storage_bytes, - cache_sets.data_ptr(), - sorted_cache_sets.data_ptr(), - unique_indices.data_ptr(), - cache_set_sorted_unique_indices.data_ptr(), - N, - 0, - int(log2(float(lxu_cache_state.size(0) + 1)) + 1), - at::cuda::getCurrentCUDAStream(), - false)); - }); - return {sorted_cache_sets, cache_set_sorted_unique_indices}; -} - -namespace { - -Tensor direct_mapped_lru_cache_find_uncached_cuda( - Tensor linear_cache_indices, - int64_t max_indices, - Tensor lxu_cache_state, - int64_t time_stamp, - Tensor lru_state, - Tensor lxu_cache_miss_timestamp, - bool gather_cache_stats, - Tensor uvm_cache_stats) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - linear_cache_indices, - lxu_cache_state, - lru_state, - lxu_cache_miss_timestamp); - - 
at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(linear_cache_indices.get_device()); - - const int32_t N = linear_cache_indices.numel(); - - auto cache_sets = empty_like( - linear_cache_indices, linear_cache_indices.options().dtype(at::kInt)); - - AT_DISPATCH_INDEX_TYPES( - linear_cache_indices.scalar_type(), - "direct_mapped_lru_cache_find_uncached_cuda", - [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "direct_mapped_lru_cache_find_uncached_kernel"; -#endif - // Find uncached indices - direct_mapped_lru_cache_find_uncached_kernel<<< - std::min( - div_round_up(N, kMaxThreads), - get_max_thread_blocks_for_cache_kernels_()), - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, cache_sets, int32_t, 1, 32), - max_indices, - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - time_stamp, - MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), - gather_cache_stats, - MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, lxu_cache_miss_timestamp, int64_t, 2, 32)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - - return cache_sets; -} - -template -__global__ __launch_bounds__(kMaxThreads) void lru_cache_insert_kernel( - pta::PackedTensorAccessor64 weights, - const pta::PackedTensorAccessor32 - cache_hash_size_cumsum, - const pta::PackedTensorAccessor64 - cache_index_table_map, - const pta::PackedTensorAccessor32 - weights_offsets, - const pta::PackedTensorAccessor32 - D_offsets, - const pta::PackedTensorAccessor32 - sorted_cache_sets, - const pta::PackedTensorAccessor32 - cache_set_sorted_indices, - const int32_t* __restrict__ N_unique, - pta::PackedTensorAccessor32 - lxu_cache_state, - pta::PackedTensorAccessor64 - lxu_cache_weights, - const int64_t time_stamp, - pta::PackedTensorAccessor32 lru_state, - const bool stochastic_rounding, - at::PhiloxCudaState stochastic_rounding_philox_args, - const bool gather_cache_stats, - pta::PackedTensorAccessor32 - uvm_cache_stats, - const bool lock_cache_line, - pta::PackedTensorAccessor32 - lxu_cache_locking_counter) { - const int32_t C = lxu_cache_state.size(0); - int32_t n_conflict_misses = 0; - for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; - n += gridDim.x * blockDim.y) { - // check if this warp is responsible for this whole segment. - const bool segment_start = - (n == 0 || sorted_cache_sets[n - 1] != sorted_cache_sets[n]); - - if (!segment_start) { - // don't have *warp* divergence since we launch full warps in blockDim.x, - // so we can just exit this warp entirely. - continue; - } - const int32_t cache_set = sorted_cache_sets[n]; - if (cache_set == C) { - // ignore the already-existing elements - continue; - } - - int32_t SL = 1; - while (n + SL < *N_unique && sorted_cache_sets[n + SL] == cache_set) { - SL += 1; - } - int32_t n_inserted = 0; // also used as index to insert - - // now, we need to insert the (unique!) values in indices[n:n + SL] into - // our slots. 
- const int32_t slot = threadIdx.x; - const int64_t slot_time = lru_state[cache_set][slot]; - int64_t costs[1] = {slot_time}; - int32_t slots[1] = {slot}; - - BitonicSort>::sort(costs, slots); - const int32_t sorted_slot = slots[0]; - const int64_t sorted_lru_cost = costs[0]; - - for (int32_t l = 0; l < min(SL, kWarpSize); ++l) { - const int32_t insert_slot = shfl_sync(sorted_slot, l); - if (lock_cache_line) { - auto count = lxu_cache_locking_counter[cache_set][insert_slot]; - if (count > 0) { - continue; // cache slot is in use - } - } - const int64_t insert_current_lru_cost = shfl_sync(sorted_lru_cost, l); - if (insert_current_lru_cost == time_stamp) { - break; - } - const int64_t insert_idx = cache_set_sorted_indices[n + n_inserted]; - const int32_t t_insert = cache_index_table_map[insert_idx]; - const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; - const int64_t weights_offset_insert = weights_offsets[t_insert]; - const int32_t D_start_insert = D_offsets[t_insert]; - const int32_t D_end_insert = D_offsets[t_insert + 1]; - const int32_t D_insert = D_end_insert - D_start_insert; - - // ensure that threadIdx.x is the only thread reading/writing to - // lxu_cache_state - int64_t current_idx = - threadIdx.x == 0 ? lxu_cache_state[cache_set][insert_slot] : 0; - current_idx = shfl_sync(current_idx, 0); - - // not empty - if (current_idx != static_cast(kCacheStateInvalid)) { - // evict from slot to backing storage - const int32_t t_current = cache_index_table_map[current_idx]; - const int64_t idx_current = - current_idx - cache_hash_size_cumsum[t_current]; - const int64_t weights_offset_current = weights_offsets[t_current]; - const int32_t D_start_current = D_offsets[t_current]; - const int32_t D_end_current = D_offsets[t_current + 1]; - const int32_t D_current = D_end_current - D_start_current; - int32_t D_emb = D_current; - if constexpr (std::is_same_v) { - D_emb += kINT8QparamsBytes; - } - - auto weight_row = WeightRow( - &weights[weights_offset_current + idx_current * D_emb + 0], - &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0], - D_current, - nullptr); - - weight_row.set_stochastic_rounding( - stochastic_rounding, - stochastic_rounding_philox_args, - (blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + - threadIdx.x) * - kWarpSize + - l); - - weight_row.warp_evict(D_current, blockDim.x, threadIdx.x); - } - - int32_t D_emb = D_insert; - if constexpr (std::is_same_v) { - D_emb += kINT8QparamsBytes; - } - - auto weight_row_cache = WeightRow( - &weights[weights_offset_insert + idx_insert * D_emb + 0], - &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0], - D_insert, - nullptr); - - auto weight_row_emb = WeightRow( - &weights[weights_offset_insert + idx_insert * D_emb + 0], - nullptr, - D_insert, - nullptr); - - weight_row_emb.warp_copy_to( - weight_row_cache, D_insert, blockDim.x, threadIdx.x); - - if (threadIdx.x == 0) { - lxu_cache_state[cache_set][insert_slot] = insert_idx; - lru_state[cache_set][insert_slot] = time_stamp; - if (lock_cache_line) { - lxu_cache_locking_counter[cache_set][insert_slot] += 1; - } - } - - n_inserted++; - } - n_conflict_misses += (SL - n_inserted); - } - if (gather_cache_stats && n_conflict_misses > 0 && threadIdx.x == 0) { - atomicAdd( - &uvm_cache_stats[uvm_cache_stats_index::num_conflict_unique_misses], - n_conflict_misses); - } -} - -void lru_cache_insert_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor D_offsets, - Tensor 
sorted_cache_sets, - Tensor cache_set_sorted_unique_indices, - Tensor unique_indices_length, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - const int64_t time_stamp, - Tensor lru_state, - const bool stochastic_rounding, - bool gather_cache_stats, - Tensor uvm_cache_stats, - bool lock_cache_line, - Tensor lxu_cache_locking_counter) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - D_offsets, - sorted_cache_sets, - cache_set_sorted_unique_indices, - unique_indices_length, - lxu_cache_state, - lxu_cache_weights, - lru_state, - uvm_cache_stats, - lxu_cache_locking_counter); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - const int32_t N = cache_set_sorted_unique_indices.numel(); - - DISPATCH_EMB_CACHE_TYPES( - weights.scalar_type(), - lxu_cache_weights.scalar_type(), - "lru_cache_insert_kernel_2", - ([&] { - at::PhiloxCudaState rng_engine_inputs; - if (stochastic_rounding && !std::is_same::value) { - auto gen = at::cuda::detail::getDefaultCUDAGenerator(); - std::lock_guard lock(gen.mutex()); - rng_engine_inputs = at::check_generator(gen) - ->philox_cuda_state(4); - } - - // During concurrent prefetch, cache lines are locked and we use less - // SMs for some of the prefetch kernels (e.g. insert) - // since it is not SM bound. It leaves SMs for main stream to overlap - constexpr int ALL_TO_PREFETCH_SM_RATIO = 8; - - auto grid_size = lock_cache_line - ? div_round_up(get_device_sm_cnt_(), ALL_TO_PREFETCH_SM_RATIO) - : div_round_up(N, kMaxThreads / kWarpSize); - -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lru_cache_insert_kernel"; -#endif - lru_cache_insert_kernel - <<>>( - MAKE_PTA_WITH_NAME(func_name, weights, emb_t, 1, 64), - MAKE_PTA_WITH_NAME( - func_name, cache_hash_size_cumsum, int64_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, cache_index_table_map, int32_t, 1, 64), - MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, sorted_cache_sets, int32_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, cache_set_sorted_unique_indices, int64_t, 1, 32), - unique_indices_length.data_ptr(), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - MAKE_PTA_WITH_NAME( - func_name, lxu_cache_weights, cache_t, 2, 64), - time_stamp, - MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), - stochastic_rounding, - rng_engine_inputs, - gather_cache_stats, - MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), - lock_cache_line, - MAKE_PTA_WITH_NAME( - func_name, lxu_cache_locking_counter, int32_t, 2, 32)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - })); -} - -} // namespace - -DLL_PUBLIC void lru_cache_populate_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - const int64_t total_cache_hash_size, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor D_offsets, - Tensor linear_cache_indices, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - const int64_t time_stamp, - Tensor lru_state, - const bool stochastic_rounding, - bool gather_cache_stats, - c10::optional uvm_cache_stats, - bool lock_cache_line, - c10::optional lxu_cache_locking_counter) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - D_offsets, - linear_cache_indices, - lxu_cache_state, - lxu_cache_weights, - lru_state); - - Tensor uvm_cache_stats_ = at::empty({0}, 
weights.options().dtype(at::kInt)); - if (gather_cache_stats) { - TORCH_CHECK(uvm_cache_stats.has_value()); - uvm_cache_stats_ = uvm_cache_stats.value(); - TENSOR_ON_CUDA_GPU(uvm_cache_stats_); - } - - Tensor lxu_cache_locking_counter_ = - at::empty({0, 0}, lxu_cache_state.options().dtype(at::kInt)); - if (lock_cache_line) { - TORCH_CHECK(lxu_cache_locking_counter.has_value()); - lxu_cache_locking_counter_ = lxu_cache_locking_counter.value(); - TENSOR_ON_CUDA_GPU(lxu_cache_locking_counter_); - } - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - TORCH_CHECK( - linear_cache_indices.numel() < std::numeric_limits::max()); - if (linear_cache_indices.numel() == 0) { - // nothing to do - return; - } - - // Get unqiue indices - Tensor unique_indices; - Tensor unique_indices_length; - c10::optional unique_indices_count; - std::tie(unique_indices, unique_indices_length, unique_indices_count) = - get_unique_indices_cuda( - linear_cache_indices, total_cache_hash_size, false); - - auto cache_sets_and_unique_indices = lru_cache_find_uncached_cuda( - unique_indices, - unique_indices_length, - total_cache_hash_size, - lxu_cache_state, - time_stamp, - lru_state, - gather_cache_stats, - uvm_cache_stats_, - lock_cache_line, - lxu_cache_locking_counter_); - auto sorted_cache_sets = cache_sets_and_unique_indices.first; - auto cache_set_sorted_unique_indices = cache_sets_and_unique_indices.second; - - // insert caching weights - lru_cache_insert_cuda( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - D_offsets, - sorted_cache_sets, - cache_set_sorted_unique_indices, - unique_indices_length, - lxu_cache_state, - lxu_cache_weights, - time_stamp, - lru_state, - stochastic_rounding, - gather_cache_stats, - uvm_cache_stats_, - lock_cache_line, - lxu_cache_locking_counter_); -} - -namespace { - -template -__global__ __launch_bounds__(kMaxThreads) void lru_cache_insert_byte_kernel( - pta::PackedTensorAccessor64 weights, - const pta::PackedTensorAccessor32 - cache_hash_size_cumsum, - const pta::PackedTensorAccessor64 - cache_index_table_map, - const pta::PackedTensorAccessor32 - weights_offsets, - const pta::PackedTensorAccessor32 - weights_tys, - const pta::PackedTensorAccessor32 - D_offsets, - const pta::PackedTensorAccessor32 - sorted_cache_sets, - const pta::PackedTensorAccessor32 - cache_set_sorted_indices, - const int32_t* __restrict__ N_unique, - pta::PackedTensorAccessor32 - lxu_cache_state, - pta::PackedTensorAccessor64 - lxu_cache_weights, - int64_t time_stamp, - pta::PackedTensorAccessor32 lru_state, - const bool gather_cache_stats, - pta::PackedTensorAccessor32 - uvm_cache_stats, - const int64_t row_alignment) { - const int32_t C = lxu_cache_state.size(0); - int64_t n_conflict_misses = 0; - for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; - n += gridDim.x * blockDim.y) { - // check if this warp is responsible for this whole segment. - const bool segment_start = - (n == 0 || sorted_cache_sets[n - 1] != sorted_cache_sets[n]); - - if (!segment_start) { - // don't have *warp* divergence since we launch full warps in blockDim.x, - // so we can just exit this warp entirely. - continue; - } - const int32_t cache_set = sorted_cache_sets[n]; - if (cache_set == C) { - // ignore the already-existing elements - continue; - } - - int32_t SL = 1; - while (n + SL < *N_unique && sorted_cache_sets[n + SL] == cache_set) { - SL += 1; - } - int64_t n_inserted = 0; - - // now, we need to insert the (unique!) 
values in indices[n:n + SL] into - // our slots. - const int32_t slot = threadIdx.x; - const int64_t slot_time = lru_state[cache_set][slot]; - int64_t costs[1] = {slot_time}; - int32_t slots[1] = {slot}; - - BitonicSort>::sort(costs, slots); - const int32_t sorted_slot = slots[0]; - const int64_t sorted_lru_cost = costs[0]; - - for (int32_t l = 0; l < min(SL, kWarpSize); ++l) { - const int32_t insert_slot = shfl_sync(sorted_slot, l); - const int64_t insert_current_lru_cost = shfl_sync(sorted_lru_cost, l); - if (insert_current_lru_cost == time_stamp) { - break; - } - index_t insert_idx = cache_set_sorted_indices[n + l]; - const int32_t t_insert = cache_index_table_map[insert_idx]; - SparseType weight_ty_insert = - static_cast(weights_tys[t_insert]); - const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; - const int64_t weights_offset_insert = weights_offsets[t_insert]; - const int32_t D_start_insert = D_offsets[t_insert]; - const int32_t D_end_insert = D_offsets[t_insert + 1]; - const int32_t D_insert = D_end_insert - D_start_insert; - - const int32_t D_insert_bytes = nbit::padded_row_size_in_bytes( - D_insert, weight_ty_insert, row_alignment); - - // insert into cache. Note that nbit::padded_row_size_in_bytes pad each - // row with row_alignment (16 bytes on GPUs) So each row will be multiple - // of 16 bytes (uint4 = 32bit x 4 = 16 bytes). - auto row = reinterpret_cast( - &weights[weights_offset_insert + idx_insert * D_insert_bytes + 0]); - auto cache_row = reinterpret_cast( - &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0]); - for (int32_t d = threadIdx.x; d * sizeof(uint4) < D_insert_bytes; - d += blockDim.x) { - cache_row[d] = row[d]; - } - - if (threadIdx.x == 0) { - lxu_cache_state[cache_set][insert_slot] = insert_idx; - lru_state[cache_set][insert_slot] = time_stamp; - } - n_inserted++; - } - n_conflict_misses += (SL - n_inserted); - } - if (gather_cache_stats && n_conflict_misses > 0 && threadIdx.x == 0) { - atomicAdd( - &uvm_cache_stats[uvm_cache_stats_index::num_conflict_unique_misses], - n_conflict_misses); - } -} - -template -__global__ -__launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_insert_byte_kernel( - pta::PackedTensorAccessor64 weights, - const pta::PackedTensorAccessor32 - cache_hash_size_cumsum, - const pta::PackedTensorAccessor64 - cache_index_table_map, - const pta::PackedTensorAccessor32 - weights_offsets, - const pta::PackedTensorAccessor32 - weights_tys, - const pta::PackedTensorAccessor32 - D_offsets, - pta::PackedTensorAccessor32 - lxu_cache_state, - pta::PackedTensorAccessor64 - lxu_cache_weights, - int64_t time_stamp, - pta::PackedTensorAccessor32 lru_state, - const pta::PackedTensorAccessor32 - linear_cache_indices, - pta::PackedTensorAccessor32 - lxu_cache_miss_timestamp, - pta::PackedTensorAccessor32 cache_sets, - const bool gather_cache_stats, - pta::PackedTensorAccessor32 - uvm_cache_stats, - const int64_t row_alignment) { - const int32_t N = cache_sets.size(0); - - // one warp for each set (multiple times) - // (no divergence for each control branch) - for (int32_t pos = blockIdx.x * blockDim.y + threadIdx.y; pos < N; - pos += gridDim.x * blockDim.y) { - auto cache_set = cache_sets[pos]; - - if (cache_set == -1) { - // Cache hit, index invalid (e.g., pruned), or too late to grab this set. 
-      continue;
-    }
-
-    if (lru_state[cache_set][0] == time_stamp) {
-      // we have a missing index but
-      // current cache row is a hit
-      // so abort unnecessary insert
-      continue;
-    }
-
-    // no need to check because cache_sets[pos] != -1 only when it was the
-    // first one to set the buffer time_stamp
-    // if (lxu_cache_miss_timestamp[cache_set][0] != time_stamp) {
-    //   continue;
-    // }
-
-    if (gather_cache_stats && threadIdx.x == 0) {
-      // We are using this slot for a slightly different purpose.
-      // In 32 way:
-      //   UVM traffic for insert
-      //   = # of inserted rows
-      //   = # of unique misses - # of unique misses that were not inserted
-      //   = uvm_cache_stats_index::num_unique_misses
-      //     - uvm_cache_stats_index::num_conflict_unique_misses
-      // In Direct Mapped (here):
-      //   UVM traffic for insert
-      //   = # of inserted rows
-      //   = uvm_cache_stats_index::num_conflict_unique_misses
-      //     (just store here directly)
-      atomicAdd(
-          &uvm_cache_stats[uvm_cache_stats_index::num_conflict_unique_misses],
-          1);
-    }
-
-    // insert the index in the buffer into our only slot
-    const int32_t insert_slot = 0;
-
-    int64_t insert_idx = linear_cache_indices[pos];
-    const int32_t t_insert = cache_index_table_map[insert_idx];
-    SparseType weight_ty_insert =
-        static_cast<SparseType>(weights_tys[t_insert]);
-    const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert];
-    const int64_t weights_offset_insert = weights_offsets[t_insert];
-    const int32_t D_start_insert = D_offsets[t_insert];
-    const int32_t D_end_insert = D_offsets[t_insert + 1];
-    const int32_t D_insert = D_end_insert - D_start_insert;
-    const int32_t D_insert_bytes = nbit::padded_row_size_in_bytes(
-        D_insert, weight_ty_insert, row_alignment);
-
-    // insert into cache. Note that nbit::padded_row_size_in_bytes pad each
-    // row with row_alignment (16 bytes on GPUs) So each row will be multiple
-    // of 16 bytes (uint4 = 32bit x 4 = 16 bytes).
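// Worked example of the padding rule described above (illustration only,
// using a hypothetical unpadded size): a row that needs, say, 70 bytes is
// padded to round_up(70, 16) = 80 bytes, i.e. exactly 5 uint4 (16-byte)
// chunks, so the strided copy below always moves whole uint4 values and
// never needs a partial-chunk tail.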
- auto row = reinterpret_cast( - &weights[weights_offset_insert + idx_insert * D_insert_bytes + 0]); - auto cache_row = reinterpret_cast(&lxu_cache_weights[cache_set][0]); - for (int32_t d = threadIdx.x; d * sizeof(uint4) < D_insert_bytes; - d += blockDim.x) { - cache_row[d] = row[d]; - } - - if (threadIdx.x == 0) { - lxu_cache_state[cache_set][insert_slot] = insert_idx; - lru_state[cache_set][insert_slot] = time_stamp; - } - } -} - -void lru_cache_insert_byte_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor weights_tys, - Tensor D_offsets, - Tensor sorted_cache_sets, - Tensor cache_set_sorted_unique_indices, - Tensor unique_indices_length, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - int64_t time_stamp, - Tensor lru_state, - bool gather_cache_stats, - Tensor uvm_cache_stats, - int64_t row_alignment) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - weights_tys, - D_offsets, - sorted_cache_sets, - cache_set_sorted_unique_indices, - unique_indices_length, - lxu_cache_state, - lxu_cache_weights, - lru_state, - uvm_cache_stats); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - const int32_t N = cache_set_sorted_unique_indices.numel(); - - AT_DISPATCH_INDEX_TYPES( - cache_set_sorted_unique_indices.scalar_type(), - "lru_cache_insert_byte_cuda", - [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lru_cache_insert_byte_kernel"; -#endif - lru_cache_insert_byte_kernel<<< - std::min( - div_round_up(N, kMaxThreads / kWarpSize), - get_max_thread_blocks_for_cache_kernels_()), - dim3(kWarpSize, kMaxThreads / kWarpSize), - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, weights, uint8_t, 1, 64), - MAKE_PTA_WITH_NAME( - func_name, cache_hash_size_cumsum, int64_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, cache_index_table_map, int32_t, 1, 64), - MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, weights_tys, uint8_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, sorted_cache_sets, int32_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, cache_set_sorted_unique_indices, index_t, 1, 32), - unique_indices_length.data_ptr(), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_weights, uint8_t, 2, 64), - time_stamp, - MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), - gather_cache_stats, - MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), - row_alignment); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); -} - -void direct_mapped_lru_cache_insert_byte_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor weights_tys, - Tensor D_offsets, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - int64_t time_stamp, - Tensor lru_state, - Tensor linear_cache_indices, - Tensor lxu_cache_miss_timestamp, - Tensor cache_sets, - bool gather_cache_stats, - Tensor uvm_cache_stats, - int64_t row_alignment) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - weights_tys, - D_offsets, - lxu_cache_state, - lxu_cache_weights, - lru_state, - linear_cache_indices, - lxu_cache_miss_timestamp); - - at::cuda::OptionalCUDAGuard device_guard; - 
device_guard.set_index(weights.get_device()); - - const int32_t N = cache_sets.size(0); - - AT_DISPATCH_INDEX_TYPES( - linear_cache_indices.scalar_type(), - "direct_mapped_lru_cache_insert_byte_cuda", - [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "direct_mapped_lru_cache_insert_byte_kernel"; -#endif - direct_mapped_lru_cache_insert_byte_kernel<<< - std::min( - div_round_up(N, kMaxThreads / kWarpSize), - get_max_thread_blocks_for_cache_kernels_()), - dim3(kWarpSize, kMaxThreads / kWarpSize), - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, weights, uint8_t, 1, 64), - MAKE_PTA_WITH_NAME( - func_name, cache_hash_size_cumsum, int64_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, cache_index_table_map, int32_t, 1, 64), - MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, weights_tys, uint8_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_weights, uint8_t, 2, 64), - time_stamp, - MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32), - MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, lxu_cache_miss_timestamp, int64_t, 2, 32), - MAKE_PTA_WITH_NAME(func_name, cache_sets, int32_t, 1, 32), - gather_cache_stats, - MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32), - row_alignment); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); -} - -} // namespace - -DLL_PUBLIC void lru_cache_populate_byte_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - int64_t total_cache_hash_size, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor weights_tys, - Tensor D_offsets, - Tensor linear_cache_indices, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - int64_t time_stamp, - Tensor lru_state, - int64_t row_alignment, - bool gather_cache_stats, - c10::optional uvm_cache_stats) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - weights_tys, - D_offsets, - linear_cache_indices, - lxu_cache_state, - lxu_cache_weights, - lru_state); - - Tensor uvm_cache_stats_ = at::empty({0}, weights.options().dtype(at::kInt)); - if (gather_cache_stats) { - TORCH_CHECK(uvm_cache_stats.has_value()); - uvm_cache_stats_ = uvm_cache_stats.value(); - TENSOR_ON_CUDA_GPU(uvm_cache_stats_); - } - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - TORCH_CHECK( - linear_cache_indices.numel() < std::numeric_limits::max()); - if (linear_cache_indices.numel() == 0) { - // nothing to do - return; - } - - // Get unqiue indices - Tensor unique_indices; - Tensor unique_indices_length; - c10::optional unique_indices_count; - std::tie(unique_indices, unique_indices_length, unique_indices_count) = - get_unique_indices_cuda( - linear_cache_indices, total_cache_hash_size, false); - - // Find uncached indices - Tensor lxu_cache_locking_counter = - at::empty({0, 0}, lxu_cache_state.options().dtype(at::kInt)); - auto cache_sets_and_unique_indices = lru_cache_find_uncached_cuda( - unique_indices, - unique_indices_length, - total_cache_hash_size, - lxu_cache_state, - time_stamp, - lru_state, - gather_cache_stats, - uvm_cache_stats_, - false, // lock_cache_line - lxu_cache_locking_counter); - auto sorted_cache_sets = cache_sets_and_unique_indices.first; - auto cache_set_sorted_unique_indices = cache_sets_and_unique_indices.second; - - // insert 
caching weights - lru_cache_insert_byte_cuda( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - weights_tys, - D_offsets, - sorted_cache_sets, - cache_set_sorted_unique_indices, - unique_indices_length, - lxu_cache_state, - lxu_cache_weights, - time_stamp, - lru_state, - gather_cache_stats, - uvm_cache_stats_, - row_alignment); -} - -DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - int64_t total_cache_hash_size, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor weights_tys, - Tensor D_offsets, - Tensor linear_cache_indices, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - int64_t time_stamp, - Tensor lru_state, - Tensor lxu_cache_miss_timestamp, - int64_t row_alignment, - bool gather_cache_stats, - c10::optional uvm_cache_stats) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - weights_tys, - D_offsets, - linear_cache_indices, - lxu_cache_state, - lxu_cache_weights, - lru_state, - lxu_cache_miss_timestamp); - - if (gather_cache_stats) { - TORCH_CHECK(uvm_cache_stats.has_value()); - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - uvm_cache_stats, lxu_cache_weights); - } - auto uvm_cache_stats_ = uvm_cache_stats.value_or( - at::empty({0}, weights.options().dtype(at::kInt))); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - TORCH_CHECK( - linear_cache_indices.numel() < std::numeric_limits::max()); - if (linear_cache_indices.numel() == 0) { - // nothing to do - return; - } - - /* - populate_byte normal flow: - (1) get_unique (sort, dedup) - (2) find_uncached - (3) sort by set_idx - (4) insert rows - - merged kernels flow: - (1) find_uncached - No need for sorting. - Each hit idx will just update the timestamp in lru_state. - Only one of miss indices will atomically set miss_timestamp, - and have cache_sets[pos] = set - where pos is the position of that idx - in the linear_cache_indices array - After this, for each set, we either have - (a) lru_state timestamp is recent (hit) => no need to insert row - (b) lru_state timestamp is not recent (no hit) - (b-1) miss_timestamp is recent - => insert row for idx = linear_cache_indices[pos] - (b-2) insert_timestamp_buffer is not recent - => no need to insert since there was no miss idx this time - (2) insert rows - Use buffer info to insert rows as the above logic. 
- */ - - auto cache_sets = direct_mapped_lru_cache_find_uncached_cuda( - linear_cache_indices, - total_cache_hash_size, - lxu_cache_state, - time_stamp, - lru_state, - lxu_cache_miss_timestamp, - gather_cache_stats, - uvm_cache_stats_); - - // insert caching weights - direct_mapped_lru_cache_insert_byte_cuda( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - weights_tys, - D_offsets, - lxu_cache_state, - lxu_cache_weights, - time_stamp, - lru_state, - linear_cache_indices, - lxu_cache_miss_timestamp, - cache_sets, - gather_cache_stats, - uvm_cache_stats_, - row_alignment); -} - -namespace { - -template -__global__ __launch_bounds__(kMaxThreads) void lfu_update_counts_kernel( - const pta::PackedTensorAccessor32 - unique_indices, - const int32_t* __restrict__ N_unique, - const pta::PackedTensorAccessor32 - unique_indices_count, - pta::PackedTensorAccessor64 lfu_state) { - CUDA_KERNEL_LOOP(n, *N_unique) { - const auto idx = unique_indices[n]; - lfu_state[idx] += unique_indices_count[n]; - } -} - -void lfu_update_counts_cuda( - Tensor unique_indices, - Tensor unique_indices_length, - Tensor unique_indices_count, - Tensor lfu_state) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - unique_indices, unique_indices_length, unique_indices_count, lfu_state); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(unique_indices.get_device()); - - const int32_t N = unique_indices.size(0); - AT_DISPATCH_INDEX_TYPES( - unique_indices.scalar_type(), "lfu_update_counts_cuda", [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lfu_update_counts_kernel"; -#endif - lfu_update_counts_kernel<<< - std::min( - div_round_up(N, kMaxThreads), - get_max_thread_blocks_for_cache_kernels_()), - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), - unique_indices_length.data_ptr(), - MAKE_PTA_WITH_NAME(func_name, unique_indices_count, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); -} - -constexpr int32_t kCacheSetBits = 24; -constexpr int32_t kLFUCounterBits = 40; -static_assert(kCacheSetBits + kLFUCounterBits == 8 * sizeof(int64_t), ""); - -template -__global__ __launch_bounds__(kMaxThreads) void lfu_cache_find_uncached_kernel( - const pta::PackedTensorAccessor32 - unique_indices, - const int32_t* __restrict__ N_unique, - int64_t max_indices, - const pta::PackedTensorAccessor32 - lxu_cache_state, - uint64_t* __restrict__ cache_sets, - const pta::PackedTensorAccessor64 - lfu_state) { - const int32_t C = lxu_cache_state.size(0); - - for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; - n += gridDim.x * blockDim.y) { - const int64_t idx = unique_indices[n]; - if (idx == max_indices) { - // cache_sets are initialized with sentinel values in - // lfu_cache_find_uncached_cuda - continue; - } - const uint32_t cache_set = cache_slot(idx, C); - - const auto slot = threadIdx.x; - const bool found = ::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx; - -#ifdef __HIP_PLATFORM_HCC__ - if (!__any_sync(0xFFFFFFFFFFFFFFFF, found)) { -#else - if (!__any_sync(0xFFFFFFFF, found)) { -#endif - if (threadIdx.x == 0) { - // sort so the highest LFUs come first in the segment. 
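// Host-side sketch (not part of the kernel) of the packed 64-bit sort key
// built below, assuming the kCacheSetBits = 24 / kLFUCounterBits = 40 split
// declared above: storing the complement of the LFU count in the low bits
// makes an ascending radix sort group keys by cache set and order each
// segment by descending access frequency.
#include <cstdint>
constexpr int kSketchLfuCounterBits = 40; // mirrors kLFUCounterBits above
constexpr uint64_t pack_lfu_key(uint64_t cache_set, uint64_t lfu_count) {
  return (cache_set << kSketchLfuCounterBits) |
      ((uint64_t(1) << kSketchLfuCounterBits) - 1 - lfu_count);
}
// Within one set, the more frequently used index sorts first ...
static_assert(pack_lfu_key(3, 100) < pack_lfu_key(3, 7), "");
// ... and the high 24 bits still recover the cache set.
static_assert((pack_lfu_key(3, 100) >> kSketchLfuCounterBits) == 3, "");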
- // assume lfu_state[idx] <= 2^40 - 1 and cache_set < 2^24 -1 - cache_sets[n] = - ((static_cast(cache_set) << kLFUCounterBits)) | - ((static_cast(1) << kLFUCounterBits) - 1 - - lfu_state[idx]); - } - } - } -} - -std::pair lfu_cache_find_uncached_cuda( - Tensor unique_indices, - Tensor unique_indices_length, - int64_t max_indices, - Tensor lxu_cache_state, - Tensor lfu_state) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - unique_indices, unique_indices_length, lxu_cache_state, lfu_state); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(unique_indices.get_device()); - - auto cache_sets = full_like( - unique_indices, - static_cast( - static_cast(lxu_cache_state.size(0)) << kLFUCounterBits), - unique_indices.options().dtype(at::kLong)); - const int32_t N = unique_indices.numel(); - auto sorted_cache_sets = empty_like(cache_sets); - auto cache_set_sorted_unique_indices = empty_like(unique_indices); - - AT_DISPATCH_INDEX_TYPES( - unique_indices.scalar_type(), "lfu_cache_find_uncached_cuda", [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lfu_cache_find_uncached_kernel"; -#endif - // Find uncached indices - lfu_cache_find_uncached_kernel<<< - std::min( - div_round_up(N, kMaxThreads / kWarpSize), - get_max_thread_blocks_for_cache_kernels_()), - dim3(kWarpSize, kMaxThreads / kWarpSize), - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), - unique_indices_length.data_ptr(), - max_indices, - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - (uint64_t*)cache_sets.data_ptr(), - MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - // Sort the cache sets and ids - size_t temp_storage_bytes = 0; - AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( - nullptr, - temp_storage_bytes, - (uint64_t*)cache_sets.data_ptr(), - (uint64_t*)sorted_cache_sets.data_ptr(), - unique_indices.data_ptr(), - cache_set_sorted_unique_indices.data_ptr(), - N, - 0, - int(log2(float(lxu_cache_state.size(0) + 1)) + 1) + kLFUCounterBits, - at::cuda::getCurrentCUDAStream(), - false)); - auto temp_storage = at::empty( - {static_cast(temp_storage_bytes)}, - unique_indices.options().dtype(at::kByte)); - AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( - temp_storage.data_ptr(), - temp_storage_bytes, - (uint64_t*)cache_sets.data_ptr(), - (uint64_t*)sorted_cache_sets.data_ptr(), - unique_indices.data_ptr(), - cache_set_sorted_unique_indices.data_ptr(), - N, - 0, - int(log2(float(lxu_cache_state.size(0) + 1)) + 1) + kLFUCounterBits, - at::cuda::getCurrentCUDAStream(), - false)); - }); - return {sorted_cache_sets, cache_set_sorted_unique_indices}; -} - -template -__global__ __launch_bounds__(kCacheMaxThreads) void lfu_cache_insert_kernel( - pta::PackedTensorAccessor64 weights, - const pta::PackedTensorAccessor32 - cache_hash_size_cumsum, - const pta::PackedTensorAccessor64 - cache_index_table_map, - const pta::PackedTensorAccessor32 - weights_offsets, - const pta::PackedTensorAccessor32 - D_offsets, - const uint64_t* __restrict__ sorted_cache_sets, - const pta::PackedTensorAccessor32 - cache_set_sorted_indices, - const int32_t* __restrict__ N_unique, - pta::PackedTensorAccessor32 - lxu_cache_state, - pta::PackedTensorAccessor64 - lxu_cache_weights, - const pta::PackedTensorAccessor64 - lfu_state, - bool stochastic_rounding, - at::PhiloxCudaState stochastic_rounding_philox_args) { - const int32_t C = lxu_cache_state.size(0); - for (int32_t n = 
blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; - n += gridDim.x * blockDim.y) { - // check if this warp is responsible for this whole segment. - const bool segment_start = - (n == 0 || - (sorted_cache_sets[n - 1] >> kLFUCounterBits) != - (sorted_cache_sets[n] >> kLFUCounterBits)); - - if (!segment_start) { - // don't have *warp* divergence since we launch full warps in blockDim.x, - // so we can just exit this warp entirely. - continue; - } - const uint32_t cache_set = (sorted_cache_sets[n] >> kLFUCounterBits); - if (cache_set == C) { - // ignore the already-existing elements - continue; - } - - int32_t SL = 1; - while (n + SL < *N_unique && - (sorted_cache_sets[n + SL] >> kLFUCounterBits) == cache_set) { - SL += 1; - } - - // now, we need to insert the (unique!) values in indices[n:n + SL] into - // our slots. - const int32_t slot = threadIdx.x; - const int64_t current_idx = lxu_cache_state[cache_set][slot]; - const int64_t current_lfu_cost = - (current_idx != static_cast(kCacheStateInvalid)) - ? lfu_state[current_idx] - : -1; - int64_t costs[1] = {current_lfu_cost}; - int32_t slots[1] = {slot}; - - BitonicSort>::sort(costs, slots); - const int32_t sorted_slot = slots[0]; - const int64_t sorted_lfu_cost = costs[0]; - - for (int32_t l = 0; l < min(SL, kWarpSize); ++l) { - const int32_t insert_slot = shfl_sync(sorted_slot, l); - const int64_t insert_current_lfu_cost = shfl_sync(sorted_lfu_cost, l); - const int64_t insert_idx = cache_set_sorted_indices[n + l]; - const int64_t insert_lfu_cost = lfu_state[insert_idx]; - - if (insert_current_lfu_cost > insert_lfu_cost) { - // don't insert. - // all subsequent `current_lfu_cost` values are greater, and all - // subsequent `insert_lfu_cost` values are smaller, so we can exit - // early here. - break; - } - const int32_t t_insert = cache_index_table_map[insert_idx]; - const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; - const int64_t weights_offset_insert = weights_offsets[t_insert]; - const int32_t D_start_insert = D_offsets[t_insert]; - const int32_t D_end_insert = D_offsets[t_insert + 1]; - const int32_t D_insert = D_end_insert - D_start_insert; - - // not empty - if (insert_current_lfu_cost != -1) { - // ensure that threadIdx.x is the only thread reading/writing to - // lxu_cache_state - int64_t current_idx = - threadIdx.x == 0 ? 
lxu_cache_state[cache_set][insert_slot] : 0; - current_idx = shfl_sync(current_idx, 0); - const int32_t t_current = cache_index_table_map[current_idx]; - const int64_t idx_current = - current_idx - cache_hash_size_cumsum[t_current]; - const int64_t weights_offset_current = weights_offsets[t_current]; - const int32_t D_start_current = D_offsets[t_current]; - const int32_t D_end_current = D_offsets[t_current + 1]; - const int32_t D_current = D_end_current - D_start_current; - - int32_t D_emb = D_current; - if constexpr (std::is_same_v) { - D_emb += kINT8QparamsBytes; - } - auto weight_row = WeightRow( - &weights[weights_offset_current + idx_current * D_emb + 0], - &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0], - D_current, - nullptr); - - weight_row.set_stochastic_rounding( - stochastic_rounding, - stochastic_rounding_philox_args, - (blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + - threadIdx.x) * - kWarpSize + - l); - - weight_row.warp_evict(D_current, blockDim.x, threadIdx.x); - } - - // insert into cache - int32_t D_emb = D_insert; - if constexpr (std::is_same_v) { - D_emb += kINT8QparamsBytes; - } - - auto weight_row_cache = WeightRow( - &weights[weights_offset_insert + idx_insert * D_emb + 0], - &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0], - D_insert, - nullptr); - - auto weight_row_emb = WeightRow( - &weights[weights_offset_insert + idx_insert * D_emb + 0], - nullptr, - D_insert, - nullptr); - - weight_row_emb.warp_copy_to( - weight_row_cache, D_insert, blockDim.x, threadIdx.x); - - if (threadIdx.x == 0) { - lxu_cache_state[cache_set][insert_slot] = insert_idx; - } - } - } -} - -void lfu_cache_insert_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor D_offsets, - Tensor sorted_cache_sets, - Tensor cache_set_sorted_unique_indices, - Tensor unique_indices_length, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - Tensor lfu_state, - bool stochastic_rounding) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - D_offsets, - sorted_cache_sets, - cache_set_sorted_unique_indices, - unique_indices_length, - lxu_cache_state, - lxu_cache_weights, - lfu_state); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - const int32_t N = cache_set_sorted_unique_indices.numel(); - - DISPATCH_EMB_CACHE_TYPES( - weights.scalar_type(), - lxu_cache_weights.scalar_type(), - "lfu_cache_insert_kernel_2", - ([&] { - at::PhiloxCudaState rng_engine_inputs; - if (stochastic_rounding && !std::is_same::value) { - auto gen = at::cuda::detail::getDefaultCUDAGenerator(); - std::lock_guard lock(gen.mutex()); - rng_engine_inputs = at::check_generator(gen) - ->philox_cuda_state(4); - } - -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lfu_cache_insert_kernel"; -#endif - - lfu_cache_insert_kernel - <<>>( - MAKE_PTA_WITH_NAME(func_name, weights, emb_t, 1, 64), - MAKE_PTA_WITH_NAME( - func_name, cache_hash_size_cumsum, int64_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, cache_index_table_map, int32_t, 1, 64), - MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), - (uint64_t*)sorted_cache_sets.data_ptr(), - MAKE_PTA_WITH_NAME( - func_name, cache_set_sorted_unique_indices, int64_t, 1, 32), - unique_indices_length.data_ptr(), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - 
MAKE_PTA_WITH_NAME( - func_name, lxu_cache_weights, cache_t, 2, 64), - MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64), - stochastic_rounding, - rng_engine_inputs); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - })); -} - -} // namespace - -DLL_PUBLIC void lfu_cache_populate_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - int64_t total_cache_hash_size, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor D_offsets, - Tensor linear_cache_indices, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - Tensor lfu_state, - bool stochastic_rounding) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - D_offsets, - linear_cache_indices, - lxu_cache_state, - lxu_cache_weights, - lfu_state); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - TORCH_CHECK( - linear_cache_indices.numel() < std::numeric_limits::max()); - if (linear_cache_indices.numel() == 0) { - // nothing to do - return; - } - - // get unqiue indices - Tensor unique_indices; - Tensor unique_indices_length; - c10::optional unique_indices_count; - std::tie(unique_indices, unique_indices_length, unique_indices_count) = - get_unique_indices_cuda( - linear_cache_indices, total_cache_hash_size, true); - - // update lfu counts - lfu_update_counts_cuda( - unique_indices, unique_indices_length, *unique_indices_count, lfu_state); - - // find uncached indices - auto cache_sets_and_unique_indices = lfu_cache_find_uncached_cuda( - unique_indices, - unique_indices_length, - total_cache_hash_size, - lxu_cache_state, - lfu_state); - const auto sorted_cache_sets = cache_sets_and_unique_indices.first; - const auto cache_set_sorted_unique_indices = - cache_sets_and_unique_indices.second; - - // insert caching weights - lfu_cache_insert_cuda( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - D_offsets, - sorted_cache_sets, - cache_set_sorted_unique_indices, - unique_indices_length, - lxu_cache_state, - lxu_cache_weights, - lfu_state, - stochastic_rounding); -} - -namespace { - -// In `lfu_cache_insert_kernel`, we use `emb_t` and `cache_t` for the -// high-precision cache implementation, where we can have {FP32, FP16, INT8} -// for embedding precision (data types), and {FP32, FP16} for cache precision -// (data types). -// -// In `lfu_cache_insert_byte_kernel`, we only use uint8_t for the both embedding -// and cache data type (conforming to the inference TBE kernel logics). -// - We pass in `weights_tys` to denote the real data types for the embeddings: -// {FP32, FP16, INT8, INT4, INT2}. For example, FP32 is 4 byte element in the -// byte tensor, and INT4 is half byte element in the byte tensor. -// - We only assume that the embedding and cache have the same precisions (the -// real "precision" is determined by `weights_tys` although the data types are -// uint8_t only). Basically no "high-precision cache" support for now. -// - The insert/evict of embedding row from the cache are done in a byte-by-byte -// manner. 
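// A minimal sketch of the element widths referenced in the comment above.
// The real SparseType enum and row-size helpers live in the fbgemm_gpu
// headers; the hypothetical enum below only restates the widths: FP32 and
// FP16 are 4- and 2-byte elements in the uint8_t weights tensor, while INT8,
// INT4 and INT2 pack 1, 2 and 4 elements per byte, respectively.
enum class SketchSparseType { FP32, FP16, INT8, INT4, INT2 };
constexpr int bits_per_element(SketchSparseType ty) {
  return ty == SketchSparseType::FP32 ? 32
      : ty == SketchSparseType::FP16  ? 16
      : ty == SketchSparseType::INT8  ? 8
      : ty == SketchSparseType::INT4  ? 4
                                      : 2;
}
// e.g. a D = 128 INT4 row occupies 128 * 4 / 8 = 64 payload bytes before any
// quantization scale/bias columns and row_alignment padding are added.
static_assert(bits_per_element(SketchSparseType::INT4) * 128 / 8 == 64, "");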
-template -__global__ -__launch_bounds__(kCacheMaxThreads) void lfu_cache_insert_byte_kernel( - pta::PackedTensorAccessor64 weights, - const pta::PackedTensorAccessor32 - cache_hash_size_cumsum, - const pta::PackedTensorAccessor64 - cache_index_table_map, - const pta::PackedTensorAccessor32 - weights_offsets, - const pta::PackedTensorAccessor32 - weights_tys, - const pta::PackedTensorAccessor32 - D_offsets, - const uint64_t* __restrict__ sorted_cache_sets, - const pta::PackedTensorAccessor32 - cache_set_sorted_indices, - const int32_t* __restrict__ N_unique, - pta::PackedTensorAccessor32 - lxu_cache_state, - pta::PackedTensorAccessor64 - lxu_cache_weights, - const pta::PackedTensorAccessor64 - lfu_state, - const int64_t row_alignment) { - const int32_t C = lxu_cache_state.size(0); - for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; - n += gridDim.x * blockDim.y) { - // check if this warp is responsible for this whole segment. - const bool segment_start = - (n == 0 || - (sorted_cache_sets[n - 1] >> kLFUCounterBits) != - (sorted_cache_sets[n] >> kLFUCounterBits)); - - if (!segment_start) { - // don't have *warp* divergence since we launch full warps in blockDim.x, - // so we can just exit this warp entirely. - continue; - } - const uint32_t cache_set = (sorted_cache_sets[n] >> kLFUCounterBits); - if (cache_set == C) { - // ignore the already-existing elements - continue; - } - - int32_t SL = 1; - while (n + SL < *N_unique && - (sorted_cache_sets[n + SL] >> kLFUCounterBits) == cache_set) { - SL += 1; - } - - // now, we need to insert the (unique!) values in indices[n:n + SL] into - // our slots. - const int32_t slot = threadIdx.x; - const int64_t current_idx = lxu_cache_state[cache_set][slot]; - const int64_t current_lfu_cost = - (current_idx != static_cast(kCacheStateInvalid)) - ? lfu_state[current_idx] - : -1; - int64_t costs[1] = {current_lfu_cost}; - int32_t slots[1] = {slot}; - - BitonicSort>::sort(costs, slots); - const int32_t sorted_slot = slots[0]; - const int64_t sorted_lfu_cost = costs[0]; - - for (int32_t l = 0; l < min(SL, kWarpSize); ++l) { - const int32_t insert_slot = shfl_sync(sorted_slot, l); - const int64_t insert_current_lfu_cost = shfl_sync(sorted_lfu_cost, l); - const index_t insert_idx = cache_set_sorted_indices[n + l]; - const int64_t insert_lfu_cost = lfu_state[insert_idx]; - - if (insert_current_lfu_cost > insert_lfu_cost) { - // don't insert. - // all subsequent `current_lfu_cost` values are greater, and all - // subsequent `insert_lfu_cost` values are smaller, so we can exit - // early here. - break; - } - const int32_t t_insert = cache_index_table_map[insert_idx]; - const SparseType weight_ty_insert = - static_cast(weights_tys[t_insert]); - const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; - const int64_t weights_offset_insert = weights_offsets[t_insert]; - const int32_t D_start_insert = D_offsets[t_insert]; - const int32_t D_end_insert = D_offsets[t_insert + 1]; - const int32_t D_insert = D_end_insert - D_start_insert; - - const int32_t D_insert_bytes = nbit::padded_row_size_in_bytes( - D_insert, weight_ty_insert, row_alignment); - - // insert into cache. Note that nbit::padded_row_size_in_bytes pad each - // row with row_alignment (16 bytes on GPUs) So each row will be multiple - // of 16 bytes (uint4 = 32bit x 4 = 16 bytes). 
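// Illustration of the cache addressing used in the copy below: the cache
// weight tensor stores kWarpSize ways per set back to back, so a (set, way)
// pair maps to row cache_set * kWarpSize + insert_slot. For example, with
// kWarpSize = 32 (NVIDIA; 64 under HIP), set 10 / way 5 lands in row
// 10 * 32 + 5 = 325.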
- auto row = reinterpret_cast( - &weights[weights_offset_insert + idx_insert * D_insert_bytes + 0]); - auto cache_row = reinterpret_cast( - &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0]); - for (int32_t d = threadIdx.x; d * sizeof(uint4) < D_insert_bytes; - d += blockDim.x) { - cache_row[d] = row[d]; - } - if (threadIdx.x == 0) { - lxu_cache_state[cache_set][insert_slot] = insert_idx; - } - } - } -} - -void lfu_cache_insert_byte_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor weights_tys, - Tensor D_offsets, - Tensor sorted_cache_sets, - Tensor cache_set_sorted_unique_indices, - Tensor unique_indices_length, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - Tensor lfu_state, - int64_t row_alignment) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - weights_tys, - D_offsets, - sorted_cache_sets, - cache_set_sorted_unique_indices, - unique_indices_length, - lxu_cache_state, - lxu_cache_weights, - lfu_state); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - const int32_t N = cache_set_sorted_unique_indices.numel(); - - AT_DISPATCH_INDEX_TYPES( - cache_set_sorted_unique_indices.scalar_type(), - "lfu_cache_insert_byte_cuda", - [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lfu_cache_insert_byte_kernel"; -#endif - lfu_cache_insert_byte_kernel<<< - std::min( - div_round_up(N, kCacheMaxThreads / kWarpSize), - get_max_thread_blocks_for_cache_kernels_()), - dim3(kWarpSize, kCacheMaxThreads / kWarpSize), - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, weights, uint8_t, 1, 64), - MAKE_PTA_WITH_NAME( - func_name, cache_hash_size_cumsum, int64_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, cache_index_table_map, int32_t, 1, 64), - MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, weights_tys, uint8_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), - (uint64_t*)sorted_cache_sets.data_ptr(), - MAKE_PTA_WITH_NAME( - func_name, cache_set_sorted_unique_indices, index_t, 1, 32), - unique_indices_length.data_ptr(), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_weights, uint8_t, 2, 64), - MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64), - row_alignment); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); -} - -} // namespace - -DLL_PUBLIC void lfu_cache_populate_byte_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - int64_t total_cache_hash_size, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor weights_tys, - Tensor D_offsets, - Tensor linear_cache_indices, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - Tensor lfu_state, - int64_t row_alignment) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - weights_tys, - D_offsets, - linear_cache_indices, - lxu_cache_state, - lxu_cache_weights, - lfu_state); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - TORCH_CHECK( - linear_cache_indices.numel() < std::numeric_limits::max()); - if (linear_cache_indices.numel() == 0) { - // nothing to do - return; - } - - // get unqiue indices - Tensor unique_indices; - Tensor unique_indices_length; - c10::optional unique_indices_count; - std::tie(unique_indices, unique_indices_length, 
unique_indices_count) = - get_unique_indices_cuda( - linear_cache_indices, total_cache_hash_size, true); - - // update lfu counts - lfu_update_counts_cuda( - unique_indices, unique_indices_length, *unique_indices_count, lfu_state); - - // find uncached indices - const auto cache_sets_and_unique_indices = lfu_cache_find_uncached_cuda( - unique_indices, - unique_indices_length, - total_cache_hash_size, - lxu_cache_state, - lfu_state); - const auto sorted_cache_sets = cache_sets_and_unique_indices.first; - const auto cache_set_sorted_unique_indices = - cache_sets_and_unique_indices.second; - - // insert caching weights - lfu_cache_insert_byte_cuda( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - weights_tys, - D_offsets, - sorted_cache_sets, - cache_set_sorted_unique_indices, - unique_indices_length, - lxu_cache_state, - lxu_cache_weights, - lfu_state, - row_alignment); -} - -namespace { - -template -__global__ __launch_bounds__(kMaxThreads) void lxu_cache_lookup_kernel( - const pta::PackedTensorAccessor32 - linear_cache_indices, - const pta::PackedTensorAccessor32 - lxu_cache_state, - int64_t invalid_index, - pta::PackedTensorAccessor32 - lxu_cache_locations, - const bool gather_cache_stats, - pta::PackedTensorAccessor32 - uvm_cache_stats) { - const int32_t C = lxu_cache_state.size(0); - const int32_t N = linear_cache_indices.size(0); - const int32_t n0 = - blockIdx.x * blockDim.y * blockDim.x + threadIdx.y * blockDim.x; - if (n0 >= N) { - return; - } - - int32_t cache_location = kCacheLocationMissing; - int32_t n_indices = 0; - int32_t n_hits = 0; - const auto slot = threadIdx.x; - for (int i = 0; i < blockDim.x; ++i) { - int32_t n = n0 + i; - if (n >= N) { - continue; - } - const int64_t idx = linear_cache_indices[n]; - if (idx == invalid_index) { - continue; - } - const int32_t cache_set = cache_slot(idx, C); - n_indices++; - const bool found = - (::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx); -#ifdef __HIP_PLATFORM_HCC__ - // FIXME: __ballot_sync with mask isn't supported by HIP yet. - // See https://fburl.com/fvy7j0lq for the similar context. - // assert false here with https://fburl.com/pfm7enw2 - assert(false); - const auto bitmap = __ballot(found); - if (bitmap) { - const auto way = __ffsll(bitmap) - 1; -#else - const auto bitmap = __ballot_sync(0xFFFFFFFF, found); - if (bitmap) { - // LSB == 1 hence we need to subtract one to get lane ID. 
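// Worked example of the lane-ID arithmetic below: if exactly one lane, say
// lane 5, observed found == true, then bitmap = 1 << 5 = 0x20,
// __ffs(bitmap) = 6 (CUDA __ffs is 1-based and returns 0 only for 0), so
// way = 6 - 1 = 5 and the hit maps to cache_set * kWarpSize + 5.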
- const auto way = __ffs(bitmap) - 1; -#endif - if (i == threadIdx.x) { - cache_location = cache_set * kWarpSize + way; - } - n_hits++; - } - } - - const int32_t n = n0 + threadIdx.x; - if (n < N) { - lxu_cache_locations[n] = cache_location; - } - if (gather_cache_stats && threadIdx.x == 0 && n_indices > n_hits) { - atomicAdd( - &uvm_cache_stats[uvm_cache_stats_index::num_conflict_misses], - (n_indices - n_hits)); - } -} - -template -__global__ -__launch_bounds__(kMaxThreads) void direct_mapped_lxu_cache_lookup_kernel( - const pta::PackedTensorAccessor32 - linear_cache_indices, - const pta::PackedTensorAccessor32 - lxu_cache_state, - int64_t invalid_index, - pta::PackedTensorAccessor32 - lxu_cache_locations, - const bool gather_cache_stats, - pta::PackedTensorAccessor32 - uvm_cache_stats) { - const int32_t C = lxu_cache_state.size(0); - const int32_t N = linear_cache_indices.size(0); - - int32_t n_indices = 0; - int32_t n_hits = 0; - - CUDA_KERNEL_LOOP(n, N) { - int32_t cache_location = kCacheLocationMissing; - const auto slot = 0; - - const int64_t idx = linear_cache_indices[n]; - if (idx == invalid_index) { - continue; - } - - const int32_t cache_set = cache_slot(idx, C); - n_indices++; - const bool found = - (::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx); - if (found) { - cache_location = cache_set; - n_hits++; - } - lxu_cache_locations[n] = cache_location; - } - - if (gather_cache_stats) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp; - - const int32_t conflict_miss = n_indices - n_hits; - const int32_t conflict_miss_sum = BlockReduce(temp).Sum(conflict_miss); - - if (threadIdx.x == 0) { - atomicAdd( - &uvm_cache_stats[uvm_cache_stats_index::num_conflict_misses], - conflict_miss_sum); - } - } -} - -} // namespace - -DLL_PUBLIC Tensor lxu_cache_lookup_cuda( - Tensor linear_cache_indices, - Tensor lxu_cache_state, - int64_t invalid_index, - bool gather_cache_stats, - c10::optional uvm_cache_stats) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - linear_cache_indices, lxu_cache_state); - Tensor uvm_cache_stats_ = - at::empty({0}, linear_cache_indices.options().dtype(at::kInt)); - if (gather_cache_stats) { - TORCH_CHECK(uvm_cache_stats.has_value()); - uvm_cache_stats_ = uvm_cache_stats.value(); - } - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(linear_cache_indices.get_device()); - - const auto N = linear_cache_indices.numel(); - auto lxu_cache_locations = empty_like( - linear_cache_indices, linear_cache_indices.options().dtype(at::kInt)); - if (linear_cache_indices.numel() == 0) { - // nothing to do - return lxu_cache_locations; - } - - const dim3 threads(kWarpSize, kMaxThreads / kWarpSize); - const dim3 blocks(div_round_up(N, kMaxThreads)); - - AT_DISPATCH_INDEX_TYPES( - linear_cache_indices.scalar_type(), "lxu_cache_lookup_cuda", [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lxu_cache_lookup_kernel"; -#endif - lxu_cache_lookup_kernel<<< - blocks, - threads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - invalid_index, - MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), - gather_cache_stats, - MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats_, int32_t, 1, 32)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - - return lxu_cache_locations; -} - -namespace { - -__global__ -__launch_bounds__(kMaxThreads) void lxu_cache_locations_update_kernel( - 
pta::PackedTensorAccessor32 - lxu_cache_locations, - const pta::PackedTensorAccessor32 - lxu_cache_locations_new) { - const int32_t N = lxu_cache_locations.size(0); - CUDA_KERNEL_LOOP(n, N) { - if (lxu_cache_locations[n] == kCacheLocationMissing && - lxu_cache_locations_new[n] >= 0) { - lxu_cache_locations[n] = lxu_cache_locations_new[n]; - } - } -} -} // namespace - -DLL_PUBLIC void lxu_cache_locations_update_cuda( - Tensor lxu_cache_locations, - Tensor lxu_cache_locations_new) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - lxu_cache_locations, lxu_cache_locations_new); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(lxu_cache_locations.get_device()); - - const auto N = lxu_cache_locations.numel(); - - if (N == 0) { - return; - } - - const dim3 blocks(std::min( - div_round_up(N, kMaxThreads), - get_max_thread_blocks_for_cache_kernels_())); - -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lxu_cache_locations_update_kernel"; -#endif - - lxu_cache_locations_update_kernel<<< - blocks, - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations_new, int32_t, 1, 32)); - - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return; -} - -DLL_PUBLIC Tensor direct_mapped_lxu_cache_lookup_cuda( - Tensor linear_cache_indices, - Tensor lxu_cache_state, - int64_t invalid_index, - bool gather_cache_stats, - c10::optional uvm_cache_stats) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - linear_cache_indices, lxu_cache_state); - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(uvm_cache_stats, lxu_cache_state); - - if (gather_cache_stats) { - TORCH_CHECK(uvm_cache_stats.has_value()); - } - auto uvm_cache_stats_ = uvm_cache_stats.value_or( - at::empty({0}, linear_cache_indices.options().dtype(at::kInt))); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(linear_cache_indices.get_device()); - - const auto N = linear_cache_indices.numel(); - auto lxu_cache_locations = empty_like( - linear_cache_indices, linear_cache_indices.options().dtype(at::kInt)); - if (linear_cache_indices.numel() == 0) { - // nothing to do - return lxu_cache_locations; - } - - const dim3 blocks(div_round_up(N, kMaxThreads)); - - AT_DISPATCH_INDEX_TYPES( - linear_cache_indices.scalar_type(), - "direct_mapped_lxu_cache_lookup_cuda", - [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "direct_mapped_lxu_cache_lookup_kernel"; -#endif - direct_mapped_lxu_cache_lookup_kernel<<< - blocks, - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - invalid_index, - MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32), - gather_cache_stats, - MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats_, int32_t, 1, 32)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - - return lxu_cache_locations; -} - -int get_sm_count_() { - cudaDeviceProp* deviceProp = - at::cuda::getDeviceProperties(c10::cuda::current_device()); - return deviceProp->multiProcessorCount; -} - -__global__ __launch_bounds__(kMaxThreads) void get_cache_indices_kernel( - int32_t blocks_per_table, - const pta::PackedTensorAccessor32 - cache_hash_size_cumsum, - const pta::PackedTensorAccessor32 - pruned_indices, - const pta::PackedTensorAccessor32 - pruned_indices_offsets, - const pta::PackedTensorAccessor32 - logical_table_ids, - const pta::PackedTensorAccessor32 - buffer_ids, - 
pta::PackedTensorAccessor32 - linear_cache_indices) { - const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; - - const int32_t t_i = blockIdx.x / blocks_per_table; - const int32_t threads_per_table = blocks_per_table * blockDim.x; - const int32_t idx_table = index % threads_per_table; - const int32_t logical_id = logical_table_ids[t_i]; - const int32_t buffer_id = buffer_ids[t_i]; - - const int64_t num_indices = - pruned_indices_offsets[buffer_id + 1] - pruned_indices_offsets[buffer_id]; - - if (num_indices <= 0) { - return; - } - - const int64_t indices_per_thread = - div_round_up(num_indices, threads_per_table); - const int64_t start = idx_table * indices_per_thread; - const int64_t end = min(start + indices_per_thread, num_indices); - - if (start >= num_indices) { - return; - } - - const int64_t pruned_indices_offset = pruned_indices_offsets[buffer_id]; - const int64_t* pruned_indices_table = &pruned_indices[pruned_indices_offset]; - int64_t* linear_cache_indices_table = - &linear_cache_indices[pruned_indices_offset]; - - const auto max_offset = - __ldg(&cache_hash_size_cumsum[cache_hash_size_cumsum.size(0) - 1]); - const auto curr_offset = __ldg(&cache_hash_size_cumsum[logical_id]); - - for (int64_t i = start; i < end; i++) { - if (curr_offset >= 0) { - linear_cache_indices_table[i] = curr_offset + pruned_indices_table[i]; - } else { - linear_cache_indices_table[i] = max_offset; - } - } -} - -template -__global__ __launch_bounds__(kMaxThreads) void reset_weight_momentum_kernel( - int32_t blocks_per_table, - pta::PackedTensorAccessor64 dev_weights, - pta::PackedTensorAccessor64 uvm_weights, - pta::PackedTensorAccessor64 - lxu_cache_weights, - const pta::PackedTensorAccessor32 - weights_placements, - const pta::PackedTensorAccessor32 - weights_offsets, - pta::PackedTensorAccessor64< - at::acc_type, - 1, - at::RestrictPtrTraits> momentum1_dev, - pta::PackedTensorAccessor64< - at::acc_type, - 1, - at::RestrictPtrTraits> momentum1_uvm, - const pta::PackedTensorAccessor32 - momentum1_placements, - const pta::PackedTensorAccessor32 - momentum1_offsets, - const pta::PackedTensorAccessor32 - D_offsets, - const pta::PackedTensorAccessor32 - pruned_indices, - const pta::PackedTensorAccessor32 - pruned_indices_offsets, - const pta::PackedTensorAccessor32 - logical_table_ids, - const pta::PackedTensorAccessor32 - buffer_ids, - const pta::PackedTensorAccessor32 - lxu_cache_locations) { - const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; - - const int32_t t_i = blockIdx.x / blocks_per_table; - const int32_t buffer_id = buffer_ids[t_i]; - const int64_t num_indices = - pruned_indices_offsets[buffer_id + 1] - pruned_indices_offsets[buffer_id]; - - if (num_indices <= 0) { - return; - } - - const int32_t logical_id = logical_table_ids[t_i]; - int32_t D = D_offsets[logical_id + 1] - D_offsets[logical_id]; - const int32_t chunk4s_per_row = D / 4; - const int64_t total_chunk4s_per_table = num_indices * chunk4s_per_row; - - const int32_t threads_per_table = blocks_per_table * blockDim.x; - const int64_t chunk4s_per_thread = - div_round_up(total_chunk4s_per_table, threads_per_table); - const int32_t idx_table = index % threads_per_table; - const int64_t start = idx_table * chunk4s_per_thread; - const int64_t end = min(start + chunk4s_per_thread, total_chunk4s_per_table); - - if (start >= total_chunk4s_per_table) { - return; - } - - int32_t D_emb = D; - if constexpr (std::is_same_v) { - D_emb += kINT8QparamsBytes; - } - - at::acc_type* __restrict__ momentum1; - const auto momentum1_placement 
= - static_cast(momentum1_placements[logical_id]); - int64_t momentum1_offset = momentum1_offsets[logical_id]; - if (momentum1_placement == PlacementType::DEVICE) { - momentum1 = &momentum1_dev[momentum1_offset]; - } else { - momentum1 = &momentum1_uvm[momentum1_offset]; - } - - emb_t* __restrict__ weights{nullptr}; - cache_t* __restrict__ cache_weights{nullptr}; - const auto weights_placement = - static_cast(weights_placements[logical_id]); - int64_t weights_offset = weights_offsets[logical_id]; - - const int64_t pruned_indices_offset = pruned_indices_offsets[buffer_id]; - const int64_t* pruned_indices_table = &pruned_indices[pruned_indices_offset]; - - for (int64_t i = start; i < end; i++) { - int64_t idx = i / chunk4s_per_row; - int64_t pruned_index = pruned_indices_table[idx]; - - if (weights_placement == PlacementType::DEVICE) { - weights = &dev_weights[weights_offset + pruned_index * D_emb]; - } else { - weights = &uvm_weights[weights_offset + pruned_index * D_emb]; - } - if (weights_placement == PlacementType::MANAGED_CACHING) { - int32_t cache_idx = lxu_cache_locations[pruned_indices_offset + idx]; - if (cache_idx != kCacheLocationMissing) { - cache_weights = &lxu_cache_weights[cache_idx][0]; - } - } - - auto weight_row_template = - WeightRow>( - weights, cache_weights, D, nullptr); - - // reset momentum1 - const int32_t d = (i % chunk4s_per_row) * 4; - if (d == 0) { - momentum1[pruned_index] = 0; - } - - // reset weight - float2 qparams_new = {1.0, 0.0}; // scaler=1.0, and offset=0.0, for int8. - Vec4T> weight_new; // 0 weight - weight_row_template.store( - weight_new, - d, - qparams_new); // qparams_new not used if type is not int8 - } -} - -DLL_PUBLIC void reset_weight_momentum_cuda( - Tensor dev_weights, - Tensor uvm_weights, - Tensor lxu_cache_weights, - Tensor weights_placements, - Tensor weights_offsets, - Tensor momentum1_dev, - Tensor momentum1_uvm, - Tensor momentum1_placements, - Tensor momentum1_offsets, - Tensor D_offsets, - Tensor pruned_indices, - Tensor pruned_indices_offsets, - Tensor logical_table_ids, - Tensor buffer_ids, - Tensor cache_hash_size_cumsum, - Tensor lxu_cache_state, - int64_t total_cache_hash_size) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - dev_weights, - uvm_weights, - lxu_cache_weights, - weights_placements, - weights_offsets, - momentum1_dev, - momentum1_uvm, - momentum1_placements, - momentum1_offsets, - D_offsets, - pruned_indices, - pruned_indices_offsets, - logical_table_ids, - buffer_ids, - cache_hash_size_cumsum, - lxu_cache_state); - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(dev_weights.get_device()); - - const int64_t num_pruned_indices = pruned_indices.size(0); - const int32_t num_pruned_tables = buffer_ids.size(0); - const int32_t blocks_per_table = get_sm_count_(); - - auto lxu_cache_locations = - at::zeros({num_pruned_indices}, pruned_indices.options().dtype(at::kInt)); - lxu_cache_locations.fill_(kCacheLocationMissing); - - if (total_cache_hash_size > 0) { - // Get corresponding cache indices of pruned indices - auto linear_cache_indices = at::zeros( - {num_pruned_indices}, pruned_indices.options().dtype(at::kLong)); - -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "get_cache_indices_kernel"; -#endif - - get_cache_indices_kernel<<< - num_pruned_tables * blocks_per_table, - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - blocks_per_table, - MAKE_PTA_WITH_NAME(func_name, cache_hash_size_cumsum, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, pruned_indices, int64_t, 1, 32), - 
MAKE_PTA_WITH_NAME(func_name, pruned_indices_offsets, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, logical_table_ids, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, buffer_ids, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, int64_t, 1, 32)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Look up cache locations - Tensor uvm_cache_stats = - at::empty({0}, lxu_cache_weights.options().dtype(at::kInt)); - lxu_cache_locations = lxu_cache_lookup_cuda( - linear_cache_indices, - lxu_cache_state, - total_cache_hash_size, - false, // gather_cache_stats - uvm_cache_stats); - } - - // Reset weight and momentum of pruned rows - DISPATCH_EMB_CACHE_TYPES( - dev_weights.scalar_type(), - lxu_cache_weights.scalar_type(), - "reset_weight_momentum_kernel", - ([&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name2 = "get_cache_indices_kernel"; -#endif - reset_weight_momentum_kernel - <<>>( - blocks_per_table, - MAKE_PTA_WITH_NAME(func_name2, dev_weights, emb_t, 1, 64), - MAKE_PTA_WITH_NAME(func_name2, uvm_weights, emb_t, 1, 64), - MAKE_PTA_WITH_NAME( - func_name2, lxu_cache_weights, cache_t, 2, 64), - MAKE_PTA_WITH_NAME( - func_name2, weights_placements, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name2, weights_offsets, int64_t, 1, 32), - MAKE_PTA_ACC_WITH_NAME( - func_name2, momentum1_dev, cache_t, 1, 64), - MAKE_PTA_ACC_WITH_NAME( - func_name2, momentum1_uvm, cache_t, 1, 64), - MAKE_PTA_WITH_NAME( - func_name2, momentum1_placements, int32_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name2, momentum1_offsets, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name2, D_offsets, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name2, pruned_indices, int64_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name2, pruned_indices_offsets, int64_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name2, logical_table_ids, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name2, buffer_ids, int32_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name2, lxu_cache_locations, int32_t, 1, 32)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - })); -} From 570090d086b4c7fb77cdf63af8e523a42ed249af Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 11 Oct 2023 22:10:18 -0700 Subject: [PATCH 74/94] Add test retries (#2071) Summary: - Add retries to Python tests with delays in between retries Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2071 Reviewed By: spcyppt Differential Revision: D50207135 Pulled By: q10 fbshipit-source-id: 254a541f3007890ea6e11890552d52b6e6e44932 --- .github/scripts/fbgemm_gpu_build.bash | 2 +- .github/scripts/fbgemm_gpu_docs.bash | 8 +++--- .github/scripts/fbgemm_gpu_install.bash | 2 +- .github/scripts/fbgemm_gpu_lint.bash | 2 +- .github/scripts/fbgemm_gpu_test.bash | 4 +-- .github/scripts/utils_base.bash | 25 +++++++++++-------- .github/scripts/utils_build.bash | 4 +-- .github/scripts/utils_conda.bash | 10 ++++---- .github/scripts/utils_cuda.bash | 4 +-- .github/scripts/utils_pip.bash | 6 ++--- .github/scripts/utils_pytorch.bash | 2 +- .github/scripts/utils_rocm.bash | 2 +- .github/scripts/utils_system.bash | 4 +-- .github/workflows/fbgemm_gpu_ci.yml | 2 +- .github/workflows/fbgemm_gpu_cpu_nightly.yml | 2 +- .github/workflows/fbgemm_gpu_cpu_release.yml | 2 +- .github/workflows/fbgemm_gpu_cuda_nightly.yml | 2 +- .github/workflows/fbgemm_gpu_cuda_release.yml | 2 +- .github/workflows/fbgemm_gpu_pip.yml | 2 +- 19 files changed, 46 insertions(+), 41 deletions(-) diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 4355ac2936..381efc2fbc 100644 --- a/.github/scripts/fbgemm_gpu_build.bash 
+++ b/.github/scripts/fbgemm_gpu_build.bash @@ -45,7 +45,7 @@ prepare_fbgemm_gpu_build () { echo "[BUILD] Installing other build dependencies ..." # shellcheck disable=SC2086 - (exec_with_retries conda run --no-capture-output ${env_prefix} python -m pip install -r requirements.txt) || return 1 + (exec_with_retries 3 conda run --no-capture-output ${env_prefix} python -m pip install -r requirements.txt) || return 1 # shellcheck disable=SC2086 (test_python_import_package "${env_name}" numpy) || return 1 diff --git a/.github/scripts/fbgemm_gpu_docs.bash b/.github/scripts/fbgemm_gpu_docs.bash index 0e923afb1b..caaf923355 100644 --- a/.github/scripts/fbgemm_gpu_docs.bash +++ b/.github/scripts/fbgemm_gpu_docs.bash @@ -36,7 +36,7 @@ install_docs_tools () { echo "[INSTALL] Installing docs tools ..." # shellcheck disable=SC2086 - (exec_with_retries conda install ${env_prefix} -c conda-forge -y \ + (exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \ doxygen) || return 1 # Check binaries are visible in the PATH @@ -71,15 +71,15 @@ build_fbgemm_gpu_docs () { echo "[BUILD] Installing docs-build dependencies ..." # shellcheck disable=SC2086 - (exec_with_retries conda run ${env_prefix} python -m pip install -r requirements.txt) || return 1 + (exec_with_retries 3 conda run ${env_prefix} python -m pip install -r requirements.txt) || return 1 echo "[BUILD] Running Doxygen build ..." # shellcheck disable=SC2086 - (exec_with_retries conda run ${env_prefix} doxygen Doxyfile.in) || return 1 + (exec_with_retries 3 conda run ${env_prefix} doxygen Doxyfile.in) || return 1 echo "[BUILD] Building HTML pages ..." # shellcheck disable=SC2086 - (exec_with_retries conda run ${env_prefix} make html) || return 1 + (exec_with_retries 3 conda run ${env_prefix} make html) || return 1 echo "[INSTALL] FBGEMM-GPU documentation build completed" } diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash index 02e60e8e99..b0d6610d61 100644 --- a/.github/scripts/fbgemm_gpu_install.bash +++ b/.github/scripts/fbgemm_gpu_install.bash @@ -58,7 +58,7 @@ install_fbgemm_gpu_wheel () { echo "[INSTALL] Installing FBGEMM-GPU wheel: ${wheel_path} ..." # shellcheck disable=SC2086 - (exec_with_retries conda run ${env_prefix} python -m pip install "${wheel_path}") || return 1 + (exec_with_retries 3 conda run ${env_prefix} python -m pip install "${wheel_path}") || return 1 __fbgemm_gpu_post_install_checks "${env_name}" || return 1 diff --git a/.github/scripts/fbgemm_gpu_lint.bash b/.github/scripts/fbgemm_gpu_lint.bash index fc2ab7d25c..a83a22bc8c 100644 --- a/.github/scripts/fbgemm_gpu_lint.bash +++ b/.github/scripts/fbgemm_gpu_lint.bash @@ -36,7 +36,7 @@ install_lint_tools () { echo "[INSTALL] Installing lint tools ..." 
# shellcheck disable=SC2086 - (exec_with_retries conda install ${env_prefix} -c conda-forge -y \ + (exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \ click \ flake8 \ ufmt) || return 1 diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 83b9ceefb6..8d839eb29f 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -32,7 +32,7 @@ run_python_test () { local env_prefix=$(env_name_or_prefix "${env_name}") # shellcheck disable=SC2086 - if print_exec conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then + if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then echo "[TEST] Python test suite PASSED: ${python_test_file}" echo "" else @@ -99,7 +99,7 @@ run_fbgemm_gpu_tests () { echo "[TEST] Installing pytest ..." # shellcheck disable=SC2086 - print_exec conda install ${env_prefix} -y pytest expecttest + (exec_with_retries 3 conda install ${env_prefix} -y pytest expecttest) || return 1 echo "[TEST] Checking imports ..." (test_python_import_package "${env_name}" fbgemm_gpu) || return 1 diff --git a/.github/scripts/utils_base.bash b/.github/scripts/utils_base.bash index 05ff368900..7ea56f816c 100644 --- a/.github/scripts/utils_base.bash +++ b/.github/scripts/utils_base.bash @@ -40,26 +40,31 @@ print_exec () { } exec_with_retries () { - local max=5 - local delay=2 + local max_retries="$1" + local delay_secs=2 local retcode=0 - for i in $(seq 1 ${max}); do - echo "[EXEC] [ATTEMPT ${i}/${max}] + $*" + # shellcheck disable=SC2086 + for i in $(seq 0 ${max_retries}); do + # shellcheck disable=SC2145 + echo "[EXEC] [ATTEMPT ${i}/${max_retries}] + ${@:2}" - if "$@"; then - retcode=0 + if "${@:2}"; then + local retcode=0 break else - retcode=$? - echo "[EXEC] [ATTEMPT ${i}/${max}] Command attempt failed." + local retcode=$? + echo "[EXEC] [ATTEMPT ${i}/${max_retries}] Command attempt failed." echo "" - sleep $delay + + if [ "$i" -ne "$max_retries" ]; then + sleep $delay_secs + fi fi done if [ $retcode -ne 0 ]; then - echo "[EXEC] The command has failed after ${max} attempts; aborting." + echo "[EXEC] The command has failed after ${max_retries} + 1 attempts; aborting." fi return $retcode diff --git a/.github/scripts/utils_build.bash b/.github/scripts/utils_build.bash index 0d4b886f76..e03d67e2c5 100644 --- a/.github/scripts/utils_build.bash +++ b/.github/scripts/utils_build.bash @@ -91,7 +91,7 @@ install_cxx_compiler () { echo "[INSTALL] Installing C/C++ compilers through Conda (architecture = ${archname}) ..." # shellcheck disable=SC2086 - (exec_with_retries conda install ${env_prefix} -y "gxx_linux-${archname}"=10.4.0 "sysroot_linux-${archname}"=2.17 -c conda-forge) || return 1 + (exec_with_retries 3 conda install ${env_prefix} -y "gxx_linux-${archname}"=10.4.0 "sysroot_linux-${archname}"=2.17 -c conda-forge) || return 1 # The compilers are visible in the PATH as `x86_64-conda-linux-gnu-cc` and # `x86_64-conda-linux-gnu-c++`, so symlinks will need to be created @@ -161,7 +161,7 @@ install_build_tools () { echo "[INSTALL] Installing build tools ..." 
# shellcheck disable=SC2086 - (exec_with_retries conda install ${env_prefix} -y \ + (exec_with_retries 3 conda install ${env_prefix} -y \ click \ cmake \ hypothesis \ diff --git a/.github/scripts/utils_conda.bash b/.github/scripts/utils_conda.bash index 63bf64d0cc..18c6290e10 100644 --- a/.github/scripts/utils_conda.bash +++ b/.github/scripts/utils_conda.bash @@ -42,7 +42,7 @@ setup_miniconda () { print_exec mkdir -p "$miniconda_prefix" echo "[SETUP] Downloading the Miniconda installer ..." - (exec_with_retries wget -q "https://repo.anaconda.com/miniconda/Miniconda3-latest-${PLATFORM_NAME}.sh" -O miniconda.sh) || return 1 + (exec_with_retries 3 wget -q "https://repo.anaconda.com/miniconda/Miniconda3-latest-${PLATFORM_NAME}.sh" -O miniconda.sh) || return 1 echo "[SETUP] Installing Miniconda ..." print_exec bash miniconda.sh -b -p "$miniconda_prefix" -u @@ -54,7 +54,7 @@ setup_miniconda () { print_exec . ~/.bashrc echo "[SETUP] Updating Miniconda base packages ..." - (exec_with_retries conda update -n base -c defaults --update-deps -y conda) || return 1 + (exec_with_retries 3 conda update -n base -c defaults --update-deps -y conda) || return 1 # Clean up packages conda_cleanup @@ -112,17 +112,17 @@ create_conda_environment () { # The `-y` flag removes any existing Conda environment with the same name echo "[SETUP] Creating new Conda environment (Python ${python_version}) ..." # shellcheck disable=SC2086 - (exec_with_retries conda create -y ${env_prefix} python="${python_version}") || return 1 + (exec_with_retries 3 conda create -y ${env_prefix} python="${python_version}") || return 1 echo "[SETUP] Upgrading PIP to latest ..." # shellcheck disable=SC2086 - (exec_with_retries conda run ${env_prefix} pip install --upgrade pip) || return 1 + (exec_with_retries 3 conda run ${env_prefix} pip install --upgrade pip) || return 1 # The pyOpenSSL and cryptography packages versions need to line up for PyPI publishing to work # https://stackoverflow.com/questions/74981558/error-updating-python3-pip-attributeerror-module-lib-has-no-attribute-openss echo "[SETUP] Upgrading pyOpenSSL ..." # shellcheck disable=SC2086 - (exec_with_retries conda run ${env_prefix} python -m pip install "pyOpenSSL>22.1.0") || return 1 + (exec_with_retries 3 conda run ${env_prefix} python -m pip install "pyOpenSSL>22.1.0") || return 1 # This test fails with load errors if the pyOpenSSL and cryptography package versions don't align echo "[SETUP] Testing pyOpenSSL import ..." diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash index d068896e54..09b0b543d5 100644 --- a/.github/scripts/utils_cuda.bash +++ b/.github/scripts/utils_cuda.bash @@ -49,7 +49,7 @@ install_cuda () { # Install CUDA packages echo "[INSTALL] Installing CUDA ${cuda_version} ..." # shellcheck disable=SC2086 - (exec_with_retries conda install --force-reinstall ${env_prefix} -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 + (exec_with_retries 3 conda install --force-reinstall ${env_prefix} -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 # Ensure that nvcc is properly installed (test_binpath "${env_name}" nvcc) || return 1 @@ -135,7 +135,7 @@ install_cudnn () { # Download cuDNN echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." - (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 + (exec_with_retries 3 wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 # Unpack the tarball echo "[INSTALL] Unpacking cuDNN ..." 
diff --git a/.github/scripts/utils_pip.bash b/.github/scripts/utils_pip.bash index cd08af2c12..674b5ca029 100644 --- a/.github/scripts/utils_pip.bash +++ b/.github/scripts/utils_pip.bash @@ -108,7 +108,7 @@ install_from_pytorch_pip () { echo "[INSTALL] Attempting to install [${package_name}, ${package_version}+${package_variant}] from PyTorch PIP using channel ${pip_channel} ..." # shellcheck disable=SC2086 - (exec_with_retries conda run ${env_prefix} pip install ${pip_package} --extra-index-url ${pip_channel}) || return 1 + (exec_with_retries 3 conda run ${env_prefix} pip install ${pip_package} --extra-index-url ${pip_channel}) || return 1 # Check only applies to non-CPU variants if [ "$package_variant_type" != "cpu" ]; then @@ -165,7 +165,7 @@ download_from_pytorch_pip () { echo "[DOWNLOAD] Attempting to download wheel [${package_name}, ${package_version}+${package_variant}] from PyTorch PIP using channel ${pip_channel} ..." # shellcheck disable=SC2086 - (exec_with_retries conda run ${env_prefix} pip download ${pip_package} --extra-index-url ${pip_channel}) || return 1 + (exec_with_retries 3 conda run ${env_prefix} pip download ${pip_package} --extra-index-url ${pip_channel}) || return 1 # Ensure that the package build is of the correct variant # This test usually applies to the nightly builds @@ -209,7 +209,7 @@ publish_to_pypi () { echo "[INSTALL] Installing twine ..." # shellcheck disable=SC2086 - print_exec conda install ${env_prefix} -y twine + (exec_with_retries 3 conda install ${env_prefix} -y twine) || return 1 (test_python_import_package "${env_name}" twine) || return 1 (test_python_import_package "${env_name}" OpenSSL) || return 1 diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash index f14e97b2c2..77e88a8130 100644 --- a/.github/scripts/utils_pytorch.bash +++ b/.github/scripts/utils_pytorch.bash @@ -69,7 +69,7 @@ install_pytorch_conda () { # Use --force-reinstall to address this on retries - https://datascience.stackexchange.com/questions/41732/conda-verification-failed echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, variant = ${pytorch_variant_type}) through Conda using channel '${pytorch_channel}' ..." # shellcheck disable=SC2086 - (exec_with_retries conda install --force-reinstall ${env_prefix} -y ${pytorch_package} -c "${pytorch_channel}") || return 1 + (exec_with_retries 3 conda install --force-reinstall ${env_prefix} -y ${pytorch_package} -c "${pytorch_channel}") || return 1 # Check that PyTorch is importable (test_python_import_package "${env_name}" torch.distributed) || return 1 diff --git a/.github/scripts/utils_rocm.bash b/.github/scripts/utils_rocm.bash index 8efb8128fb..beac98d303 100644 --- a/.github/scripts/utils_rocm.bash +++ b/.github/scripts/utils_rocm.bash @@ -63,7 +63,7 @@ install_rocm_ubuntu () { # Skip installation of kernel driver when run in Docker mode with --no-dkms echo "[INSTALL] Installing ROCm ..." - (exec_with_retries amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms) || return 1 + (exec_with_retries 3 amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms) || return 1 echo "[INSTALL] Installing HIP-relevant packages ..." install_system_packages hipify-clang miopen-hip miopen-hip-dev diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash index de37ec80ef..d6be9707ff 100644 --- a/.github/scripts/utils_system.bash +++ b/.github/scripts/utils_system.bash @@ -44,12 +44,12 @@ install_system_packages () { echo "[INSTALL] Updating system repositories ..." 
# shellcheck disable=SC2068 - exec_with_retries ${update_cmd[@]} + (exec_with_retries 3 ${update_cmd[@]}) || return 1 # shellcheck disable=SC2145 echo "[INSTALL] Installing system package(s): $@ ..." # shellcheck disable=SC2068 - exec_with_retries ${install_cmd[@]} + (exec_with_retries 3 ${install_cmd[@]}) || return 1 } free_disk_space () { diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index 8760785a8c..5a643ad090 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -92,7 +92,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm gfx90a - name: Test FBGEMM_GPU-ROCm Nightly Installation - timeout-minutes: 10 + timeout-minutes: 15 run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm diff --git a/.github/workflows/fbgemm_gpu_cpu_nightly.yml b/.github/workflows/fbgemm_gpu_cpu_nightly.yml index e273c4e64c..5501ee89e3 100644 --- a/.github/workflows/fbgemm_gpu_cpu_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cpu_nightly.yml @@ -174,7 +174,7 @@ jobs: install_fbgemm_gpu_wheel $BUILD_ENV *.whl - name: Test with PyTest - timeout-minutes: 10 + timeout-minutes: 15 run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu - name: Push FBGEMM_GPU Nightly (CPU version) Binary to PYPI diff --git a/.github/workflows/fbgemm_gpu_cpu_release.yml b/.github/workflows/fbgemm_gpu_cpu_release.yml index 67480ef3f2..1300b4781f 100644 --- a/.github/workflows/fbgemm_gpu_cpu_release.yml +++ b/.github/workflows/fbgemm_gpu_cpu_release.yml @@ -167,7 +167,7 @@ jobs: install_fbgemm_gpu_wheel $BUILD_ENV *.whl - name: Test with PyTest - timeout-minutes: 10 + timeout-minutes: 15 run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu - name: Push FBGEMM_GPU (CPU version) Binary to PYPI diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml index 9635a2c20c..be1fc32fc4 100644 --- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -180,7 +180,7 @@ jobs: run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl - name: Test with PyTest - timeout-minutes: 10 + timeout-minutes: 15 run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV - name: Push FBGEMM_GPU Nightly Binary to PYPI diff --git a/.github/workflows/fbgemm_gpu_cuda_release.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml index 92cc0e1e38..838be62996 100644 --- a/.github/workflows/fbgemm_gpu_cuda_release.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -177,7 +177,7 @@ jobs: run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl - name: Test with PyTest - timeout-minutes: 10 + timeout-minutes: 15 run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV - name: Push FBGEMM_GPU Binary to PYPI diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml index 1b5518e506..6dec1d87bc 100644 --- a/.github/workflows/fbgemm_gpu_pip.yml +++ b/.github/workflows/fbgemm_gpu_pip.yml @@ -147,7 +147,7 @@ jobs: run: . $PRELUDE; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version || 'nightly' }} cuda ${{ matrix.cuda-version }} - name: Test with PyTest - timeout-minutes: 10 + timeout-minutes: 15 run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV From 9b1a1f87f0dfddc458b921707db062ca4e5b8b90 Mon Sep 17 00:00:00 2001 From: "Amit Agarwal (Ads ML Serving)" Date: Thu, 12 Oct 2023 07:34:13 -0700 Subject: [PATCH 75/94] Add meta function for fbgemm::merge_pooled_embeddings operator (#2069) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2069 Add meta function for fbgemm::merge_pooled_embeddings operator to enable PT2 export of models containign this operator Reviewed By: khabinov Differential Revision: D49709130 fbshipit-source-id: 7c9bee0a82332f6ba78938d50c8bfd138051649f --- .../src/merge_pooled_embeddings_gpu.cpp | 51 +++++++++++++++---- .../test/merge_pooled_embeddings_test.py | 29 +++++++++++ 2 files changed, 71 insertions(+), 9 deletions(-) diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index 0a3ad3b63a..0977a0d98c 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include "fbgemm_gpu/merge_pooled_embeddings.h" #include "fbgemm_gpu/sparse_ops_utils.h" @@ -515,16 +516,16 @@ Tensor sum_reduce_to_one( return output_tensor; } -Tensor cat_dim_2d( +std::tuple, std::vector, int64_t> +cat_dim_2d_output_shape( std::vector& tensors, int64_t uncat_dim_size, - at::Device output_device, - int64_t cat_dim = 1) { + int64_t cat_dim) { + TORCH_CHECK(!tensors.empty()); + // only support 2d tensor concatenation. TORCH_CHECK(cat_dim >= 0 && cat_dim <= 1); - if (tensors.size() == 0) { - return at::empty({0}, at::TensorOptions().device(output_device)); - } + int64_t total_cat_dim = 0; std::vector cumulative_dims; cumulative_dims.push_back(0); @@ -536,14 +537,30 @@ Tensor cat_dim_2d( cumulative_dims.push_back(total_cat_dim); } - auto* prop = at::cuda::getCurrentDeviceProperties(); // default shape for concatenating on dim 1 - std::vector output_shape; + std::array output_shape; if (cat_dim == 0) { output_shape = {total_cat_dim, uncat_dim_size}; } else { output_shape = {uncat_dim_size, total_cat_dim}; } + + return std::make_tuple(output_shape, cumulative_dims, total_cat_dim); +} + +Tensor cat_dim_2d( + std::vector& tensors, + int64_t uncat_dim_size, + at::Device output_device, + int64_t cat_dim = 1) { + if (tensors.size() == 0) { + return at::empty({0}, at::TensorOptions().device(output_device)); + } + // only support 2d tensor concatenation. 
+ auto [output_shape, cumulative_dims, total_cat_dim] = + cat_dim_2d_output_shape(tensors, uncat_dim_size, cat_dim); + + auto* prop = at::cuda::getCurrentDeviceProperties(); auto output = at::empty(output_shape, tensors.front().options().device(output_device)); TORCH_CHECK( @@ -623,8 +640,22 @@ Tensor sum_reduce_to_one_device( init_p2p_access(); return sum_reduce_to_one(input_tensors, target_device); -}; +} +Tensor merge_pooled_embeddings_meta( + std::vector pooled_embeddings, + int64_t uncat_dim_size, + at::Device /*target_device*/, + int64_t cat_dim) { + if (pooled_embeddings.size() == 0) { + return at::empty({0}, at::TensorOptions().device("meta")); + } + + auto [output_shape, cumulative_dims, total_cat_dim] = + cat_dim_2d_output_shape(pooled_embeddings, uncat_dim_size, cat_dim); + + return at::empty(output_shape, pooled_embeddings.front().options()); +} } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { @@ -638,4 +669,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "sum_reduce_to_one(Tensor[] input_tensors, Device target_device) -> Tensor"); DISPATCH_TO_CUDA("sum_reduce_to_one", fbgemm_gpu::sum_reduce_to_one_device); + DISPATCH_TO_META( + "merge_pooled_embeddings", fbgemm_gpu::merge_pooled_embeddings_meta); } diff --git a/fbgemm_gpu/test/merge_pooled_embeddings_test.py b/fbgemm_gpu/test/merge_pooled_embeddings_test.py index 65822921a3..0e4b7986e7 100644 --- a/fbgemm_gpu/test/merge_pooled_embeddings_test.py +++ b/fbgemm_gpu/test/merge_pooled_embeddings_test.py @@ -8,6 +8,7 @@ # pyre-unsafe import unittest +from typing import Tuple import hypothesis.strategies as st import torch @@ -29,6 +30,8 @@ open_source = False +typed_gpu_unavailable: Tuple[bool, str] = gpu_unavailable + @unittest.skipIf(*gpu_unavailable) @unittest.skipIf(open_source, "Not supported in open source yet") @@ -154,6 +157,32 @@ def test_sum_reduce_to_one( cuda_output.cpu(), torch.stack(inputs).sum(dim=0) ) + @unittest.skipIf(*typed_gpu_unavailable) + def test_merge_pooled_embeddings_meta(self) -> None: + """ + Test that merge_pooled_embeddings works with meta tensor and + dynamo export mode + """ + uncat_size = 2 + cat_dim = 1 + pooled_embeddings = [torch.ones(uncat_size, 4), torch.ones(uncat_size, 8)] + + def fbgemm_merge_pooled_embeddings(device): + pooled_embeddings_device = [ + pooled_embedding.to(device) for pooled_embedding in pooled_embeddings + ] + return torch.ops.fbgemm.merge_pooled_embeddings( + pooled_embeddings_device, uncat_size, device, cat_dim + ) + + output_cpu = fbgemm_merge_pooled_embeddings(torch.device("cpu")) + output_meta = fbgemm_merge_pooled_embeddings(torch.device("meta")) + + self.assertFalse(output_meta.is_cpu) + self.assertTrue(output_meta.is_meta) + + assert output_meta.shape == output_cpu.shape + if __name__ == "__main__": unittest.main() From ae529023563872a36c7dffae2fc6c7b8571a679d Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Thu, 12 Oct 2023 08:10:54 -0700 Subject: [PATCH 76/94] Turn on generate_opcheck_tests for jagged_tensor_ops_test.py (#2070) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2070 This diff adds autogenerated opcheck tests to the jagged tensor operator tests. We skip some torch.compile and gradcheck tests because those take an unreasonable amount of time to run. 
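A minimal sketch of how the decorator added in this patch is applied and how the generated test names are formed. The class and method names below are hypothetical stand-ins for the real JaggedTensorOpsTest methods; the variant names mirror the list passed to optests.generate_opcheck_tests in test_utils.py, and the naming pattern matches the failures_dict.json entries added below.

```
import unittest

from fbgemm_gpu.test.test_utils import generate_opcheck_tests


@generate_opcheck_tests
class MyOpTest(unittest.TestCase):
    # A failures_dict.json is expected next to the test file (as in this patch)
    # to record known-bad combinations as "xfail" or "skip".
    def test_my_op(self) -> None:
        # body exercises some torch.ops.fbgemm.* operator
        ...


# For each original test method, variants such as
#   MyOpTest.test_schema__test_my_op
#   MyOpTest.test_faketensor__test_my_op
#   MyOpTest.test_aot_dispatch_dynamic__test_my_op
# are generated and run alongside the original test.
```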
Reviewed By: ezyang Differential Revision: D50129561 fbshipit-source-id: e03595488a7162c9cdff4a70ff06bc3d5ebad521 --- fbgemm_gpu/test/failures_dict.json | 334 +++++++++++++++++++++- fbgemm_gpu/test/jagged_tensor_ops_test.py | 41 ++- fbgemm_gpu/test/test_utils.py | 55 +++- 3 files changed, 423 insertions(+), 7 deletions(-) diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index 7a7e1d40fa..28b1a6db69 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -23,6 +23,7 @@ "status": "xfail" } }, + "fbgemm::batched_dense_vec_jagged_2d_mul": {}, "fbgemm::block_bucketize_sparse_features": { "SparseOpsTest.test_aot_dispatch_dynamic__test_block_bucketize_sparse_features": { "comment": "", @@ -103,6 +104,70 @@ "status": "xfail" } }, + "fbgemm::dense_to_jagged": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_meta_backend": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_opt_large_batch": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_meta_backend": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_opt_large_batch": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_meta_backend": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_opt_large_batch": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::expand_into_jagged_permute": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_expand_into_jagged_permute": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_expand_into_jagged_permute": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_expand_into_jagged_permute": { + "comment": "", + "status": "xfail" + } + }, "fbgemm::generic_histogram_binning_calibration_by_feature": { "SparseOpsTest.test_aot_dispatch_dynamic__test_generic_histogram_binning_calibration_by_feature": { "comment": "", @@ -181,7 +246,242 @@ "status": "xfail" } }, - "fbgemm::offsets_range": {}, + "fbgemm::jagged_1d_to_dense": {}, + "fbgemm::jagged_1d_to_truncated_values": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_1d_to_truncated_values": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_1d_to_truncated_values": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_1d_to_truncated_values": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::jagged_2d_to_dense": {}, + "fbgemm::jagged_dense_bmm": {}, + 
"fbgemm::jagged_dense_dense_elementwise_add_jagged_output": {}, + "fbgemm::jagged_dense_elementwise_add": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::jagged_dense_elementwise_add_jagged_output": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_elementwise_binary_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_elementwise_binary": { + "comment": "This is a real failure, it just doesn't fail under all situations", + "status": "skip" + } + }, + "fbgemm::jagged_dense_elementwise_mul": {}, + "fbgemm::jagged_hash_size_cumsum": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::jagged_index_select": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_index_select_2d": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_index_select_2d_in_inference": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_index_select_2d": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_index_select_2d_in_inference": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_index_select_2d": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_index_select_2d_in_inference": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::jagged_jagged_bmm": {}, + "fbgemm::jagged_slice": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_slice": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_slice": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_slice": { + "comment": "", + "status": "xfail" + } + }, + 
"fbgemm::jagged_softmax": {}, + "fbgemm::jagged_to_padded_dense": {}, + "fbgemm::jagged_unique_indices": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_unique_indices_empty": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_unique_indices_multi_keys": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices_empty": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices_multi_keys": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices_empty": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices_multi_keys": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::keyed_jagged_index_select_dim1": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_autograd_registration__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::masked_select_jagged_1d": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_masked_select_jagged_1d": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_masked_select_jagged_1d": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_masked_select_jagged_1d": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::offsets_range": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_1d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_2d_to_dense_truncation": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_1d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_2d_to_dense_truncation": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + } + }, "fbgemm::pack_segments": {}, "fbgemm::permute102_baddbmm_permute102": { "SparseOpsTest.test_aot_dispatch_dynamic__test_permute102_baddbmm_permute102": { @@ -372,6 +672,38 @@ "comment": "", "status": "xfail" } + }, + "fbgemm::stacked_jagged_1d_to_dense": { + 
"JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_stacked_jagged_1d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_1d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_stacked_jagged_1d_to_dense": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::stacked_jagged_2d_to_dense": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_autograd_registration__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + } } } } diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index e202268f67..1062aaf8f3 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -10,12 +10,13 @@ import itertools import random import unittest -from typing import List, Tuple +from typing import Any, Callable, List, Tuple, Union import hypothesis.strategies as st import numpy as np import torch import torch._dynamo +import torch.testing._internal.optests as optests from hypothesis import assume, given, settings, Verbosity try: @@ -24,6 +25,7 @@ # pyre-ignore[21] from test_utils import ( + generate_opcheck_tests, gpu_available, gpu_unavailable, on_arm_platform, @@ -34,6 +36,7 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") from fbgemm_gpu.test.test_utils import ( + generate_opcheck_tests, gpu_available, gpu_unavailable, on_arm_platform, @@ -49,6 +52,26 @@ def lengths_to_segment_ids(lengths: torch.Tensor) -> torch.Tensor: ) +# Version of torch.autograd.gradcheck that works with generate_opcheck_tests. +# The problem with just torch.autograd.gradcheck is that it results in +# very slow tests when composed with generate_opcheck_tests. +def gradcheck( + # pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. + f: Callable, + # pyre-ignore[2] + inputs: Union[torch.Tensor, Tuple[Any, ...]], + *args: Any, + **kwargs: Any, +) -> None: + if optests.is_inside_opcheck_mode(): + if isinstance(inputs, torch.Tensor): + f(inputs) + else: + f(*inputs) + return + torch.autograd.gradcheck(f, inputs, *args, **kwargs) + + # Converts lengths + values format to COO format # [B], [N] -> [B, N']. 
# pyre-ignore Missing return annotation [3] @@ -108,6 +131,7 @@ def hash_size_cumsum_to_offsets(hash_size_cum_sum_list: List[int]) -> List[int]: return hash_size_offsets_list +@generate_opcheck_tests class JaggedTensorOpsTest(unittest.TestCase): def setUp(self) -> None: if symint_vector_unsupported()[0]: @@ -295,6 +319,7 @@ def test_jagged_2d_to_dense_truncation(self) -> None: output_values.backward(ref_output_values) torch.testing.assert_close(expected_grad, values.grad) + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @settings( verbosity=Verbosity.verbose, @@ -507,6 +532,7 @@ def test_jagged_1d_to_dense_truncation(self) -> None: ) torch.testing.assert_close(ref_output, output) + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @settings( verbosity=Verbosity.verbose, @@ -901,6 +927,7 @@ def test_dense_to_jagged_meta_backend( # verify forward assert dense.size() == dense2.size() + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @given( num_jagged_dim=st.integers(1, 5), @@ -1045,7 +1072,7 @@ def test_jagged_to_padded_dense( torch.testing.assert_close(output, output_ref) - torch.autograd.gradcheck( + gradcheck( torch.ops.fbgemm.jagged_to_padded_dense, ( x_values.double().requires_grad_(True), @@ -1171,7 +1198,7 @@ def mul_func(*args) -> torch.Tensor: f = mul_func - torch.autograd.gradcheck( + gradcheck( f, ( x_values.double().requires_grad_(True), @@ -1239,6 +1266,7 @@ def test_jagged_elementwise_binary_opt( device_type, ) + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @given( num_jagged_dim=st.integers(1, 5), @@ -1396,7 +1424,7 @@ def add_jagged_output_func(*args) -> torch.Tensor: f = add_jagged_output_func - torch.autograd.gradcheck( + gradcheck( f, ( x_values.double().requires_grad_(True), @@ -1512,6 +1540,7 @@ def test_jagged_dense_dense_elementwise_add_jagged_output_meta_backend( assert output.size() == output_ref.size() + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @given( num_jagged_dim=st.integers(1, 4), @@ -1647,7 +1676,7 @@ def test_batched_dense_vec_jagged_2d_mul( atol=1e-2 if dtype in [torch.half, torch.bfloat16] else None, ) - torch.autograd.gradcheck( + gradcheck( torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul, ( dense.clone().detach().double().requires_grad_(True), @@ -1715,6 +1744,7 @@ def test_batched_dense_vec_jagged_2d_mul_meta_backend( ) assert output.size() == output_ref.size() + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @settings( verbosity=Verbosity.verbose, @@ -2436,6 +2466,7 @@ def test_jagged_dense_bmm( torch.testing.assert_close(x_values.grad, x_values_ref.grad) torch.testing.assert_close(y.grad, y_ref.grad) + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @given( B=st.integers(10, 512), diff --git a/fbgemm_gpu/test/test_utils.py b/fbgemm_gpu/test/test_utils.py index 64236296fc..a92fbb4404 100644 --- a/fbgemm_gpu/test/test_utils.py +++ b/fbgemm_gpu/test/test_utils.py @@ -4,13 +4,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+import inspect import math import os import struct import subprocess import unittest from functools import wraps -from typing import Any, Callable, List, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple import hypothesis.strategies as st import numpy as np @@ -202,6 +203,58 @@ def cpu_and_maybe_gpu() -> st.SearchStrategy[List[torch.device]]: ) +# Usage examples: +# +# @generate_opcheck_tests +# class MyOpTest(unittest.TestCase): +# ... +# +# @generate_opcheck_tests(additional_decorators={}) +# class MyOpTest(unittest.TestCase): +# ... +# +# pyre-ignore[3] +def generate_opcheck_tests( + test_class: Optional[unittest.TestCase] = None, + *, + # pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. + additional_decorators: Optional[Dict[str, Callable]] = None, +): + if additional_decorators is None: + additional_decorators = {} + + def decorator(test_class: unittest.TestCase) -> unittest.TestCase: + if not torch.__version__ >= "2.2.*": + return test_class + import torch.testing._internal.optests as optests + from torch._utils_internal import get_file_path_2 + + filename = inspect.getfile(test_class) + failures_dict_path = get_file_path_2( + "", os.path.dirname(filename), "failures_dict.json" + ) + optests.generate_opcheck_tests( + test_class, + ["fb", "fbgemm"], + failures_dict_path, + # pyre-ignore[6] + additional_decorators, + [ + "test_schema", + "test_autograd_registration", + "test_faketensor", + "test_aot_dispatch_static", + "test_aot_dispatch_dynamic", + ], + ) + return test_class + + if test_class is None: + return decorator + else: + return decorator(test_class) + + def cpu_only() -> st.SearchStrategy[List[torch.device]]: return st.sampled_from([torch.device("cpu")]) From 2b6ff684a4d1b59ec3b650510a050df52c347ab8 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Thu, 12 Oct 2023 13:02:55 -0700 Subject: [PATCH 77/94] Revert D50129561: Turn on generate_opcheck_tests for jagged_tensor_ops_test.py Differential Revision: D50129561 Original commit changeset: e03595488a71 Original Phabricator Diff: D50129561 fbshipit-source-id: 1b7ccc2c8e781cfdc635eeb7d83129335f071ce1 --- fbgemm_gpu/test/failures_dict.json | 334 +--------------------- fbgemm_gpu/test/jagged_tensor_ops_test.py | 41 +-- fbgemm_gpu/test/test_utils.py | 55 +--- 3 files changed, 7 insertions(+), 423 deletions(-) diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index 28b1a6db69..7a7e1d40fa 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -23,7 +23,6 @@ "status": "xfail" } }, - "fbgemm::batched_dense_vec_jagged_2d_mul": {}, "fbgemm::block_bucketize_sparse_features": { "SparseOpsTest.test_aot_dispatch_dynamic__test_block_bucketize_sparse_features": { "comment": "", @@ -104,70 +103,6 @@ "status": "xfail" } }, - "fbgemm::dense_to_jagged": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_meta_backend": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_opt": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_opt_large_batch": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged": { - "comment": "", - "status": "xfail" - }, - 
"JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_meta_backend": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_opt": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_opt_large_batch": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_meta_backend": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_opt": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_opt_large_batch": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::expand_into_jagged_permute": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_expand_into_jagged_permute": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_expand_into_jagged_permute": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_expand_into_jagged_permute": { - "comment": "", - "status": "xfail" - } - }, "fbgemm::generic_histogram_binning_calibration_by_feature": { "SparseOpsTest.test_aot_dispatch_dynamic__test_generic_histogram_binning_calibration_by_feature": { "comment": "", @@ -246,242 +181,7 @@ "status": "xfail" } }, - "fbgemm::jagged_1d_to_dense": {}, - "fbgemm::jagged_1d_to_truncated_values": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_1d_to_truncated_values": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_1d_to_truncated_values": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_jagged_1d_to_truncated_values": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::jagged_2d_to_dense": {}, - "fbgemm::jagged_dense_bmm": {}, - "fbgemm::jagged_dense_dense_elementwise_add_jagged_output": {}, - "fbgemm::jagged_dense_elementwise_add": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_elementwise_binary": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::jagged_dense_elementwise_add_jagged_output": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_elementwise_binary": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_elementwise_binary_opt": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary_opt": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary_opt": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_jagged_elementwise_binary": { - "comment": "This is a real failure, it just doesn't fail under all situations", - "status": "skip" - } - }, - 
"fbgemm::jagged_dense_elementwise_mul": {}, - "fbgemm::jagged_hash_size_cumsum": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_unique_indices": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::jagged_index_select": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_index_select_2d": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_index_select_2d_in_inference": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_keyed_jagged_index_select_dim1": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_index_select_2d": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_index_select_2d_in_inference": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_keyed_jagged_index_select_dim1": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_jagged_index_select_2d": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_jagged_index_select_2d_in_inference": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_keyed_jagged_index_select_dim1": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::jagged_jagged_bmm": {}, - "fbgemm::jagged_slice": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_slice": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_slice": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_jagged_slice": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::jagged_softmax": {}, - "fbgemm::jagged_to_padded_dense": {}, - "fbgemm::jagged_unique_indices": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_unique_indices": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_unique_indices_empty": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_unique_indices_multi_keys": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices_empty": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices_multi_keys": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices_empty": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices_multi_keys": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::keyed_jagged_index_select_dim1": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_keyed_jagged_index_select_dim1": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_keyed_jagged_index_select_dim1": { - "comment": "", - 
"status": "xfail" - }, - "JaggedTensorOpsTest.test_autograd_registration__test_keyed_jagged_index_select_dim1": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_keyed_jagged_index_select_dim1": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::masked_select_jagged_1d": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_masked_select_jagged_1d": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_masked_select_jagged_1d": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_masked_select_jagged_1d": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::offsets_range": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_1d_to_dense": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_2d_to_dense": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_2d_to_dense_truncation": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_stacked_jagged_2d_to_dense": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_1d_to_dense": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_2d_to_dense": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_2d_to_dense_truncation": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_2d_to_dense": { - "comment": "", - "status": "xfail" - } - }, + "fbgemm::offsets_range": {}, "fbgemm::pack_segments": {}, "fbgemm::permute102_baddbmm_permute102": { "SparseOpsTest.test_aot_dispatch_dynamic__test_permute102_baddbmm_permute102": { @@ -672,38 +372,6 @@ "comment": "", "status": "xfail" } - }, - "fbgemm::stacked_jagged_1d_to_dense": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_stacked_jagged_1d_to_dense": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_1d_to_dense": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_stacked_jagged_1d_to_dense": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::stacked_jagged_2d_to_dense": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_stacked_jagged_2d_to_dense": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_2d_to_dense": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_autograd_registration__test_stacked_jagged_2d_to_dense": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_stacked_jagged_2d_to_dense": { - "comment": "", - "status": "xfail" - } } } } diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index 1062aaf8f3..e202268f67 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -10,13 +10,12 @@ import itertools import random import unittest -from typing import Any, Callable, List, Tuple, Union +from typing import List, Tuple import hypothesis.strategies as st import numpy as np import torch import torch._dynamo -import torch.testing._internal.optests as optests from hypothesis import assume, given, settings, Verbosity try: @@ -25,7 +24,6 @@ # pyre-ignore[21] 
from test_utils import ( - generate_opcheck_tests, gpu_available, gpu_unavailable, on_arm_platform, @@ -36,7 +34,6 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") from fbgemm_gpu.test.test_utils import ( - generate_opcheck_tests, gpu_available, gpu_unavailable, on_arm_platform, @@ -52,26 +49,6 @@ def lengths_to_segment_ids(lengths: torch.Tensor) -> torch.Tensor: ) -# Version of torch.autograd.gradcheck that works with generate_opcheck_tests. -# The problem with just torch.autograd.gradcheck is that it results in -# very slow tests when composed with generate_opcheck_tests. -def gradcheck( - # pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. - f: Callable, - # pyre-ignore[2] - inputs: Union[torch.Tensor, Tuple[Any, ...]], - *args: Any, - **kwargs: Any, -) -> None: - if optests.is_inside_opcheck_mode(): - if isinstance(inputs, torch.Tensor): - f(inputs) - else: - f(*inputs) - return - torch.autograd.gradcheck(f, inputs, *args, **kwargs) - - # Converts lengths + values format to COO format # [B], [N] -> [B, N']. # pyre-ignore Missing return annotation [3] @@ -131,7 +108,6 @@ def hash_size_cumsum_to_offsets(hash_size_cum_sum_list: List[int]) -> List[int]: return hash_size_offsets_list -@generate_opcheck_tests class JaggedTensorOpsTest(unittest.TestCase): def setUp(self) -> None: if symint_vector_unsupported()[0]: @@ -319,7 +295,6 @@ def test_jagged_2d_to_dense_truncation(self) -> None: output_values.backward(ref_output_values) torch.testing.assert_close(expected_grad, values.grad) - @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @settings( verbosity=Verbosity.verbose, @@ -532,7 +507,6 @@ def test_jagged_1d_to_dense_truncation(self) -> None: ) torch.testing.assert_close(ref_output, output) - @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @settings( verbosity=Verbosity.verbose, @@ -927,7 +901,6 @@ def test_dense_to_jagged_meta_backend( # verify forward assert dense.size() == dense2.size() - @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @given( num_jagged_dim=st.integers(1, 5), @@ -1072,7 +1045,7 @@ def test_jagged_to_padded_dense( torch.testing.assert_close(output, output_ref) - gradcheck( + torch.autograd.gradcheck( torch.ops.fbgemm.jagged_to_padded_dense, ( x_values.double().requires_grad_(True), @@ -1198,7 +1171,7 @@ def mul_func(*args) -> torch.Tensor: f = mul_func - gradcheck( + torch.autograd.gradcheck( f, ( x_values.double().requires_grad_(True), @@ -1266,7 +1239,6 @@ def test_jagged_elementwise_binary_opt( device_type, ) - @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @given( num_jagged_dim=st.integers(1, 5), @@ -1424,7 +1396,7 @@ def add_jagged_output_func(*args) -> torch.Tensor: f = add_jagged_output_func - gradcheck( + torch.autograd.gradcheck( f, ( x_values.double().requires_grad_(True), @@ -1540,7 +1512,6 @@ def test_jagged_dense_dense_elementwise_add_jagged_output_meta_backend( assert output.size() == output_ref.size() - @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @given( num_jagged_dim=st.integers(1, 4), @@ -1676,7 +1647,7 @@ def test_batched_dense_vec_jagged_2d_mul( atol=1e-2 if dtype 
in [torch.half, torch.bfloat16] else None, ) - gradcheck( + torch.autograd.gradcheck( torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul, ( dense.clone().detach().double().requires_grad_(True), @@ -1744,7 +1715,6 @@ def test_batched_dense_vec_jagged_2d_mul_meta_backend( ) assert output.size() == output_ref.size() - @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @settings( verbosity=Verbosity.verbose, @@ -2466,7 +2436,6 @@ def test_jagged_dense_bmm( torch.testing.assert_close(x_values.grad, x_values_ref.grad) torch.testing.assert_close(y.grad, y_ref.grad) - @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @given( B=st.integers(10, 512), diff --git a/fbgemm_gpu/test/test_utils.py b/fbgemm_gpu/test/test_utils.py index a92fbb4404..64236296fc 100644 --- a/fbgemm_gpu/test/test_utils.py +++ b/fbgemm_gpu/test/test_utils.py @@ -4,14 +4,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import inspect import math import os import struct import subprocess import unittest from functools import wraps -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, List, Tuple import hypothesis.strategies as st import numpy as np @@ -203,58 +202,6 @@ def cpu_and_maybe_gpu() -> st.SearchStrategy[List[torch.device]]: ) -# Usage examples: -# -# @generate_opcheck_tests -# class MyOpTest(unittest.TestCase): -# ... -# -# @generate_opcheck_tests(additional_decorators={}) -# class MyOpTest(unittest.TestCase): -# ... -# -# pyre-ignore[3] -def generate_opcheck_tests( - test_class: Optional[unittest.TestCase] = None, - *, - # pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. - additional_decorators: Optional[Dict[str, Callable]] = None, -): - if additional_decorators is None: - additional_decorators = {} - - def decorator(test_class: unittest.TestCase) -> unittest.TestCase: - if not torch.__version__ >= "2.2.*": - return test_class - import torch.testing._internal.optests as optests - from torch._utils_internal import get_file_path_2 - - filename = inspect.getfile(test_class) - failures_dict_path = get_file_path_2( - "", os.path.dirname(filename), "failures_dict.json" - ) - optests.generate_opcheck_tests( - test_class, - ["fb", "fbgemm"], - failures_dict_path, - # pyre-ignore[6] - additional_decorators, - [ - "test_schema", - "test_autograd_registration", - "test_faketensor", - "test_aot_dispatch_static", - "test_aot_dispatch_dynamic", - ], - ) - return test_class - - if test_class is None: - return decorator - else: - return decorator(test_class) - - def cpu_only() -> st.SearchStrategy[List[torch.device]]: return st.sampled_from([torch.device("cpu")]) From 924f3100090ae855f3d2c4b474715c1ddd606d11 Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Fri, 13 Oct 2023 10:27:37 -0700 Subject: [PATCH 78/94] Add meta implementation for asynchronous_exclusive_cumsum (#2072) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2072 same as title. Follow the same pattern with ``asynchronous_complete_cumsum`` in https://www.internalfb.com/diff/D47834850. 
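
For reference, a minimal sketch of how the new meta kernel is expected to be
exercised, mirroring the sparse_ops_test.py change in this diff. Op loading is
shown in the fbcode style used by the test files and may differ in OSS builds;
the input shape here is arbitrary.

```
import torch

torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")

x = torch.randint(low=0, high=100, size=(8,), dtype=torch.int64)
mx = x.to("meta")

# The real kernel computes the cumulative sum; the meta kernel only
# propagates shape and dtype, which is all the fake-tensor stack needs.
z = torch.ops.fbgemm.asynchronous_exclusive_cumsum(x)
mz = torch.ops.fbgemm.asynchronous_exclusive_cumsum(mx)

assert mz.is_meta
assert z.size() == mz.size()
```
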
Reviewed By: ezyang Differential Revision: D50195064 fbshipit-source-id: c2506688ef01c7e56392f05bc1d0d7ee9a856cab --- fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h | 3 +++ fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp | 8 ++++---- fbgemm_gpu/test/sparse_ops_test.py | 6 +++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h index 9164de0b65..162c4b9ecc 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h @@ -45,6 +45,9 @@ at::Tensor asynchronous_inclusive_cumsum_cpu(const at::Tensor& t_in); ///@ingroup sparse-data-cuda at::Tensor asynchronous_complete_cumsum_meta(const at::Tensor& t_in); +///@ingroup sparse-data-cuda +at::Tensor asynchronous_exclusive_cumsum_meta(const at::Tensor& t_in); + ///@ingroup sparse-data-cuda at::Tensor offsets_range_cuda(const at::Tensor& offsets, int64_t range_size); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp index 85e2f9c60b..a82b96e71c 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp @@ -31,6 +31,10 @@ Tensor asynchronous_complete_cumsum_meta(const Tensor& t_in) { return output; } +Tensor asynchronous_exclusive_cumsum_meta(const Tensor& t_in) { + return at::zeros_symint(t_in.sym_sizes(), t_in.options()); +} + namespace { Tensor pack_segments_forward_meta( @@ -77,10 +81,6 @@ Tensor asynchronous_inclusive_cumsum_meta(const Tensor& t_in) { return at::empty_symint(t_in.sym_sizes(), t_in.options()); } -Tensor asynchronous_exclusive_cumsum_meta(const Tensor& t_in) { - return at::empty_symint(t_in.sym_sizes(), t_in.options()); -} - } // namespace } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index b4c1143157..8418f966de 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -610,11 +610,11 @@ def test_cumsum(self, n: int, long_index: bool) -> None: # meta tests mx = torch.randint(low=0, high=100, size=(n,)).type(index_dtype).to("meta") - # mze = torch.ops.fbgemm.asynchronous_exclusive_cumsum(mx) + mze = torch.ops.fbgemm.asynchronous_exclusive_cumsum(mx) + self.assertEqual(ze.size(), mze.size()) # mzi = torch.ops.fbgemm.asynchronous_inclusive_cumsum(mx) - mzc = torch.ops.fbgemm.asynchronous_complete_cumsum(mx) - # self.assertEqual(ze.size(), mze.size()) # self.assertEqual(zi.size(), mzi.size()) + mzc = torch.ops.fbgemm.asynchronous_complete_cumsum(mx) self.assertEqual(zc.size(), mzc.size()) if gpu_available: From 70c6e83c29f67278751abd0e28433c50743ccbe9 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 17 Oct 2023 00:05:15 -0700 Subject: [PATCH 79/94] Fix heap-buffer-overflow in `radix_sort_parallel ` (#2075) Summary: Setting `histogram_ps[RDX_HIST_SIZE * (nthreads - 1) + 127] = offset;` in `combine_prefix_sum_for_msb` is guaranteed to result in `heap-buffer-overflow` if bucket is not empty during the scatter stage (as all values of `histogram_ps` should be strictly less than `element_count` Factor out common code from `RadixSortTest.cc` into `test_tempalte` and add regression test for buffer overflow, which before the test will fail as follows: ``` [ RUN ] cpuKernelTest.raidx_sort_heap_overflow /home/nshulga/git/pytorch/FBGEMM/test/RadixSortTest.cc:36: Failure Expected equality of these values: expected_keys Which is: { 2, 3, 5, -1, -1, 2147483647, 2147483647, 2147483647 } keys Which is: { 
-1, -1, -1, -1, -1, -1, -1, -1 } /home/nshulga/git/pytorch/FBGEMM/test/RadixSortTest.cc:37: Failure Expected equality of these values: expected_values Which is: { 1, 4, 6, 7, 8, 2, 3, 5 } values Which is: { 2147483647, 4, 6, 7, 8, 6, 7, 8 } [ FAILED ] cpuKernelTest.raidx_sort_heap_overflow (0 ms) ``` Will fix https://github.com/pytorch/pytorch/issues/111189 once FBGEMM is updated to the correct version Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2075 Reviewed By: kit1980, jianyuh Differential Revision: D50256504 Pulled By: malfet fbshipit-source-id: f805607595e324999cea07dcacdee8317a008221 --- src/Utils.cc | 8 ++-- test/RadixSortTest.cc | 91 +++++++++++++++++++++++++------------------ 2 files changed, 57 insertions(+), 42 deletions(-) diff --git a/src/Utils.cc b/src/Utils.cc index 9403070c41..c61b1a4dd1 100644 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -623,7 +623,6 @@ void combine_prefix_sum( int64_t offset = 0; update_prefsum_and_offset_in_range( offset, 0, RDX_HIST_SIZE, nthreads, histogram, histogram_ps); - histogram_ps[RDX_HIST_SIZE * nthreads] = offset; // TODO(DamianSzwichtenberg): Is assert sufficient? In most cases, it will // work only in debug build. assert(offset == elements_count); @@ -641,7 +640,6 @@ void combine_prefix_sum_for_msb( offset, 128, RDX_HIST_SIZE, nthreads, histogram, histogram_ps); update_prefsum_and_offset_in_range( offset, 0, 128, nthreads, histogram, histogram_ps); - histogram_ps[RDX_HIST_SIZE * (nthreads - 1) + 127] = offset; // TODO(DamianSzwichtenberg): Is assert sufficient? In most cases, it will // work only in debug build. assert(offset == elements_count); @@ -760,13 +758,13 @@ std::pair radix_sort_parallel( const size_t array_size = (size_t)RDX_HIST_SIZE * maxthreads; // fixes MSVC error C2131 auto* const histogram = static_cast( - fbgemm::fbgemmAlignedAlloc(64, (array_size) * sizeof(int64_t))); + fbgemm::fbgemmAlignedAlloc(64, array_size * sizeof(int64_t))); auto* const histogram_ps = static_cast( - fbgemm::fbgemmAlignedAlloc(64, (array_size + 1) * sizeof(int64_t))); + fbgemm::fbgemmAlignedAlloc(64, array_size * sizeof(int64_t))); #else alignas(64) int64_t histogram[RDX_HIST_SIZE * maxthreads]; - alignas(64) int64_t histogram_ps[RDX_HIST_SIZE * maxthreads + 1]; + alignas(64) int64_t histogram_ps[RDX_HIST_SIZE * maxthreads]; #endif // If negative values are present, we want to perform all passes // up to a sign bit diff --git a/test/RadixSortTest.cc b/test/RadixSortTest.cc index aa93584539..b62e07ea5f 100644 --- a/test/RadixSortTest.cc +++ b/test/RadixSortTest.cc @@ -11,53 +11,70 @@ #include #include "fbgemm/Utils.h" +#ifdef _OPENMP +#include +#endif -TEST(cpuKernelTest, radix_sort_parallel_test) { - std::array keys = {1, 2, 4, 5, 4, 3, 2, 9}; - std::array values = {0, 0, 0, 0, 1, 1, 1, 1}; - - std::array keys_tmp; - std::array values_tmp; - +namespace { +template +void test_template( + std::array keys, + std::array values, + std::array expected_keys, + std::array expected_values, + T max_val = std::numeric_limits::max(), + bool may_be_neg = std::is_signed_v) { + std::array keys_tmp; + std::array values_tmp; const auto [sorted_keys, sorted_values] = fbgemm::radix_sort_parallel( keys.data(), values.data(), keys_tmp.data(), values_tmp.data(), keys.size(), - 10); - - std::array expect_keys_tmp = {1, 2, 2, 3, 4, 4, 5, 9}; - std::array expect_values_tmp = {0, 0, 1, 1, 0, 1, 0, 1}; - EXPECT_EQ(sorted_keys, keys_tmp.data()); - EXPECT_EQ(sorted_values, values_tmp.data()); - EXPECT_EQ(keys_tmp, expect_keys_tmp); - EXPECT_EQ(values_tmp, 
expect_values_tmp); + max_val, + may_be_neg); + if (sorted_keys == keys.data()) { // even number of passes + EXPECT_EQ(expected_keys, keys); + EXPECT_EQ(expected_values, values); + } else { // odd number of passes + EXPECT_EQ(expected_keys, keys_tmp); + EXPECT_EQ(expected_values, values_tmp); + } } -TEST(cpuKernelTest, radix_sort_parallel_test_neg_vals) { - std::array keys = {-4, -3, 0, 1, -2, -1, 3, 2}; - std::array values = {0, 0, 0, 0, 1, 1, 1, 1}; +} // anonymous namespace - std::array keys_tmp; - std::array values_tmp; +TEST(cpuKernelTest, radix_sort_parallel_test) { + test_template( + {1, 2, 4, 5, 4, 3, 2, 9}, + {0, 0, 0, 0, 1, 1, 1, 1}, + {1, 2, 2, 3, 4, 4, 5, 9}, + {0, 0, 1, 1, 0, 1, 0, 1}, + 10, + false); +} - const auto [sorted_keys, sorted_values] = fbgemm::radix_sort_parallel( - keys.data(), - values.data(), - keys_tmp.data(), - values_tmp.data(), - keys.size(), - std::numeric_limits::max(), - /*maybe_with_neg_vals=*/true); +TEST(cpuKernelTest, radix_sort_parallel_test_neg_vals) { + test_template( + {-4, -3, 0, 1, -2, -1, 3, 2}, + {0, 0, 0, 0, 1, 1, 1, 1}, + {-4, -3, -2, -1, 0, 1, 2, 3}, + {0, 0, 1, 1, 0, 0, 1, 1}); +} - std::array expect_keys_tmp = {-4, -3, -2, -1, 0, 1, 2, 3}; - std::array expect_values_tmp = {0, 0, 1, 1, 0, 0, 1, 1}; - if (sorted_keys == keys.data()) { // even number of passes - EXPECT_EQ(expect_keys_tmp, keys); - EXPECT_EQ(expect_values_tmp, values); - } else { // odd number of passes - EXPECT_EQ(expect_keys_tmp, keys_tmp); - EXPECT_EQ(expect_values_tmp, values_tmp); - } +TEST(cpuKernelTest, raidx_sort_heap_overflow) { +#ifdef _OPENMP + const auto orig_threads = omp_get_num_threads(); + omp_set_num_threads(1); +#endif + constexpr auto max = std::numeric_limits::max(); + test_template( + {-1, max, max, -1, max, -1, -1, -1}, + {1, 2, 3, 4, 5, 6, 7, 8}, + {-1, -1, -1, -1, -1, max, max, max}, + {1, 4, 6, 7, 8, 2, 3, 5}); +#ifdef _OPENMP + omp_set_num_threads(orig_threads); +#endif } From daf9042e6da9a6348f62323bbcc356cea7d1444c Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 17 Oct 2023 15:44:01 -0700 Subject: [PATCH 80/94] Turn on generate_opcheck_tests for jagged_tensor_ops_test.py (#2073) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2073 This PR turns on automatically generated opcheck tests for jagged_tensor_ops_test.py. I reverted the original one because the fbgemm OSS CI failures were real. 
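
For reference, a minimal sketch of the opt-in pattern this enables. The real
wiring is the `optests` helper added to test_utils.py and the entries in
failures_dict.json in this diff; the class name and test body below are
simplified stand-ins, and the op loading / import paths follow the fbcode
convention used by jagged_tensor_ops_test.py (OSS builds import test_utils
directly).

```
import unittest

import torch

torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")

from fbgemm_gpu.test.test_utils import optests


# A failures_dict.json is expected to live next to this file (see the one
# extended in this diff) so generated tests can be marked "xfail" or "skip".
@optests.generate_opcheck_tests
class ExampleJaggedOpsTest(unittest.TestCase):
    # For every test_* method that calls a torch.ops.fbgemm.* operator, the
    # decorator generates test_schema__*, test_autograd_registration__*,
    # test_faketensor__*, test_aot_dispatch_static__*, and
    # test_aot_dispatch_dynamic__* variants.
    def test_jagged_2d_to_dense(self) -> None:
        values = torch.rand(6, 3)
        offsets = torch.tensor([0, 2, 6], dtype=torch.int64)
        dense = torch.ops.fbgemm.jagged_2d_to_dense(values, offsets, 4)
        self.assertEqual(dense.shape, (2, 4, 3))


if __name__ == "__main__":
    unittest.main()
```
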
Reviewed By: ezyang Differential Revision: D50241452 fbshipit-source-id: 98192cbbb7d0e4e72c3cafd3c85ab6ad3a166da1 --- fbgemm_gpu/test/failures_dict.json | 343 +++++++++++++++++++++- fbgemm_gpu/test/jagged_tensor_ops_test.py | 36 ++- fbgemm_gpu/test/test_utils.py | 99 ++++++- 3 files changed, 471 insertions(+), 7 deletions(-) diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index 7a7e1d40fa..e4da2429b9 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -23,6 +23,7 @@ "status": "xfail" } }, + "fbgemm::batched_dense_vec_jagged_2d_mul": {}, "fbgemm::block_bucketize_sparse_features": { "SparseOpsTest.test_aot_dispatch_dynamic__test_block_bucketize_sparse_features": { "comment": "", @@ -103,6 +104,70 @@ "status": "xfail" } }, + "fbgemm::dense_to_jagged": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_meta_backend": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_opt_large_batch": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_meta_backend": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_opt_large_batch": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_meta_backend": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_opt_large_batch": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::expand_into_jagged_permute": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_expand_into_jagged_permute": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_expand_into_jagged_permute": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_expand_into_jagged_permute": { + "comment": "", + "status": "xfail" + } + }, "fbgemm::generic_histogram_binning_calibration_by_feature": { "SparseOpsTest.test_aot_dispatch_dynamic__test_generic_histogram_binning_calibration_by_feature": { "comment": "", @@ -181,7 +246,251 @@ "status": "xfail" } }, - "fbgemm::offsets_range": {}, + "fbgemm::jagged_1d_to_dense": {}, + "fbgemm::jagged_1d_to_truncated_values": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_1d_to_truncated_values": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_1d_to_truncated_values": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_1d_to_truncated_values": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::jagged_2d_to_dense": {}, + "fbgemm::jagged_dense_bmm": {}, + 
"fbgemm::jagged_dense_dense_elementwise_add_jagged_output": {}, + "fbgemm::jagged_dense_elementwise_add": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::jagged_dense_elementwise_add_jagged_output": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_elementwise_binary_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_elementwise_binary": { + "comment": "This is a real failure, it just doesn't fail under all situations", + "status": "skip" + } + }, + "fbgemm::jagged_dense_elementwise_mul": {}, + "fbgemm::jagged_hash_size_cumsum": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::jagged_index_select": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_index_select_2d": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_index_select_2d_in_inference": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_index_select_2d": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_index_select_2d_in_inference": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_index_select_2d": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_index_select_2d_in_inference": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::jagged_jagged_bmm": {}, + "fbgemm::jagged_slice": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_slice": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_slice": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_slice": { + "comment": "", + "status": "xfail" + } + }, + 
"fbgemm::jagged_softmax": {}, + "fbgemm::jagged_to_padded_dense": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_to_padded_dense": { + "comment": "seems nondeterministic, but error is real", + "status": "skip" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_to_padded_dense": { + "comment": "seems nondeterministic but error is real", + "status": "skip" + } + }, + "fbgemm::jagged_unique_indices": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_unique_indices_empty": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_unique_indices_multi_keys": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices_empty": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices_multi_keys": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices_empty": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices_multi_keys": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::keyed_jagged_index_select_dim1": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_autograd_registration__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::masked_select_jagged_1d": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_masked_select_jagged_1d": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_masked_select_jagged_1d": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_masked_select_jagged_1d": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::offsets_range": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_1d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_2d_to_dense_truncation": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_1d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_2d_to_dense_truncation": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": 
"xfail" + } + }, "fbgemm::pack_segments": {}, "fbgemm::permute102_baddbmm_permute102": { "SparseOpsTest.test_aot_dispatch_dynamic__test_permute102_baddbmm_permute102": { @@ -372,6 +681,38 @@ "comment": "", "status": "xfail" } + }, + "fbgemm::stacked_jagged_1d_to_dense": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_stacked_jagged_1d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_1d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_stacked_jagged_1d_to_dense": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::stacked_jagged_2d_to_dense": { + "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_autograd_registration__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_faketensor__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + } } } } diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index e202268f67..ddd4464a7e 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -16,7 +16,7 @@ import numpy as np import torch import torch._dynamo -from hypothesis import assume, given, settings, Verbosity +from hypothesis import assume, given, HealthCheck, settings, Verbosity try: # pyre-ignore[21] @@ -26,7 +26,9 @@ from test_utils import ( gpu_available, gpu_unavailable, + gradcheck, on_arm_platform, + optests, symint_vector_unsupported, TEST_WITH_ROCM, ) @@ -36,12 +38,28 @@ from fbgemm_gpu.test.test_utils import ( gpu_available, gpu_unavailable, + gradcheck, on_arm_platform, + optests, symint_vector_unsupported, TEST_WITH_ROCM, ) +suppressed_list: List[HealthCheck] = ( + # pyre-fixme[16]: Module `HealthCheck` has no attribute `differing_executors`. 
+ [HealthCheck.differing_executors] + if getattr(HealthCheck, "differing_executors", False) + else [] +) + +# This health check seems incorrect +settings.register_profile( + "suppress_differing_executors_check", suppress_health_check=suppressed_list +) +settings.load_profile("suppress_differing_executors_check") + + def lengths_to_segment_ids(lengths: torch.Tensor) -> torch.Tensor: return torch.repeat_interleave( torch._dim_arange(lengths, 0).long(), @@ -108,6 +126,7 @@ def hash_size_cumsum_to_offsets(hash_size_cum_sum_list: List[int]) -> List[int]: return hash_size_offsets_list +@optests.generate_opcheck_tests class JaggedTensorOpsTest(unittest.TestCase): def setUp(self) -> None: if symint_vector_unsupported()[0]: @@ -295,6 +314,7 @@ def test_jagged_2d_to_dense_truncation(self) -> None: output_values.backward(ref_output_values) torch.testing.assert_close(expected_grad, values.grad) + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @settings( verbosity=Verbosity.verbose, @@ -507,6 +527,7 @@ def test_jagged_1d_to_dense_truncation(self) -> None: ) torch.testing.assert_close(ref_output, output) + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @settings( verbosity=Verbosity.verbose, @@ -901,6 +922,7 @@ def test_dense_to_jagged_meta_backend( # verify forward assert dense.size() == dense2.size() + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @given( num_jagged_dim=st.integers(1, 5), @@ -1045,7 +1067,7 @@ def test_jagged_to_padded_dense( torch.testing.assert_close(output, output_ref) - torch.autograd.gradcheck( + gradcheck( torch.ops.fbgemm.jagged_to_padded_dense, ( x_values.double().requires_grad_(True), @@ -1171,7 +1193,7 @@ def mul_func(*args) -> torch.Tensor: f = mul_func - torch.autograd.gradcheck( + gradcheck( f, ( x_values.double().requires_grad_(True), @@ -1239,6 +1261,7 @@ def test_jagged_elementwise_binary_opt( device_type, ) + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @given( num_jagged_dim=st.integers(1, 5), @@ -1396,7 +1419,7 @@ def add_jagged_output_func(*args) -> torch.Tensor: f = add_jagged_output_func - torch.autograd.gradcheck( + gradcheck( f, ( x_values.double().requires_grad_(True), @@ -1512,6 +1535,7 @@ def test_jagged_dense_dense_elementwise_add_jagged_output_meta_backend( assert output.size() == output_ref.size() + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @given( num_jagged_dim=st.integers(1, 4), @@ -1647,7 +1671,7 @@ def test_batched_dense_vec_jagged_2d_mul( atol=1e-2 if dtype in [torch.half, torch.bfloat16] else None, ) - torch.autograd.gradcheck( + gradcheck( torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul, ( dense.clone().detach().double().requires_grad_(True), @@ -1715,6 +1739,7 @@ def test_batched_dense_vec_jagged_2d_mul_meta_backend( ) assert output.size() == output_ref.size() + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") @unittest.skipIf(*symint_vector_unsupported()) @settings( verbosity=Verbosity.verbose, @@ -2436,6 +2461,7 @@ def test_jagged_dense_bmm( torch.testing.assert_close(x_values.grad, x_values_ref.grad) torch.testing.assert_close(y.grad, y_ref.grad) + @optests.dontGenerateOpCheckTests("tests that call torch.compile are slow") 
@unittest.skipIf(*symint_vector_unsupported()) @given( B=st.integers(10, 512), diff --git a/fbgemm_gpu/test/test_utils.py b/fbgemm_gpu/test/test_utils.py index 64236296fc..17d96e16ae 100644 --- a/fbgemm_gpu/test/test_utils.py +++ b/fbgemm_gpu/test/test_utils.py @@ -4,13 +4,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import inspect import math import os import struct import subprocess import unittest from functools import wraps -from typing import Any, Callable, List, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import hypothesis.strategies as st import numpy as np @@ -202,6 +203,102 @@ def cpu_and_maybe_gpu() -> st.SearchStrategy[List[torch.device]]: ) +def has_optests() -> bool: + return torch.__version__ >= "2.2.*" + + +class optests: + # Usage examples: + # + # @generate_opcheck_tests + # class MyOpTest(unittest.TestCase): + # ... + # + # @generate_opcheck_tests(additional_decorators={}) + # class MyOpTest(unittest.TestCase): + # ... + # + @staticmethod + # pyre-ignore[3] + def generate_opcheck_tests( + test_class: Optional[unittest.TestCase] = None, + *, + # pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. + additional_decorators: Optional[Dict[str, Callable]] = None, + ): + if additional_decorators is None: + additional_decorators = {} + + def decorator(test_class: unittest.TestCase) -> unittest.TestCase: + if not has_optests(): + return test_class + import torch.testing._internal.optests as optests + from torch._utils_internal import get_file_path_2 + + filename = inspect.getfile(test_class) + failures_dict_path = get_file_path_2( + "", os.path.dirname(filename), "failures_dict.json" + ) + optests.generate_opcheck_tests( + test_class, + ["fb", "fbgemm"], + failures_dict_path, + # pyre-ignore[6] + additional_decorators, + [ + "test_schema", + "test_autograd_registration", + "test_faketensor", + "test_aot_dispatch_static", + "test_aot_dispatch_dynamic", + ], + ) + return test_class + + if test_class is None: + return decorator + else: + return decorator(test_class) + + @staticmethod + def is_inside_opcheck_mode() -> bool: + if not has_optests(): + return False + + import torch.testing._internal.optests as optests + + return optests.is_inside_opcheck_mode() + + @staticmethod + # pyre-ignore[3] + def dontGenerateOpCheckTests(reason: str): + if not has_optests(): + return lambda fun: fun + import torch.testing._internal.optests as optests + + return optests.dontGenerateOpCheckTests(reason) + + +# Version of torch.autograd.gradcheck that works with generate_opcheck_tests. +# The problem with just torch.autograd.gradcheck is that it results in +# very slow tests when composed with generate_opcheck_tests. +def gradcheck( + # pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. 
+ f: Callable, + # pyre-ignore[2] + inputs: Union[torch.Tensor, Tuple[Any, ...]], + *args: Any, + **kwargs: Any, +) -> None: + if optests.is_inside_opcheck_mode(): + if isinstance(inputs, torch.Tensor): + f(inputs) + else: + f(*inputs) + return + torch.autograd.gradcheck(f, inputs, *args, **kwargs) + + def cpu_only() -> st.SearchStrategy[List[torch.device]]: return st.sampled_from([torch.device("cpu")]) From e93fb0c31298877576aba41861095d1d83f3994e Mon Sep 17 00:00:00 2001 From: Jianyu Huang Date: Thu, 19 Oct 2023 17:16:15 -0700 Subject: [PATCH 81/94] Fix torchscript issue (#2078) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2078 Check https://fb.workplace.com/groups/957614988799577/permalink/995501075010968/ Similar to D29056288 . Didn't know `split_embedding_weights_with_scale_bias` is used in forward path in torchscript? Reviewed By: q10, mengyingdu Differential Revision: D50474951 fbshipit-source-id: d043bcad27a381eef90ddde5a58eb890a983a475 --- .../split_table_batched_embeddings_ops_inference.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py index 0fb8188553..261ce7c74e 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py @@ -1192,9 +1192,9 @@ def split_embedding_weights_with_scale_bias( ), ] if ( - weight_ty == SparseType.INT8 - or weight_ty == SparseType.INT4 - or weight_ty == SparseType.INT2 + weight_ty.value == SparseType.INT8.value + or weight_ty.value == SparseType.INT4.value + or weight_ty.value == SparseType.INT2.value ): if split_scale_bias_mode == 1: splits.append( @@ -1220,9 +1220,9 @@ def split_embedding_weights_with_scale_bias( ) ) elif ( - weight_ty == SparseType.FP8 - or weight_ty == SparseType.FP16 - or weight_ty == SparseType.FP32 + weight_ty.value == SparseType.FP8.value + or weight_ty.value == SparseType.FP16.value + or weight_ty.value == SparseType.FP32.value ): splits.append( ( From 4a97e799879e3a02a6ba541446b99cc02954ebb7 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Thu, 19 Oct 2023 20:22:56 -0700 Subject: [PATCH 82/94] Get total D from CPU buffer in batch_index_select_dim0 (#2079) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2079 This diff shuffles the code to allow `batch_index_select_dim0` to get total D from `D_offsets` when it is still in the host memory. 
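
For reference, a minimal Python sketch (illustrative only, not part of the
patch) of the host-side ordering this change relies on; the D_offsets values
below are hypothetical.

```
import torch

# Hypothetical per-table dimension prefix sum, still resident in host memory.
D_offsets_cpu = torch.tensor([0, 16, 48, 80], dtype=torch.int32)

# Scalar bookkeeping such as the total D is read while the buffer is still a
# CPU tensor; reading it after a non_blocking copy to the GPU would force a
# synchronization and a device-to-host transfer.
total_D = int(D_offsets_cpu[-1])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
D_offsets = D_offsets_cpu.to(device, non_blocking=True)
```
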
Reviewed By: q10, bigning Differential Revision: D50477287 fbshipit-source-id: 715de7711e052baf27e417a93f9a4267697bcdbb --- fbgemm_gpu/codegen/batch_index_select_dim0_host.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fbgemm_gpu/codegen/batch_index_select_dim0_host.cpp b/fbgemm_gpu/codegen/batch_index_select_dim0_host.cpp index 2206b56914..24d82ead27 100644 --- a/fbgemm_gpu/codegen/batch_index_select_dim0_host.cpp +++ b/fbgemm_gpu/codegen/batch_index_select_dim0_host.cpp @@ -259,10 +259,6 @@ Tensor batch_index_select_dim0_gpu( // Transfer helper tensors to GPU const auto device = inputs.device(); constexpr bool non_blocking = true; - D_offsets = D_offsets.to(device, non_blocking); - input_offsets = input_offsets.to(device, non_blocking); - input_row_offsets = input_row_offsets.to(device, non_blocking); - total_L_offsets = total_L_offsets.to(device, non_blocking); int64_t output_size; Tensor output_offsets; @@ -279,6 +275,11 @@ Tensor batch_index_select_dim0_gpu( output_offsets = output_offsets.to(device, non_blocking); } + D_offsets = D_offsets.to(device, non_blocking); + input_offsets = input_offsets.to(device, non_blocking); + input_row_offsets = input_row_offsets.to(device, non_blocking); + total_L_offsets = total_L_offsets.to(device, non_blocking); + const auto sparse_type = fbgemm_gpu::getSparseType(inputs.scalar_type()); TORCH_CHECK( sparse_type == SparseType::FP32 || sparse_type == SparseType::FP16, From b1049cf76046045508fc2a4242e3cf33afc5a70d Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Mon, 23 Oct 2023 11:32:41 -0700 Subject: [PATCH 83/94] Split up lfu_cache.cu (#2083) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2083 - Split up lfu_cache.cu for easier maintenance Reviewed By: spcyppt Differential Revision: D50522858 fbshipit-source-id: caa99dc973cabdb4fd61e85e23e3de1db568d788 --- fbgemm_gpu/CMakeLists.txt | 4 +- .../src/split_embeddings_cache/common.cuh | 22 +- .../src/split_embeddings_cache/lfu_cache.cu | 749 ------------------ .../split_embeddings_cache/lfu_cache_find.cu | 183 +++++ .../lfu_cache_populate.cu | 312 ++++++++ .../lfu_cache_populate_byte.cu | 282 +++++++ 6 files changed, 801 insertions(+), 751 deletions(-) delete mode 100644 fbgemm_gpu/src/split_embeddings_cache/lfu_cache.cu create mode 100644 fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu create mode 100644 fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate.cu create mode 100644 fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cu diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 39fb050114..c193c5b733 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -675,7 +675,9 @@ if(NOT FBGEMM_CPU_ONLY) src/sparse_ops/sparse_reorder_batched_ad.cu src/sparse_ops/sparse_segment_sum_csr.cu src/sparse_ops/sparse_zipf.cu - src/split_embeddings_cache/lfu_cache.cu + src/split_embeddings_cache/lfu_cache_find.cu + src/split_embeddings_cache/lfu_cache_populate.cu + src/split_embeddings_cache/lfu_cache_populate_byte.cu src/split_embeddings_cache/lru_cache_find.cu src/split_embeddings_cache/lru_cache_populate.cu src/split_embeddings_cache/lru_cache_populate_byte.cu diff --git a/fbgemm_gpu/src/split_embeddings_cache/common.cuh b/fbgemm_gpu/src/split_embeddings_cache/common.cuh index d6e1eb6be3..7efabb8519 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/common.cuh +++ b/fbgemm_gpu/src/split_embeddings_cache/common.cuh @@ -43,7 +43,6 @@ #include "fbgemm_gpu/split_embeddings_utils.cuh" using Tensor = 
at::Tensor; -using namespace fbgemm_gpu; namespace { @@ -51,6 +50,10 @@ constexpr size_t kCacheMaxThreads = 512; constexpr int32_t kCacheLocationMissing = -1; constexpr int64_t kCacheStateInvalid = -1; +constexpr int32_t kCacheSetBits = 24; +constexpr int32_t kLFUCounterBits = 40; +static_assert(kCacheSetBits + kLFUCounterBits == 8 * sizeof(int64_t), ""); + // // TODO: do we care about 64-bit indices? Currently we just ignore. // __host__ DEVICE_INLINE uint32_t cache_slot(int32_t h_in, int32_t C) { // // MurmorHash3 32-bit mixing function. @@ -88,3 +91,20 @@ int get_max_thread_blocks_for_cache_kernels_() { } } // namespace + +namespace fbgemm_gpu { + +void lfu_update_counts_cuda( + Tensor unique_indices, + Tensor unique_indices_length, + Tensor unique_indices_count, + Tensor lfu_state); + +std::pair lfu_cache_find_uncached_cuda( + Tensor unique_indices, + Tensor unique_indices_length, + int64_t max_indices, + Tensor lxu_cache_state, + Tensor lfu_state); + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache.cu b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache.cu deleted file mode 100644 index f2080dc5d3..0000000000 --- a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache.cu +++ /dev/null @@ -1,749 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "common.cuh" - -using Tensor = at::Tensor; -using namespace fbgemm_gpu; - -namespace { - -constexpr int32_t kCacheSetBits = 24; -constexpr int32_t kLFUCounterBits = 40; -static_assert(kCacheSetBits + kLFUCounterBits == 8 * sizeof(int64_t), ""); - -template -__global__ __launch_bounds__(kMaxThreads) void lfu_update_counts_kernel( - const pta::PackedTensorAccessor32 - unique_indices, - const int32_t* __restrict__ N_unique, - const pta::PackedTensorAccessor32 - unique_indices_count, - pta::PackedTensorAccessor64 lfu_state) { - CUDA_KERNEL_LOOP(n, *N_unique) { - const auto idx = unique_indices[n]; - lfu_state[idx] += unique_indices_count[n]; - } -} - -void lfu_update_counts_cuda( - Tensor unique_indices, - Tensor unique_indices_length, - Tensor unique_indices_count, - Tensor lfu_state) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - unique_indices, unique_indices_length, unique_indices_count, lfu_state); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(unique_indices.get_device()); - - const int32_t N = unique_indices.size(0); - AT_DISPATCH_INDEX_TYPES( - unique_indices.scalar_type(), "lfu_update_counts_cuda", [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lfu_update_counts_kernel"; -#endif - lfu_update_counts_kernel<<< - std::min( - div_round_up(N, kMaxThreads), - get_max_thread_blocks_for_cache_kernels_()), - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), - unique_indices_length.data_ptr(), - MAKE_PTA_WITH_NAME(func_name, unique_indices_count, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); -} - -template -__global__ __launch_bounds__(kMaxThreads) void lfu_cache_find_uncached_kernel( - const pta::PackedTensorAccessor32 - unique_indices, - const int32_t* __restrict__ N_unique, - int64_t max_indices, - const pta::PackedTensorAccessor32 - lxu_cache_state, - uint64_t* __restrict__ cache_sets, - const pta::PackedTensorAccessor64 - lfu_state) { - const 
int32_t C = lxu_cache_state.size(0); - - for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; - n += gridDim.x * blockDim.y) { - const int64_t idx = unique_indices[n]; - if (idx == max_indices) { - // cache_sets are initialized with sentinel values in - // lfu_cache_find_uncached_cuda - continue; - } - const uint32_t cache_set = cache_slot(idx, C); - - const auto slot = threadIdx.x; - const bool found = ::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx; - -#ifdef __HIP_PLATFORM_HCC__ - if (!__any_sync(0xFFFFFFFFFFFFFFFF, found)) { -#else - if (!__any_sync(0xFFFFFFFF, found)) { -#endif - if (threadIdx.x == 0) { - // sort so the highest LFUs come first in the segment. - // assume lfu_state[idx] <= 2^40 - 1 and cache_set < 2^24 -1 - cache_sets[n] = - ((static_cast(cache_set) << kLFUCounterBits)) | - ((static_cast(1) << kLFUCounterBits) - 1 - - lfu_state[idx]); - } - } - } -} - -std::pair lfu_cache_find_uncached_cuda( - Tensor unique_indices, - Tensor unique_indices_length, - int64_t max_indices, - Tensor lxu_cache_state, - Tensor lfu_state) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - unique_indices, unique_indices_length, lxu_cache_state, lfu_state); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(unique_indices.get_device()); - - auto cache_sets = full_like( - unique_indices, - static_cast( - static_cast(lxu_cache_state.size(0)) << kLFUCounterBits), - unique_indices.options().dtype(at::kLong)); - const int32_t N = unique_indices.numel(); - auto sorted_cache_sets = empty_like(cache_sets); - auto cache_set_sorted_unique_indices = empty_like(unique_indices); - - AT_DISPATCH_INDEX_TYPES( - unique_indices.scalar_type(), "lfu_cache_find_uncached_cuda", [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lfu_cache_find_uncached_kernel"; -#endif - // Find uncached indices - lfu_cache_find_uncached_kernel<<< - std::min( - div_round_up(N, kMaxThreads / kWarpSize), - get_max_thread_blocks_for_cache_kernels_()), - dim3(kWarpSize, kMaxThreads / kWarpSize), - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), - unique_indices_length.data_ptr(), - max_indices, - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - (uint64_t*)cache_sets.data_ptr(), - MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - // Sort the cache sets and ids - size_t temp_storage_bytes = 0; - AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( - nullptr, - temp_storage_bytes, - (uint64_t*)cache_sets.data_ptr(), - (uint64_t*)sorted_cache_sets.data_ptr(), - unique_indices.data_ptr(), - cache_set_sorted_unique_indices.data_ptr(), - N, - 0, - int(log2(float(lxu_cache_state.size(0) + 1)) + 1) + kLFUCounterBits, - at::cuda::getCurrentCUDAStream(), - false)); - auto temp_storage = at::empty( - {static_cast(temp_storage_bytes)}, - unique_indices.options().dtype(at::kByte)); - AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( - temp_storage.data_ptr(), - temp_storage_bytes, - (uint64_t*)cache_sets.data_ptr(), - (uint64_t*)sorted_cache_sets.data_ptr(), - unique_indices.data_ptr(), - cache_set_sorted_unique_indices.data_ptr(), - N, - 0, - int(log2(float(lxu_cache_state.size(0) + 1)) + 1) + kLFUCounterBits, - at::cuda::getCurrentCUDAStream(), - false)); - }); - return {sorted_cache_sets, cache_set_sorted_unique_indices}; -} - -template -__global__ __launch_bounds__(kCacheMaxThreads) void lfu_cache_insert_kernel( - 
pta::PackedTensorAccessor64 weights, - const pta::PackedTensorAccessor32 - cache_hash_size_cumsum, - const pta::PackedTensorAccessor64 - cache_index_table_map, - const pta::PackedTensorAccessor32 - weights_offsets, - const pta::PackedTensorAccessor32 - D_offsets, - const uint64_t* __restrict__ sorted_cache_sets, - const pta::PackedTensorAccessor32 - cache_set_sorted_indices, - const int32_t* __restrict__ N_unique, - pta::PackedTensorAccessor32 - lxu_cache_state, - pta::PackedTensorAccessor64 - lxu_cache_weights, - const pta::PackedTensorAccessor64 - lfu_state, - bool stochastic_rounding, - at::PhiloxCudaState stochastic_rounding_philox_args) { - const int32_t C = lxu_cache_state.size(0); - for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; - n += gridDim.x * blockDim.y) { - // check if this warp is responsible for this whole segment. - const bool segment_start = - (n == 0 || - (sorted_cache_sets[n - 1] >> kLFUCounterBits) != - (sorted_cache_sets[n] >> kLFUCounterBits)); - - if (!segment_start) { - // don't have *warp* divergence since we launch full warps in blockDim.x, - // so we can just exit this warp entirely. - continue; - } - const uint32_t cache_set = (sorted_cache_sets[n] >> kLFUCounterBits); - if (cache_set == C) { - // ignore the already-existing elements - continue; - } - - int32_t SL = 1; - while (n + SL < *N_unique && - (sorted_cache_sets[n + SL] >> kLFUCounterBits) == cache_set) { - SL += 1; - } - - // now, we need to insert the (unique!) values in indices[n:n + SL] into - // our slots. - const int32_t slot = threadIdx.x; - const int64_t current_idx = lxu_cache_state[cache_set][slot]; - const int64_t current_lfu_cost = - (current_idx != static_cast(kCacheStateInvalid)) - ? lfu_state[current_idx] - : -1; - int64_t costs[1] = {current_lfu_cost}; - int32_t slots[1] = {slot}; - - BitonicSort>::sort(costs, slots); - const int32_t sorted_slot = slots[0]; - const int64_t sorted_lfu_cost = costs[0]; - - for (int32_t l = 0; l < min(SL, kWarpSize); ++l) { - const int32_t insert_slot = shfl_sync(sorted_slot, l); - const int64_t insert_current_lfu_cost = shfl_sync(sorted_lfu_cost, l); - const int64_t insert_idx = cache_set_sorted_indices[n + l]; - const int64_t insert_lfu_cost = lfu_state[insert_idx]; - - if (insert_current_lfu_cost > insert_lfu_cost) { - // don't insert. - // all subsequent `current_lfu_cost` values are greater, and all - // subsequent `insert_lfu_cost` values are smaller, so we can exit - // early here. - break; - } - const int32_t t_insert = cache_index_table_map[insert_idx]; - const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; - const int64_t weights_offset_insert = weights_offsets[t_insert]; - const int32_t D_start_insert = D_offsets[t_insert]; - const int32_t D_end_insert = D_offsets[t_insert + 1]; - const int32_t D_insert = D_end_insert - D_start_insert; - - // not empty - if (insert_current_lfu_cost != -1) { - // ensure that threadIdx.x is the only thread reading/writing to - // lxu_cache_state - int64_t current_idx = - threadIdx.x == 0 ? 
lxu_cache_state[cache_set][insert_slot] : 0; - current_idx = shfl_sync(current_idx, 0); - const int32_t t_current = cache_index_table_map[current_idx]; - const int64_t idx_current = - current_idx - cache_hash_size_cumsum[t_current]; - const int64_t weights_offset_current = weights_offsets[t_current]; - const int32_t D_start_current = D_offsets[t_current]; - const int32_t D_end_current = D_offsets[t_current + 1]; - const int32_t D_current = D_end_current - D_start_current; - - int32_t D_emb = D_current; - if constexpr (std::is_same_v) { - D_emb += kINT8QparamsBytes; - } - auto weight_row = WeightRow( - &weights[weights_offset_current + idx_current * D_emb + 0], - &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0], - D_current, - nullptr); - - weight_row.set_stochastic_rounding( - stochastic_rounding, - stochastic_rounding_philox_args, - (blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + - threadIdx.x) * - kWarpSize + - l); - - weight_row.warp_evict(D_current, blockDim.x, threadIdx.x); - } - - // insert into cache - int32_t D_emb = D_insert; - if constexpr (std::is_same_v) { - D_emb += kINT8QparamsBytes; - } - - auto weight_row_cache = WeightRow( - &weights[weights_offset_insert + idx_insert * D_emb + 0], - &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0], - D_insert, - nullptr); - - auto weight_row_emb = WeightRow( - &weights[weights_offset_insert + idx_insert * D_emb + 0], - nullptr, - D_insert, - nullptr); - - weight_row_emb.warp_copy_to( - weight_row_cache, D_insert, blockDim.x, threadIdx.x); - - if (threadIdx.x == 0) { - lxu_cache_state[cache_set][insert_slot] = insert_idx; - } - } - } -} - -void lfu_cache_insert_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor D_offsets, - Tensor sorted_cache_sets, - Tensor cache_set_sorted_unique_indices, - Tensor unique_indices_length, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - Tensor lfu_state, - bool stochastic_rounding) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - D_offsets, - sorted_cache_sets, - cache_set_sorted_unique_indices, - unique_indices_length, - lxu_cache_state, - lxu_cache_weights, - lfu_state); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - const int32_t N = cache_set_sorted_unique_indices.numel(); - - DISPATCH_EMB_CACHE_TYPES( - weights.scalar_type(), - lxu_cache_weights.scalar_type(), - "lfu_cache_insert_kernel_2", - ([&] { - at::PhiloxCudaState rng_engine_inputs; - if (stochastic_rounding && !std::is_same::value) { - auto gen = at::cuda::detail::getDefaultCUDAGenerator(); - std::lock_guard lock(gen.mutex()); - rng_engine_inputs = at::check_generator(gen) - ->philox_cuda_state(4); - } - -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lfu_cache_insert_kernel"; -#endif - - lfu_cache_insert_kernel - <<>>( - MAKE_PTA_WITH_NAME(func_name, weights, emb_t, 1, 64), - MAKE_PTA_WITH_NAME( - func_name, cache_hash_size_cumsum, int64_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, cache_index_table_map, int32_t, 1, 64), - MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), - (uint64_t*)sorted_cache_sets.data_ptr(), - MAKE_PTA_WITH_NAME( - func_name, cache_set_sorted_unique_indices, int64_t, 1, 32), - unique_indices_length.data_ptr(), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - 
MAKE_PTA_WITH_NAME( - func_name, lxu_cache_weights, cache_t, 2, 64), - MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64), - stochastic_rounding, - rng_engine_inputs); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - })); -} - -} // namespace - -DLL_PUBLIC void lfu_cache_populate_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - int64_t total_cache_hash_size, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor D_offsets, - Tensor linear_cache_indices, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - Tensor lfu_state, - bool stochastic_rounding) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - D_offsets, - linear_cache_indices, - lxu_cache_state, - lxu_cache_weights, - lfu_state); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - TORCH_CHECK( - linear_cache_indices.numel() < std::numeric_limits::max()); - if (linear_cache_indices.numel() == 0) { - // nothing to do - return; - } - - // get unqiue indices - Tensor unique_indices; - Tensor unique_indices_length; - c10::optional unique_indices_count; - std::tie(unique_indices, unique_indices_length, unique_indices_count) = - get_unique_indices_cuda( - linear_cache_indices, total_cache_hash_size, true); - - // update lfu counts - lfu_update_counts_cuda( - unique_indices, unique_indices_length, *unique_indices_count, lfu_state); - - // find uncached indices - auto cache_sets_and_unique_indices = lfu_cache_find_uncached_cuda( - unique_indices, - unique_indices_length, - total_cache_hash_size, - lxu_cache_state, - lfu_state); - const auto sorted_cache_sets = cache_sets_and_unique_indices.first; - const auto cache_set_sorted_unique_indices = - cache_sets_and_unique_indices.second; - - // insert caching weights - lfu_cache_insert_cuda( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - D_offsets, - sorted_cache_sets, - cache_set_sorted_unique_indices, - unique_indices_length, - lxu_cache_state, - lxu_cache_weights, - lfu_state, - stochastic_rounding); -} - -namespace { - -// In `lfu_cache_insert_kernel`, we use `emb_t` and `cache_t` for the -// high-precision cache implementation, where we can have {FP32, FP16, INT8} -// for embedding precision (data types), and {FP32, FP16} for cache precision -// (data types). -// -// In `lfu_cache_insert_byte_kernel`, we only use uint8_t for the both embedding -// and cache data type (conforming to the inference TBE kernel logics). -// - We pass in `weights_tys` to denote the real data types for the embeddings: -// {FP32, FP16, INT8, INT4, INT2}. For example, FP32 is 4 byte element in the -// byte tensor, and INT4 is half byte element in the byte tensor. -// - We only assume that the embedding and cache have the same precisions (the -// real "precision" is determined by `weights_tys` although the data types are -// uint8_t only). Basically no "high-precision cache" support for now. -// - The insert/evict of embedding row from the cache are done in a byte-by-byte -// manner. 
-template -__global__ -__launch_bounds__(kCacheMaxThreads) void lfu_cache_insert_byte_kernel( - pta::PackedTensorAccessor64 weights, - const pta::PackedTensorAccessor32 - cache_hash_size_cumsum, - const pta::PackedTensorAccessor64 - cache_index_table_map, - const pta::PackedTensorAccessor32 - weights_offsets, - const pta::PackedTensorAccessor32 - weights_tys, - const pta::PackedTensorAccessor32 - D_offsets, - const uint64_t* __restrict__ sorted_cache_sets, - const pta::PackedTensorAccessor32 - cache_set_sorted_indices, - const int32_t* __restrict__ N_unique, - pta::PackedTensorAccessor32 - lxu_cache_state, - pta::PackedTensorAccessor64 - lxu_cache_weights, - const pta::PackedTensorAccessor64 - lfu_state, - const int64_t row_alignment) { - const int32_t C = lxu_cache_state.size(0); - for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; - n += gridDim.x * blockDim.y) { - // check if this warp is responsible for this whole segment. - const bool segment_start = - (n == 0 || - (sorted_cache_sets[n - 1] >> kLFUCounterBits) != - (sorted_cache_sets[n] >> kLFUCounterBits)); - - if (!segment_start) { - // don't have *warp* divergence since we launch full warps in blockDim.x, - // so we can just exit this warp entirely. - continue; - } - const uint32_t cache_set = (sorted_cache_sets[n] >> kLFUCounterBits); - if (cache_set == C) { - // ignore the already-existing elements - continue; - } - - int32_t SL = 1; - while (n + SL < *N_unique && - (sorted_cache_sets[n + SL] >> kLFUCounterBits) == cache_set) { - SL += 1; - } - - // now, we need to insert the (unique!) values in indices[n:n + SL] into - // our slots. - const int32_t slot = threadIdx.x; - const int64_t current_idx = lxu_cache_state[cache_set][slot]; - const int64_t current_lfu_cost = - (current_idx != static_cast(kCacheStateInvalid)) - ? lfu_state[current_idx] - : -1; - int64_t costs[1] = {current_lfu_cost}; - int32_t slots[1] = {slot}; - - BitonicSort>::sort(costs, slots); - const int32_t sorted_slot = slots[0]; - const int64_t sorted_lfu_cost = costs[0]; - - for (int32_t l = 0; l < min(SL, kWarpSize); ++l) { - const int32_t insert_slot = shfl_sync(sorted_slot, l); - const int64_t insert_current_lfu_cost = shfl_sync(sorted_lfu_cost, l); - const index_t insert_idx = cache_set_sorted_indices[n + l]; - const int64_t insert_lfu_cost = lfu_state[insert_idx]; - - if (insert_current_lfu_cost > insert_lfu_cost) { - // don't insert. - // all subsequent `current_lfu_cost` values are greater, and all - // subsequent `insert_lfu_cost` values are smaller, so we can exit - // early here. - break; - } - const int32_t t_insert = cache_index_table_map[insert_idx]; - const SparseType weight_ty_insert = - static_cast(weights_tys[t_insert]); - const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; - const int64_t weights_offset_insert = weights_offsets[t_insert]; - const int32_t D_start_insert = D_offsets[t_insert]; - const int32_t D_end_insert = D_offsets[t_insert + 1]; - const int32_t D_insert = D_end_insert - D_start_insert; - - const int32_t D_insert_bytes = nbit::padded_row_size_in_bytes( - D_insert, weight_ty_insert, row_alignment); - - // insert into cache. Note that nbit::padded_row_size_in_bytes pad each - // row with row_alignment (16 bytes on GPUs) So each row will be multiple - // of 16 bytes (uint4 = 32bit x 4 = 16 bytes). 
- auto row = reinterpret_cast( - &weights[weights_offset_insert + idx_insert * D_insert_bytes + 0]); - auto cache_row = reinterpret_cast( - &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0]); - for (int32_t d = threadIdx.x; d * sizeof(uint4) < D_insert_bytes; - d += blockDim.x) { - cache_row[d] = row[d]; - } - if (threadIdx.x == 0) { - lxu_cache_state[cache_set][insert_slot] = insert_idx; - } - } - } -} - -void lfu_cache_insert_byte_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor weights_tys, - Tensor D_offsets, - Tensor sorted_cache_sets, - Tensor cache_set_sorted_unique_indices, - Tensor unique_indices_length, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - Tensor lfu_state, - int64_t row_alignment) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - weights_tys, - D_offsets, - sorted_cache_sets, - cache_set_sorted_unique_indices, - unique_indices_length, - lxu_cache_state, - lxu_cache_weights, - lfu_state); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - const int32_t N = cache_set_sorted_unique_indices.numel(); - - AT_DISPATCH_INDEX_TYPES( - cache_set_sorted_unique_indices.scalar_type(), - "lfu_cache_insert_byte_cuda", - [&] { -#ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "lfu_cache_insert_byte_kernel"; -#endif - lfu_cache_insert_byte_kernel<<< - std::min( - div_round_up(N, kCacheMaxThreads / kWarpSize), - get_max_thread_blocks_for_cache_kernels_()), - dim3(kWarpSize, kCacheMaxThreads / kWarpSize), - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME(func_name, weights, uint8_t, 1, 64), - MAKE_PTA_WITH_NAME( - func_name, cache_hash_size_cumsum, int64_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, cache_index_table_map, int32_t, 1, 64), - MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, weights_tys, uint8_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), - (uint64_t*)sorted_cache_sets.data_ptr(), - MAKE_PTA_WITH_NAME( - func_name, cache_set_sorted_unique_indices, index_t, 1, 32), - unique_indices_length.data_ptr(), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), - MAKE_PTA_WITH_NAME(func_name, lxu_cache_weights, uint8_t, 2, 64), - MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64), - row_alignment); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); -} - -} // namespace - -DLL_PUBLIC void lfu_cache_populate_byte_cuda( - Tensor weights, - Tensor cache_hash_size_cumsum, - int64_t total_cache_hash_size, - Tensor cache_index_table_map, - Tensor weights_offsets, - Tensor weights_tys, - Tensor D_offsets, - Tensor linear_cache_indices, - Tensor lxu_cache_state, - Tensor lxu_cache_weights, - Tensor lfu_state, - int64_t row_alignment) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - weights_tys, - D_offsets, - linear_cache_indices, - lxu_cache_state, - lxu_cache_weights, - lfu_state); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(weights.get_device()); - - TORCH_CHECK( - linear_cache_indices.numel() < std::numeric_limits::max()); - if (linear_cache_indices.numel() == 0) { - // nothing to do - return; - } - - // get unqiue indices - Tensor unique_indices; - Tensor unique_indices_length; - c10::optional unique_indices_count; - std::tie(unique_indices, unique_indices_length, 
unique_indices_count) = - get_unique_indices_cuda( - linear_cache_indices, total_cache_hash_size, true); - - // update lfu counts - lfu_update_counts_cuda( - unique_indices, unique_indices_length, *unique_indices_count, lfu_state); - - // find uncached indices - const auto cache_sets_and_unique_indices = lfu_cache_find_uncached_cuda( - unique_indices, - unique_indices_length, - total_cache_hash_size, - lxu_cache_state, - lfu_state); - const auto sorted_cache_sets = cache_sets_and_unique_indices.first; - const auto cache_set_sorted_unique_indices = - cache_sets_and_unique_indices.second; - - // insert caching weights - lfu_cache_insert_byte_cuda( - weights, - cache_hash_size_cumsum, - cache_index_table_map, - weights_offsets, - weights_tys, - D_offsets, - sorted_cache_sets, - cache_set_sorted_unique_indices, - unique_indices_length, - lxu_cache_state, - lxu_cache_weights, - lfu_state, - row_alignment); -} diff --git a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu new file mode 100644 index 0000000000..3c154d7c0d --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu @@ -0,0 +1,183 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "common.cuh" + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +namespace { + +template +__global__ __launch_bounds__(kMaxThreads) void lfu_update_counts_kernel( + const pta::PackedTensorAccessor32 + unique_indices, + const int32_t* __restrict__ N_unique, + const pta::PackedTensorAccessor32 + unique_indices_count, + pta::PackedTensorAccessor64 lfu_state) { + CUDA_KERNEL_LOOP(n, *N_unique) { + const auto idx = unique_indices[n]; + lfu_state[idx] += unique_indices_count[n]; + } +} + +template +__global__ __launch_bounds__(kMaxThreads) void lfu_cache_find_uncached_kernel( + const pta::PackedTensorAccessor32 + unique_indices, + const int32_t* __restrict__ N_unique, + int64_t max_indices, + const pta::PackedTensorAccessor32 + lxu_cache_state, + uint64_t* __restrict__ cache_sets, + const pta::PackedTensorAccessor64 + lfu_state) { + const int32_t C = lxu_cache_state.size(0); + + for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; + n += gridDim.x * blockDim.y) { + const int64_t idx = unique_indices[n]; + if (idx == max_indices) { + // cache_sets are initialized with sentinel values in + // lfu_cache_find_uncached_cuda + continue; + } + const uint32_t cache_set = cache_slot(idx, C); + + const auto slot = threadIdx.x; + const bool found = ::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx; + +#ifdef __HIP_PLATFORM_HCC__ + if (!__any_sync(0xFFFFFFFFFFFFFFFF, found)) { +#else + if (!__any_sync(0xFFFFFFFF, found)) { +#endif + if (threadIdx.x == 0) { + // sort so the highest LFUs come first in the segment. 
+ // assume lfu_state[idx] <= 2^40 - 1 and cache_set < 2^24 -1 + cache_sets[n] = + ((static_cast(cache_set) << kLFUCounterBits)) | + ((static_cast(1) << kLFUCounterBits) - 1 - + lfu_state[idx]); + } + } + } +} + +} // namespace + +namespace fbgemm_gpu { + +void lfu_update_counts_cuda( + Tensor unique_indices, + Tensor unique_indices_length, + Tensor unique_indices_count, + Tensor lfu_state) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + unique_indices, unique_indices_length, unique_indices_count, lfu_state); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(unique_indices.get_device()); + + const int32_t N = unique_indices.size(0); + AT_DISPATCH_INDEX_TYPES( + unique_indices.scalar_type(), "lfu_update_counts_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lfu_update_counts_kernel"; +#endif + lfu_update_counts_kernel<<< + std::min( + div_round_up(N, kMaxThreads), + get_max_thread_blocks_for_cache_kernels_()), + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), + unique_indices_length.data_ptr(), + MAKE_PTA_WITH_NAME(func_name, unique_indices_count, int32_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +} + +std::pair lfu_cache_find_uncached_cuda( + Tensor unique_indices, + Tensor unique_indices_length, + int64_t max_indices, + Tensor lxu_cache_state, + Tensor lfu_state) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + unique_indices, unique_indices_length, lxu_cache_state, lfu_state); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(unique_indices.get_device()); + + auto cache_sets = full_like( + unique_indices, + static_cast( + static_cast(lxu_cache_state.size(0)) << kLFUCounterBits), + unique_indices.options().dtype(at::kLong)); + const int32_t N = unique_indices.numel(); + auto sorted_cache_sets = empty_like(cache_sets); + auto cache_set_sorted_unique_indices = empty_like(unique_indices); + + AT_DISPATCH_INDEX_TYPES( + unique_indices.scalar_type(), "lfu_cache_find_uncached_cuda", [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lfu_cache_find_uncached_kernel"; +#endif + // Find uncached indices + lfu_cache_find_uncached_kernel<<< + std::min( + div_round_up(N, kMaxThreads / kWarpSize), + get_max_thread_blocks_for_cache_kernels_()), + dim3(kWarpSize, kMaxThreads / kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, unique_indices, index_t, 1, 32), + unique_indices_length.data_ptr(), + max_indices, + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + (uint64_t*)cache_sets.data_ptr(), + MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + // Sort the cache sets and ids + size_t temp_storage_bytes = 0; + AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( + nullptr, + temp_storage_bytes, + (uint64_t*)cache_sets.data_ptr(), + (uint64_t*)sorted_cache_sets.data_ptr(), + unique_indices.data_ptr(), + cache_set_sorted_unique_indices.data_ptr(), + N, + 0, + int(log2(float(lxu_cache_state.size(0) + 1)) + 1) + kLFUCounterBits, + at::cuda::getCurrentCUDAStream(), + false)); + auto temp_storage = at::empty( + {static_cast(temp_storage_bytes)}, + unique_indices.options().dtype(at::kByte)); + AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( + temp_storage.data_ptr(), + temp_storage_bytes, + (uint64_t*)cache_sets.data_ptr(), + 
(uint64_t*)sorted_cache_sets.data_ptr(), + unique_indices.data_ptr(), + cache_set_sorted_unique_indices.data_ptr(), + N, + 0, + int(log2(float(lxu_cache_state.size(0) + 1)) + 1) + kLFUCounterBits, + at::cuda::getCurrentCUDAStream(), + false)); + }); + return {sorted_cache_sets, cache_set_sorted_unique_indices}; +} + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate.cu b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate.cu new file mode 100644 index 0000000000..76cbe6d440 --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate.cu @@ -0,0 +1,312 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "common.cuh" + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +namespace { + +template +__global__ __launch_bounds__(kCacheMaxThreads) void lfu_cache_insert_kernel( + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 + cache_hash_size_cumsum, + const pta::PackedTensorAccessor64 + cache_index_table_map, + const pta::PackedTensorAccessor32 + weights_offsets, + const pta::PackedTensorAccessor32 + D_offsets, + const uint64_t* __restrict__ sorted_cache_sets, + const pta::PackedTensorAccessor32 + cache_set_sorted_indices, + const int32_t* __restrict__ N_unique, + pta::PackedTensorAccessor32 + lxu_cache_state, + pta::PackedTensorAccessor64 + lxu_cache_weights, + const pta::PackedTensorAccessor64 + lfu_state, + bool stochastic_rounding, + at::PhiloxCudaState stochastic_rounding_philox_args) { + const int32_t C = lxu_cache_state.size(0); + for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; + n += gridDim.x * blockDim.y) { + // check if this warp is responsible for this whole segment. + const bool segment_start = + (n == 0 || + (sorted_cache_sets[n - 1] >> kLFUCounterBits) != + (sorted_cache_sets[n] >> kLFUCounterBits)); + + if (!segment_start) { + // don't have *warp* divergence since we launch full warps in blockDim.x, + // so we can just exit this warp entirely. + continue; + } + const uint32_t cache_set = (sorted_cache_sets[n] >> kLFUCounterBits); + if (cache_set == C) { + // ignore the already-existing elements + continue; + } + + int32_t SL = 1; + while (n + SL < *N_unique && + (sorted_cache_sets[n + SL] >> kLFUCounterBits) == cache_set) { + SL += 1; + } + + // now, we need to insert the (unique!) values in indices[n:n + SL] into + // our slots. + const int32_t slot = threadIdx.x; + const int64_t current_idx = lxu_cache_state[cache_set][slot]; + const int64_t current_lfu_cost = + (current_idx != static_cast(kCacheStateInvalid)) + ? lfu_state[current_idx] + : -1; + int64_t costs[1] = {current_lfu_cost}; + int32_t slots[1] = {slot}; + + BitonicSort>::sort(costs, slots); + const int32_t sorted_slot = slots[0]; + const int64_t sorted_lfu_cost = costs[0]; + + for (int32_t l = 0; l < min(SL, kWarpSize); ++l) { + const int32_t insert_slot = shfl_sync(sorted_slot, l); + const int64_t insert_current_lfu_cost = shfl_sync(sorted_lfu_cost, l); + const int64_t insert_idx = cache_set_sorted_indices[n + l]; + const int64_t insert_lfu_cost = lfu_state[insert_idx]; + + if (insert_current_lfu_cost > insert_lfu_cost) { + // don't insert. + // all subsequent `current_lfu_cost` values are greater, and all + // subsequent `insert_lfu_cost` values are smaller, so we can exit + // early here. 
+ break; + } + const int32_t t_insert = cache_index_table_map[insert_idx]; + const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; + const int64_t weights_offset_insert = weights_offsets[t_insert]; + const int32_t D_start_insert = D_offsets[t_insert]; + const int32_t D_end_insert = D_offsets[t_insert + 1]; + const int32_t D_insert = D_end_insert - D_start_insert; + + // not empty + if (insert_current_lfu_cost != -1) { + // ensure that threadIdx.x is the only thread reading/writing to + // lxu_cache_state + int64_t current_idx = + threadIdx.x == 0 ? lxu_cache_state[cache_set][insert_slot] : 0; + current_idx = shfl_sync(current_idx, 0); + const int32_t t_current = cache_index_table_map[current_idx]; + const int64_t idx_current = + current_idx - cache_hash_size_cumsum[t_current]; + const int64_t weights_offset_current = weights_offsets[t_current]; + const int32_t D_start_current = D_offsets[t_current]; + const int32_t D_end_current = D_offsets[t_current + 1]; + const int32_t D_current = D_end_current - D_start_current; + + int32_t D_emb = D_current; + if constexpr (std::is_same_v) { + D_emb += kINT8QparamsBytes; + } + auto weight_row = WeightRow( + &weights[weights_offset_current + idx_current * D_emb + 0], + &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0], + D_current, + nullptr); + + weight_row.set_stochastic_rounding( + stochastic_rounding, + stochastic_rounding_philox_args, + (blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + + threadIdx.x) * + kWarpSize + + l); + + weight_row.warp_evict(D_current, blockDim.x, threadIdx.x); + } + + // insert into cache + int32_t D_emb = D_insert; + if constexpr (std::is_same_v) { + D_emb += kINT8QparamsBytes; + } + + auto weight_row_cache = WeightRow( + &weights[weights_offset_insert + idx_insert * D_emb + 0], + &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0], + D_insert, + nullptr); + + auto weight_row_emb = WeightRow( + &weights[weights_offset_insert + idx_insert * D_emb + 0], + nullptr, + D_insert, + nullptr); + + weight_row_emb.warp_copy_to( + weight_row_cache, D_insert, blockDim.x, threadIdx.x); + + if (threadIdx.x == 0) { + lxu_cache_state[cache_set][insert_slot] = insert_idx; + } + } + } +} + +void lfu_cache_insert_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor D_offsets, + Tensor sorted_cache_sets, + Tensor cache_set_sorted_unique_indices, + Tensor unique_indices_length, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + Tensor lfu_state, + bool stochastic_rounding) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + D_offsets, + sorted_cache_sets, + cache_set_sorted_unique_indices, + unique_indices_length, + lxu_cache_state, + lxu_cache_weights, + lfu_state); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + const int32_t N = cache_set_sorted_unique_indices.numel(); + + DISPATCH_EMB_CACHE_TYPES( + weights.scalar_type(), + lxu_cache_weights.scalar_type(), + "lfu_cache_insert_kernel_2", + ([&] { + at::PhiloxCudaState rng_engine_inputs; + if (stochastic_rounding && !std::is_same::value) { + auto gen = at::cuda::detail::getDefaultCUDAGenerator(); + std::lock_guard lock(gen.mutex()); + rng_engine_inputs = at::check_generator(gen) + ->philox_cuda_state(4); + } + +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lfu_cache_insert_kernel"; +#endif + + lfu_cache_insert_kernel + <<>>( + 
MAKE_PTA_WITH_NAME(func_name, weights, emb_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), + (uint64_t*)sorted_cache_sets.data_ptr(), + MAKE_PTA_WITH_NAME( + func_name, cache_set_sorted_unique_indices, int64_t, 1, 32), + unique_indices_length.data_ptr(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME( + func_name, lxu_cache_weights, cache_t, 2, 64), + MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64), + stochastic_rounding, + rng_engine_inputs); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + })); +} + +} // namespace + +DLL_PUBLIC void lfu_cache_populate_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor D_offsets, + Tensor linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + Tensor lfu_state, + bool stochastic_rounding) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + D_offsets, + linear_cache_indices, + lxu_cache_state, + lxu_cache_weights, + lfu_state); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + TORCH_CHECK( + linear_cache_indices.numel() < std::numeric_limits::max()); + if (linear_cache_indices.numel() == 0) { + // nothing to do + return; + } + + // get unqiue indices + Tensor unique_indices; + Tensor unique_indices_length; + c10::optional unique_indices_count; + std::tie(unique_indices, unique_indices_length, unique_indices_count) = + get_unique_indices_cuda( + linear_cache_indices, total_cache_hash_size, true); + + // update lfu counts + lfu_update_counts_cuda( + unique_indices, unique_indices_length, *unique_indices_count, lfu_state); + + // find uncached indices + auto cache_sets_and_unique_indices = lfu_cache_find_uncached_cuda( + unique_indices, + unique_indices_length, + total_cache_hash_size, + lxu_cache_state, + lfu_state); + const auto sorted_cache_sets = cache_sets_and_unique_indices.first; + const auto cache_set_sorted_unique_indices = + cache_sets_and_unique_indices.second; + + // insert caching weights + lfu_cache_insert_cuda( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + D_offsets, + sorted_cache_sets, + cache_set_sorted_unique_indices, + unique_indices_length, + lxu_cache_state, + lxu_cache_weights, + lfu_state, + stochastic_rounding); +} diff --git a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cu b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cu new file mode 100644 index 0000000000..b3906d844c --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cu @@ -0,0 +1,282 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "common.cuh" + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +namespace { + +// In `lfu_cache_insert_kernel`, we use `emb_t` and `cache_t` for the +// high-precision cache implementation, where we can have {FP32, FP16, INT8} +// for embedding precision (data types), and {FP32, FP16} for cache precision +// (data types). 
+// +// In `lfu_cache_insert_byte_kernel`, we only use uint8_t for the both embedding +// and cache data type (conforming to the inference TBE kernel logics). +// - We pass in `weights_tys` to denote the real data types for the embeddings: +// {FP32, FP16, INT8, INT4, INT2}. For example, FP32 is 4 byte element in the +// byte tensor, and INT4 is half byte element in the byte tensor. +// - We only assume that the embedding and cache have the same precisions (the +// real "precision" is determined by `weights_tys` although the data types are +// uint8_t only). Basically no "high-precision cache" support for now. +// - The insert/evict of embedding row from the cache are done in a byte-by-byte +// manner. +template +__global__ +__launch_bounds__(kCacheMaxThreads) void lfu_cache_insert_byte_kernel( + pta::PackedTensorAccessor64 weights, + const pta::PackedTensorAccessor32 + cache_hash_size_cumsum, + const pta::PackedTensorAccessor64 + cache_index_table_map, + const pta::PackedTensorAccessor32 + weights_offsets, + const pta::PackedTensorAccessor32 + weights_tys, + const pta::PackedTensorAccessor32 + D_offsets, + const uint64_t* __restrict__ sorted_cache_sets, + const pta::PackedTensorAccessor32 + cache_set_sorted_indices, + const int32_t* __restrict__ N_unique, + pta::PackedTensorAccessor32 + lxu_cache_state, + pta::PackedTensorAccessor64 + lxu_cache_weights, + const pta::PackedTensorAccessor64 + lfu_state, + const int64_t row_alignment) { + const int32_t C = lxu_cache_state.size(0); + for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; + n += gridDim.x * blockDim.y) { + // check if this warp is responsible for this whole segment. + const bool segment_start = + (n == 0 || + (sorted_cache_sets[n - 1] >> kLFUCounterBits) != + (sorted_cache_sets[n] >> kLFUCounterBits)); + + if (!segment_start) { + // don't have *warp* divergence since we launch full warps in blockDim.x, + // so we can just exit this warp entirely. + continue; + } + const uint32_t cache_set = (sorted_cache_sets[n] >> kLFUCounterBits); + if (cache_set == C) { + // ignore the already-existing elements + continue; + } + + int32_t SL = 1; + while (n + SL < *N_unique && + (sorted_cache_sets[n + SL] >> kLFUCounterBits) == cache_set) { + SL += 1; + } + + // now, we need to insert the (unique!) values in indices[n:n + SL] into + // our slots. + const int32_t slot = threadIdx.x; + const int64_t current_idx = lxu_cache_state[cache_set][slot]; + const int64_t current_lfu_cost = + (current_idx != static_cast(kCacheStateInvalid)) + ? lfu_state[current_idx] + : -1; + int64_t costs[1] = {current_lfu_cost}; + int32_t slots[1] = {slot}; + + BitonicSort>::sort(costs, slots); + const int32_t sorted_slot = slots[0]; + const int64_t sorted_lfu_cost = costs[0]; + + for (int32_t l = 0; l < min(SL, kWarpSize); ++l) { + const int32_t insert_slot = shfl_sync(sorted_slot, l); + const int64_t insert_current_lfu_cost = shfl_sync(sorted_lfu_cost, l); + const index_t insert_idx = cache_set_sorted_indices[n + l]; + const int64_t insert_lfu_cost = lfu_state[insert_idx]; + + if (insert_current_lfu_cost > insert_lfu_cost) { + // don't insert. + // all subsequent `current_lfu_cost` values are greater, and all + // subsequent `insert_lfu_cost` values are smaller, so we can exit + // early here. 
+ break; + } + const int32_t t_insert = cache_index_table_map[insert_idx]; + const SparseType weight_ty_insert = + static_cast(weights_tys[t_insert]); + const int64_t idx_insert = insert_idx - cache_hash_size_cumsum[t_insert]; + const int64_t weights_offset_insert = weights_offsets[t_insert]; + const int32_t D_start_insert = D_offsets[t_insert]; + const int32_t D_end_insert = D_offsets[t_insert + 1]; + const int32_t D_insert = D_end_insert - D_start_insert; + + const int32_t D_insert_bytes = nbit::padded_row_size_in_bytes( + D_insert, weight_ty_insert, row_alignment); + + // insert into cache. Note that nbit::padded_row_size_in_bytes pad each + // row with row_alignment (16 bytes on GPUs) So each row will be multiple + // of 16 bytes (uint4 = 32bit x 4 = 16 bytes). + auto row = reinterpret_cast( + &weights[weights_offset_insert + idx_insert * D_insert_bytes + 0]); + auto cache_row = reinterpret_cast( + &lxu_cache_weights[cache_set * kWarpSize + insert_slot][0]); + for (int32_t d = threadIdx.x; d * sizeof(uint4) < D_insert_bytes; + d += blockDim.x) { + cache_row[d] = row[d]; + } + if (threadIdx.x == 0) { + lxu_cache_state[cache_set][insert_slot] = insert_idx; + } + } + } +} + +void lfu_cache_insert_byte_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor sorted_cache_sets, + Tensor cache_set_sorted_unique_indices, + Tensor unique_indices_length, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + Tensor lfu_state, + int64_t row_alignment) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + weights_tys, + D_offsets, + sorted_cache_sets, + cache_set_sorted_unique_indices, + unique_indices_length, + lxu_cache_state, + lxu_cache_weights, + lfu_state); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + const int32_t N = cache_set_sorted_unique_indices.numel(); + + AT_DISPATCH_INDEX_TYPES( + cache_set_sorted_unique_indices.scalar_type(), + "lfu_cache_insert_byte_cuda", + [&] { +#ifdef FBGEMM_GPU_MEMCHECK + const char* func_name = "lfu_cache_insert_byte_kernel"; +#endif + lfu_cache_insert_byte_kernel<<< + std::min( + div_round_up(N, kCacheMaxThreads / kWarpSize), + get_max_thread_blocks_for_cache_kernels_()), + dim3(kWarpSize, kCacheMaxThreads / kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME(func_name, weights, uint8_t, 1, 64), + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, cache_index_table_map, int32_t, 1, 64), + MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, weights_tys, uint8_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32), + (uint64_t*)sorted_cache_sets.data_ptr(), + MAKE_PTA_WITH_NAME( + func_name, cache_set_sorted_unique_indices, index_t, 1, 32), + unique_indices_length.data_ptr(), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32), + MAKE_PTA_WITH_NAME(func_name, lxu_cache_weights, uint8_t, 2, 64), + MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64), + row_alignment); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +} + +} // namespace + +DLL_PUBLIC void lfu_cache_populate_byte_cuda( + Tensor weights, + Tensor cache_hash_size_cumsum, + int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor 
linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + Tensor lfu_state, + int64_t row_alignment) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + weights_tys, + D_offsets, + linear_cache_indices, + lxu_cache_state, + lxu_cache_weights, + lfu_state); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(weights.get_device()); + + TORCH_CHECK( + linear_cache_indices.numel() < std::numeric_limits::max()); + if (linear_cache_indices.numel() == 0) { + // nothing to do + return; + } + + // get unqiue indices + Tensor unique_indices; + Tensor unique_indices_length; + c10::optional unique_indices_count; + std::tie(unique_indices, unique_indices_length, unique_indices_count) = + get_unique_indices_cuda( + linear_cache_indices, total_cache_hash_size, true); + + // update lfu counts + lfu_update_counts_cuda( + unique_indices, unique_indices_length, *unique_indices_count, lfu_state); + + // find uncached indices + const auto cache_sets_and_unique_indices = lfu_cache_find_uncached_cuda( + unique_indices, + unique_indices_length, + total_cache_hash_size, + lxu_cache_state, + lfu_state); + const auto sorted_cache_sets = cache_sets_and_unique_indices.first; + const auto cache_set_sorted_unique_indices = + cache_sets_and_unique_indices.second; + + // insert caching weights + lfu_cache_insert_byte_cuda( + weights, + cache_hash_size_cumsum, + cache_index_table_map, + weights_offsets, + weights_tys, + D_offsets, + sorted_cache_sets, + cache_set_sorted_unique_indices, + unique_indices_length, + lxu_cache_state, + lxu_cache_weights, + lfu_state, + row_alignment); +} From f94254d24f8dc733a0a8c233e5ca368f0be04989 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 23 Oct 2023 23:45:09 -0700 Subject: [PATCH 84/94] Add scaffolding for Python impl_abstract in fbgemm, implement fbgemm.permute_1D_sparse_data (#2084) Summary: This also fixes a minor bug in GPU permute_1D_sparse_data where we need to clone the zero-size tensors to correctly setup (lack of) aliasing. Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2084 Reviewed By: sryap Differential Revision: D50563192 fbshipit-source-id: 1dc31580c54d8a0dfd3aadaf9b440636fd1a8550 --- fbgemm_gpu/fbgemm_gpu/__init__.py | 3 +- fbgemm_gpu/fbgemm_gpu/sparse_operators.py | 47 +++++++++++++++++++ fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 4 ++ .../src/sparse_ops/sparse_permute_2d.cu | 7 +-- fbgemm_gpu/test/failures_dict.json | 24 ---------- fbgemm_gpu/test/sparse_ops_test.py | 1 + 6 files changed, 58 insertions(+), 28 deletions(-) create mode 100644 fbgemm_gpu/fbgemm_gpu/sparse_operators.py diff --git a/fbgemm_gpu/fbgemm_gpu/__init__.py b/fbgemm_gpu/fbgemm_gpu/__init__.py index 0ff9b00b79..f63fde22d2 100644 --- a/fbgemm_gpu/fbgemm_gpu/__init__.py +++ b/fbgemm_gpu/fbgemm_gpu/__init__.py @@ -19,7 +19,8 @@ open_source: bool = True # Re-export docs -from . import _fbgemm_gpu_docs # noqa: F401, E402 +# Trigger meta registrations +from . import _fbgemm_gpu_docs, sparse_operators # noqa: F401, E402 # noqa: F401, E402 # Re-export the version string from the auto-generated version file from ._fbgemm_gpu_version import __version__ # noqa: F401, E402 diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_operators.py b/fbgemm_gpu/fbgemm_gpu/sparse_operators.py new file mode 100644 index 0000000000..c561857335 --- /dev/null +++ b/fbgemm_gpu/fbgemm_gpu/sparse_operators.py @@ -0,0 +1,47 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional, Tuple + +import torch +from torch import Tensor + +try: + # pyre-ignore + from fbgemm_gpu import open_source # noqa: F401 +except Exception: + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") + + +@torch.library.impl_abstract("fbgemm::permute_2D_sparse_data") +def permute_2D_sparse_data_meta( + permute: Tensor, + lengths: Tensor, + values: Tensor, + weights: Optional[Tensor] = None, + permuted_lengths_sum: Optional[int] = None, +) -> Tuple[Tensor, Tensor, Optional[Tensor]]: + torch._check( + lengths.dim() == 2, lambda: f"expected lengths.dim() == 2, got {lengths.dim()}" + ) + T = permute.numel() + B = lengths.size(1) + indices = values + permuted_lengths = lengths.new_empty([T, B]) + permuted_indices_size = 0 + if permuted_lengths_sum is not None: + permuted_indices_size = permuted_lengths_sum + else: + ctx = torch._custom_op.impl.get_ctx() + permuted_indices_size = ctx.new_dynamic_size() + # pyre-fixme + permuted_indices = indices.new_empty(permuted_indices_size) + permuted_weights = None + if weights is not None: + # pyre-fixme + permuted_weights = weights.new_empty(permuted_indices_size) + return permuted_lengths, permuted_indices, permuted_weights diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 274df50962..61e9d4b92b 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2686,6 +2686,10 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "permute_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, int? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def( "permute_2D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, int? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); + m.impl_abstract_pystub( + "permute_2D_sparse_data", + "fbgemm_gpu.operators", + "//deeplearning/fbgemm/fbgemm_gpu:operators"); m.def( "permute_1D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, int? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def("invert_permute(Tensor permute) -> Tensor"); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_permute_2d.cu b/fbgemm_gpu/src/sparse_ops/sparse_permute_2d.cu index 5c1ca28e9c..24086e0031 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_permute_2d.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_permute_2d.cu @@ -92,9 +92,10 @@ permute_2D_sparse_data_cuda( // When T = 0 or B = 0, permutation will not be performed. Return the // input tensors. return { - lengths, - indices, - weights, + lengths.clone(), + indices.clone(), + weights.has_value() ? 
c10::make_optional(weights->clone()) + : c10::nullopt, }; } diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index e4da2429b9..f2f2ee1088 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -525,18 +525,6 @@ } }, "fbgemm::permute_2D_sparse_data": { - "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_embeddings": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_indices": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_indices_with_repeats": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_aot_dispatch_static__test_permute_embeddings": { "comment": "", "status": "xfail" @@ -549,18 +537,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_faketensor__test_permute_embeddings": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_faketensor__test_permute_indices": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_faketensor__test_permute_indices_with_repeats": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_schema__test_permute_indices": { "comment": "flaky", "status": "skip" diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index 8418f966de..d4da5cf592 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -36,6 +36,7 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:index_select_ops") + import fbgemm_gpu.sparse_operators # noqa: F401, E402 from fbgemm_gpu.test.test_utils import gpu_available, gpu_unavailable, skipIfRocm suppressed_list: List[HealthCheck] = ( From 688ad6541dc8462726661bd236bca477b48a5e81 Mon Sep 17 00:00:00 2001 From: William Wen Date: Tue, 24 Oct 2023 14:33:35 -0700 Subject: [PATCH 85/94] Enable some `fbgemm::pack_segments` tests (#2082) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2082 See previous diff. 
Reviewed By: zou3519 Differential Revision: D50509467 fbshipit-source-id: 7b4aab969ad4736e203c114cd13f379d05284501 --- fbgemm_gpu/test/failures_dict.json | 11 ++++++++++- fbgemm_gpu/test/sparse_ops_test.py | 26 -------------------------- 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index f2f2ee1088..de0ae7d4c7 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -491,7 +491,16 @@ "status": "xfail" } }, - "fbgemm::pack_segments": {}, + "fbgemm::pack_segments": { + "SparseOpsTest.test_aot_dispatch_dynamic__test_pack_segments": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_pack_segments": { + "comment": "", + "status": "xfail" + } + }, "fbgemm::permute102_baddbmm_permute102": { "SparseOpsTest.test_aot_dispatch_dynamic__test_permute102_baddbmm_permute102": { "comment": "", diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index d4da5cf592..19cbda8f1f 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -2415,38 +2415,12 @@ def validate( "test_faketensor__test_index_select_dim0": [unittest.skip("hangs")], "test_autograd_registration__test_index_select_dim0": [unittest.skip("hangs")], "test_schema__test_index_select_dim0": [unittest.skip("hangs")], - "test_aot_dispatch_dynamic__test_pack_segments": [ - unittest.skip("ASAN heap buffer overflow") - ], - "test_aot_dispatch_static__test_pack_segments": [ - unittest.skip("ASAN heap buffer overflow") - ], - "test_faketensor__test_pack_segments": [unittest.skip("ASAN heap buffer overflow")], - "test_autograd_registration__test_pack_segments": [ - unittest.skip("ASAN heap buffer overflow") - ], - "test_schema__test_pack_segments": [unittest.skip("ASAN heap buffer overflow")], "test_aot_dispatch_static__test_group_index_select_dim0": [ unittest.skip("CUDA memory error") ], "test_aot_dispatch_dynamic__test_group_index_select_dim0": [ unittest.skip("CUDA memory error") ], - "test_aot_dispatch_dynamic__test_pack_segments_smaller_max_len": [ - unittest.skip("RuntimeError: opcheck can only test operators without overloads") - ], - "test_aot_dispatch_static__test_pack_segments_smaller_max_len": [ - unittest.skip("RuntimeError: opcheck can only test operators without overloads") - ], - "test_faketensor__test_pack_segments_smaller_max_len": [ - unittest.skip("RuntimeError: opcheck can only test operators without overloads") - ], - "test_autograd_registration__test_pack_segments_smaller_max_len": [ - unittest.skip("RuntimeError: opcheck can only test operators without overloads") - ], - "test_schema__test_pack_segments_smaller_max_len": [ - unittest.skip("RuntimeError: opcheck can only test operators without overloads") - ], } # only generate tests on nightly pytorch (current release version is 2.1) From 93f5859253f82be71384749588ca85cbc29698e0 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Tue, 24 Oct 2023 16:47:27 -0700 Subject: [PATCH 86/94] updates for ROCm 6.0 support (#2088) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2088 ROCm 6.0 introduces backwards-incompatible changes such as removing the long-deprecated use of `__HIP_PLATFORM_HCC__`. It is better to use the USE_ROCM macro which is already defined and indicates a ROCm build. This PR also defines `__HIP_PLATFORM_AMD__` which is the new symbol name. 
This symbol is still required for compiling with HIP headers but when not using hip-clang. Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2086 Reviewed By: sryap Differential Revision: D50580075 Pulled By: q10 fbshipit-source-id: 6ceaa8bb36d64f2665001f9fb8afe2d4e431acb7 --- fbgemm_gpu/cmake/Hip.cmake | 1 + .../codegen/embedding_backward_dense_host.cpp | 2 +- ...embedding_backward_split_host_template.cpp | 2 +- .../embedding_backward_split_template.cu | 8 +-- ...mbedding_forward_quantized_split_lookup.cu | 2 +- .../embedding_forward_split_template.cu | 4 +- .../include/fbgemm_gpu/fbgemm_cuda_utils.cuh | 49 +++++++++---------- fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh | 2 +- fbgemm_gpu/src/jagged_tensor_ops/common.cuh | 10 ++-- ...e_elementwise_add_jagged_output_forward.cu | 6 +-- fbgemm_gpu/src/quantize_ops/common.cuh | 2 +- .../quantize_fused_8bit_rowwise.cu | 2 +- fbgemm_gpu/src/sparse_ops/common.cuh | 4 +- .../src/sparse_ops/sparse_permute102.cu | 4 +- .../split_embeddings_cache/lfu_cache_find.cu | 2 +- .../split_embeddings_cache/lru_cache_find.cu | 2 +- .../lru_cache_populate_byte.cu | 2 +- .../src/split_embeddings_cache/lxu_cache.cu | 2 +- fbgemm_gpu/src/split_embeddings_utils.cu | 2 +- fbgemm_gpu/src/topology_utils.cpp | 4 +- src/EmbeddingSpMDM.cc | 4 +- 21 files changed, 57 insertions(+), 59 deletions(-) diff --git a/fbgemm_gpu/cmake/Hip.cmake b/fbgemm_gpu/cmake/Hip.cmake index 5ef5ce0752..9730f1f945 100644 --- a/fbgemm_gpu/cmake/Hip.cmake +++ b/fbgemm_gpu/cmake/Hip.cmake @@ -103,6 +103,7 @@ set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH}) # Disable Asserts In Code (Can't use asserts on HIP stack.) ADD_DEFINITIONS(-DNDEBUG) ADD_DEFINITIONS(-DUSE_ROCM) +ADD_DEFINITIONS(-D__HIP_PLATFORM_AMD__) IF(NOT DEFINED ENV{PYTORCH_ROCM_ARCH}) SET(FBGEMM_ROCM_ARCH gfx900;gfx906;gfx908;gfx90a) diff --git a/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp b/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp index 730c9e4cf4..108c61fa67 100644 --- a/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp @@ -166,7 +166,7 @@ class SplitLookupFunction_Dense_Op TORCH_CHECK_EQ(grad_outputs.size(), 1); -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM constexpr int32_t BT_block_size = 64; constexpr int32_t max_segment_length_per_warp = 64; #else diff --git a/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp index a07f69977e..eb01e0b191 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp @@ -398,7 +398,7 @@ class {{ autograd_func }} : TORCH_CHECK_EQ(grad_outputs.size(), 1); -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM constexpr int32_t BT_block_size = 64; constexpr int32_t max_segment_length_per_warp = 64; #else diff --git a/fbgemm_gpu/codegen/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_template.cu index 3a3cfb1983..c621769bdc 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_backward_split_template.cu @@ -459,7 +459,7 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e // V100: 96 KB; A100: 160 KB; H100: 228 KB. 
int max_shared_bytes = 0; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM cudaDeviceGetAttribute(&max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev_weights.get_device()); #else // MI100 has 64 KB local memory (shared memory) per workgroup @@ -468,7 +468,7 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e C10_CUDA_KERNEL_LAUNCH_CHECK(); int shared_kb = max_shared_bytes >> 10; // V100: 64 KB; A100: 96 KB; H100: 144 KB -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM // Use 2/3 of the available GPU shared mem; leave rooms for L1$. int used_shared_kb = round_down(shared_kb * 2 / 3, 16); TORCH_CHECK_GT(used_shared_kb, 0); @@ -740,7 +740,7 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e kMaxVecsPerThread, kThreadGroupSize>; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM cudaFuncSetAttribute( backward_cta_per_row_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, @@ -851,7 +851,7 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e if (std::is_same::value) { shmem_bytes = BT_block_size * sizeof( at::acc_type) * 4 * kWarpSize * kMaxVecsPerThread; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM cudaFuncSetAttribute( backward_warp_per_row_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_split_lookup.cu b/fbgemm_gpu/codegen/embedding_forward_quantized_split_lookup.cu index 730788767f..4dcda7bf7e 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_split_lookup.cu +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_split_lookup.cu @@ -53,7 +53,7 @@ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_forward_pru const uint32_t subwarp_id = threadIdx.x / 4; const uint32_t subwarp_tid = threadIdx.x % 4; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM const uint64_t subwarp_mask = static_cast(0xF) << (4 * subwarp_id); #else const uint32_t subwarp_mask = static_cast(0xF) << (4 * subwarp_id); diff --git a/fbgemm_gpu/codegen/embedding_forward_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_split_template.cu index 60bd2a461c..d346e9441d 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_split_template.cu @@ -78,7 +78,7 @@ batch_index_select_dim0_codegen_forward_small_kernel( {%- endif %} {% if not dense %} -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM // Support only the split-pooled TBE case template < typename emb_t, @@ -647,7 +647,7 @@ batch_index_select_dim0_codegen_forward_cuda( // if (!is_experimental) } else { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM TORCH_CHECK(false, "is_experimental=True is not supported in ROCm"); #else // Allocate num warps per table based on max_D diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh index 1f33253964..41fef90327 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh @@ -13,7 +13,7 @@ #include // clang-format off -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #define HIPCUB_ARCH 1 #include #else @@ -35,8 +35,7 @@ #include #include #include -#if !defined(__HIP_PLATFORM_HCC__) && defined(CUDA_VERSION) && \ - CUDA_VERSION >= 9000 +#if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 9000 #define FBGEMM_USE_SUBWARP_SHUFFLE #endif @@ -58,14 +57,14 @@ namespace fbgemm_gpu { enum class PrimitiveType : uint8_t { FP = 0, INT = 1, BF = 2 }; -#ifdef 
__HIP_PLATFORM_HCC__ +#ifdef USE_ROCM namespace cub = hipcub; #endif #define DEVICE_INLINE __device__ inline __attribute__((always_inline)) // Warp size -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM static constexpr int32_t kWarpSize = 64; #else static constexpr int32_t kWarpSize = 32; @@ -93,7 +92,7 @@ struct Half4 { half2 b; __device__ inline void store(at::Half* p) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM p[0] = __low2half(a); p[1] = __high2half(a); p[2] = __low2half(b); @@ -157,7 +156,7 @@ struct Vec4T { } DEVICE_INLINE void load(const at::Half* p) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM union U { half2 h[2]; uint2 ui; @@ -311,7 +310,7 @@ struct Vec4T { } DEVICE_INLINE void load(const at::Half* p) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM union U { half2 h[2]; uint2 ui; @@ -409,7 +408,7 @@ struct Vec4T { } DEVICE_INLINE static void copy(const at::Half* src, at::Half* dst) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; @@ -525,7 +524,7 @@ struct Vec4T { } DEVICE_INLINE void load(const at::Half* p) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM union U { half2 h[2]; uint2 ui; @@ -705,7 +704,7 @@ struct Vec4T { } DEVICE_INLINE void load(const at::Half* p) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM union U { half2 h[2]; uint2 ui; @@ -854,7 +853,7 @@ DEVICE_INLINE T shfl_xor( int laneMask, int width = kWarpSize, unsigned shfl_sync_mask = kFullWarpMask) { -#if defined(__HIP_PLATFORM_HCC__) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) || CUDA_VERSION < 9000 return __shfl_xor(val, laneMask, width); #else return __shfl_xor_sync(shfl_sync_mask, val, laneMask, width); @@ -867,7 +866,7 @@ DEVICE_INLINE T shfl_sync( int srcLane = 0, int width = kWarpSize, unsigned shfl_sync_mask = kFullWarpMask) { -#if defined(__HIP_PLATFORM_HCC__) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) || CUDA_VERSION < 9000 return __shfl(val, srcLane, width); #else return __shfl_sync(shfl_sync_mask, val, srcLane, width); @@ -880,21 +879,21 @@ DEVICE_INLINE T shfl_down_sync( unsigned delta, int width = kWarpSize, unsigned shfl_sync_mask = kFullWarpMask) { -#if defined(__HIP_PLATFORM_HCC__) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) || CUDA_VERSION < 9000 return __shfl_down(val, delta, width); #else return __shfl_down_sync(shfl_sync_mask, val, delta, width); #endif } -#if defined(__HIP_PLATFORM_HCC__) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) || CUDA_VERSION < 9000 DEVICE_INLINE uint64_t ballot_sync( #else DEVICE_INLINE uint32_t ballot_sync( #endif int predicate, unsigned shfl_sync_mask = kFullWarpMask) { -#if defined(__HIP_PLATFORM_HCC__) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) || CUDA_VERSION < 9000 return __ballot(predicate); #else return __ballot_sync(shfl_sync_mask, predicate); @@ -913,7 +912,7 @@ warpReduceAllSum(T val, unsigned shfl_sync_mask = kFullWarpMask) { } DEVICE_INLINE void syncwarp() { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM // Performance - replace a block level __syncthreads with per CU // __threadfence_block. It is a fine replacement for __syncwarp on AMD GPUs, // it is because a. memory fencing: __threadfence_block ops. 
at CU level, @@ -1002,7 +1001,7 @@ inline __device__ void warpBitonicMergeLE16(K& k, V& v) { template struct BitonicSort { static inline __device__ void sort(K k[1], V v[1]) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM static_assert(fbgemm_gpu::kWarpSize == 64, "unexpected warp size"); #else static_assert(fbgemm_gpu::kWarpSize == 32, "unexpected warp size"); @@ -1607,7 +1606,7 @@ struct __align__(32) half16 { half2 vals[8]; }; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM using __nv_bfloat16 = hip_bfloat16; typedef struct __align__(4) { @@ -1689,7 +1688,7 @@ DEVICE_INLINE half16 to_half16(float_16 v) { // Override __bfloat162float to accept at::BFloat16 static DEVICE_INLINE float __bfloat162float(const at::BFloat16 input) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM return float(*reinterpret_cast(&input)); #else return __bfloat162float(*reinterpret_cast(&input)); @@ -1709,7 +1708,7 @@ static DEVICE_INLINE float to_float(const at::BFloat16 input) { return __bfloat162float(input); } -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM // the descriptions of __float2bfloat16 and __float2bfloat16_rn are identical // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____BFLOAT16__MISC.html#group__CUDA__MATH____BFLOAT16__MISC static __host__ __device__ __nv_bfloat16 __float2bfloat16(float f) { @@ -1829,8 +1828,7 @@ DEVICE_INLINE float_16 make_zero_float_16() { __forceinline__ __device__ __half2 hfma2(const __half2 a, const __half2 b, const __half2 c) { -#if (__CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610) || \ - defined(__HIP_PLATFORM_HCC__) +#if (__CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610) || defined(USE_ROCM) return __hfma2(a, b, c); #else float2 fa, fb, fc; @@ -1844,8 +1842,7 @@ hfma2(const __half2 a, const __half2 b, const __half2 c) { } __forceinline__ __device__ half hmul(half a, half b) { -#if (__CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610) || \ - defined(__HIP_PLATFORM_HCC__) +#if (__CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610) || defined(USE_ROCM) return __hmul(a, b); #else return __float2half(__half2float(a) * __half2float(b)); @@ -3603,7 +3600,7 @@ DEVICE_INLINE float float16_min(float_16 val) { // ROCm does not natively support __any_sync(). Using __ballot() // (https://rocmdocs.amd.com/en/latest/Programming_Guides/Kernel_language.html) // to implement __any_sync(). Note: the "warp-size" of AMD GPU is 64. 
-#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM __device__ int __any_sync(uint64_t mask, int predicate) { uint64_t predicate_bit_pattern = __ballot(predicate); return (predicate_bit_pattern & mask) > 0; diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh index 5179b53dda..20b43cded5 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh @@ -8,7 +8,7 @@ #pragma once -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #define HIPCUB_ARCH 1 #endif diff --git a/fbgemm_gpu/src/jagged_tensor_ops/common.cuh b/fbgemm_gpu/src/jagged_tensor_ops/common.cuh index 8a9f529a9b..da910d6f6b 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/common.cuh +++ b/fbgemm_gpu/src/jagged_tensor_ops/common.cuh @@ -661,7 +661,7 @@ inline bool jagged_dense_dense_elementwise_jagged_output_matches_opt( matches &= (y_0_reshaped.size(1) < INT_MAX); int max_shared_bytes; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM C10_CUDA_CHECK(cudaDeviceGetAttribute( &max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, @@ -671,7 +671,7 @@ inline bool jagged_dense_dense_elementwise_jagged_output_matches_opt( max_shared_bytes = 64 << 10; #endif int shared_kb = max_shared_bytes >> 10; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM // Use 2/3 of the available GPU shared mem; leave rooms for L1$. int used_shared_kb = round_down(shared_kb * 2 / 3, 16); TORCH_CHECK(used_shared_kb > 0); @@ -779,7 +779,7 @@ void jagged_dense_elementwise_jagged_output_opt_( at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock; if (dynamic_smem_size > cur_max_shared_bytes) { int max_shared_bytes; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM C10_CUDA_CHECK(cudaDeviceGetAttribute( &max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, @@ -789,7 +789,7 @@ void jagged_dense_elementwise_jagged_output_opt_( max_shared_bytes = 64 << 10; #endif int shared_kb = max_shared_bytes >> 10; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM // Use 2/3 of the available GPU shared mem; leave rooms for L1$. int used_shared_kb = round_down(shared_kb * 2 / 3, 16); TORCH_CHECK(used_shared_kb > 0); @@ -798,7 +798,7 @@ void jagged_dense_elementwise_jagged_output_opt_( int used_shared_kb = shared_kb; #endif int used_shared_bytes = used_shared_kb << 10; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM C10_CUDA_CHECK(cudaFuncSetAttribute( jagged_dense_dense_elementwise_jagged_output_opt_search_kernel_< index_t>, diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu b/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu index ffdcf9db73..dc19378547 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu @@ -97,7 +97,7 @@ void jagged_dense_dense_elementwise_jagged_output_opt_( at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock; if (dynamic_smem_size > cur_max_shared_bytes) { int max_shared_bytes; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM C10_CUDA_CHECK(cudaDeviceGetAttribute( &max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, @@ -107,7 +107,7 @@ void jagged_dense_dense_elementwise_jagged_output_opt_( max_shared_bytes = 64 << 10; #endif int shared_kb = max_shared_bytes >> 10; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM // Use 2/3 of the available GPU shared mem; leave rooms for L1$. 
int used_shared_kb = round_down(shared_kb * 2 / 3, 16); TORCH_CHECK_GT(used_shared_kb, 0); @@ -116,7 +116,7 @@ void jagged_dense_dense_elementwise_jagged_output_opt_( int used_shared_kb = shared_kb; #endif int used_shared_bytes = used_shared_kb << 10; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM C10_CUDA_CHECK(cudaFuncSetAttribute( jagged_dense_dense_elementwise_jagged_output_opt_search_kernel_< index_t>, diff --git a/fbgemm_gpu/src/quantize_ops/common.cuh b/fbgemm_gpu/src/quantize_ops/common.cuh index aff4b260a6..eb1a1eaadf 100644 --- a/fbgemm_gpu/src/quantize_ops/common.cuh +++ b/fbgemm_gpu/src/quantize_ops/common.cuh @@ -9,7 +9,7 @@ #include #include #include -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM #include #endif diff --git a/fbgemm_gpu/src/quantize_ops/quantize_fused_8bit_rowwise.cu b/fbgemm_gpu/src/quantize_ops/quantize_fused_8bit_rowwise.cu index dd6e088932..f90bdfe2f4 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_fused_8bit_rowwise.cu +++ b/fbgemm_gpu/src/quantize_ops/quantize_fused_8bit_rowwise.cu @@ -64,7 +64,7 @@ __global__ inline void _get_8bit_qparam_cuda_kernel( const int output_columns = ncols_aligned + 2 * sizeof(float); // starting values for future reductions -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #define HIPRT_INF_F __int_as_float(0x7f800000) float minimum_element = HIPRT_INF_F; float maximum_element = -HIPRT_INF_F; diff --git a/fbgemm_gpu/src/sparse_ops/common.cuh b/fbgemm_gpu/src/sparse_ops/common.cuh index 5ba8a34742..5cfca60e23 100644 --- a/fbgemm_gpu/src/sparse_ops/common.cuh +++ b/fbgemm_gpu/src/sparse_ops/common.cuh @@ -31,11 +31,11 @@ #include "fbgemm_gpu/fbgemm_cuda_utils.cuh" #include "fbgemm_gpu/split_embeddings_utils.cuh" -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #include #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #define LDG(ptr) (*(ptr)) #else #define LDG(ptr) (__ldg(ptr)) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_permute102.cu b/fbgemm_gpu/src/sparse_ops/sparse_permute102.cu index e7fb33eda8..52dd6674fd 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_permute102.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_permute102.cu @@ -15,7 +15,7 @@ https://www.reddit.com/r/LocalLLaMA/comments/162j9uj/the_final_guide_for_rocm_users/ https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUBLAS_API_supported_by_HIP.html */ -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #define rocblas_set_stream hipblasSetStream #define rocblas_status_success HIPBLAS_STATUS_SUCCESS #endif @@ -63,7 +63,7 @@ DLL_PUBLIC Tensor permute102_baddbmm_permute102_cuda( // C (m, b, n) = A (m, b, k) * B (b, k, n) ---> row major // C (m, b, n) = (B^T (b, k, n) * A^T (m, b, k))^T ---> column major -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM float alpha = 1.0f; float beta = 1.0f; diff --git a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu index 3c154d7c0d..0382de8e96 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu +++ b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu @@ -53,7 +53,7 @@ __global__ __launch_bounds__(kMaxThreads) void lfu_cache_find_uncached_kernel( const auto slot = threadIdx.x; const bool found = ::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM if (!__any_sync(0xFFFFFFFFFFFFFFFF, found)) { #else if (!__any_sync(0xFFFFFFFF, found)) { diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu index 95e2639464..a3c4926624 100644 --- 
a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu +++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu @@ -131,7 +131,7 @@ __global__ __launch_bounds__(kMaxThreads) void lru_cache_find_uncached_kernel( } } -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM if (!__any_sync(0xFFFFFFFFFFFFFFFF, found)) { #else if (!__any_sync(0xFFFFFFFF, found)) { diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu index 2f580d5a03..40be037da1 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu +++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu @@ -63,7 +63,7 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_find_uncached_kernel cache_sets[n] = -1; // sentinel value } else { // There is no atomicMax for int64_t... -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM auto addr = reinterpret_cast( &lxu_cache_miss_timestamp[cache_set][0]); auto val = static_cast(time_stamp + 1); diff --git a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu index 445ad68c9d..6d43c259b5 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu +++ b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu @@ -280,7 +280,7 @@ __global__ __launch_bounds__(kMaxThreads) void lxu_cache_lookup_kernel( n_indices++; const bool found = (::__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx); -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM // FIXME: __ballot_sync with mask isn't supported by HIP yet. // See https://fburl.com/fvy7j0lq for the similar context. // assert false here with https://fburl.com/pfm7enw2 diff --git a/fbgemm_gpu/src/split_embeddings_utils.cu b/fbgemm_gpu/src/split_embeddings_utils.cu index b9fba4b5f5..dd5c0ec70a 100644 --- a/fbgemm_gpu/src/split_embeddings_utils.cu +++ b/fbgemm_gpu/src/split_embeddings_utils.cu @@ -21,7 +21,7 @@ #include "fbgemm_gpu/cub_namespace_postfix.cuh" // clang-format on -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #include #endif diff --git a/fbgemm_gpu/src/topology_utils.cpp b/fbgemm_gpu/src/topology_utils.cpp index 7132eb1c12..9e53ae9a1e 100644 --- a/fbgemm_gpu/src/topology_utils.cpp +++ b/fbgemm_gpu/src/topology_utils.cpp @@ -14,7 +14,7 @@ #include "fbgemm_gpu/topology_utils.h" -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #include "hip/hip_runtime.h" #include "rocm_smi/rocm_smi.h" @@ -184,4 +184,4 @@ AdjacencyMatrix get_nvlink_matrix() { } } // namespace fbgemm_gpu -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM diff --git a/src/EmbeddingSpMDM.cc b/src/EmbeddingSpMDM.cc index c1e833e7fb..144f8c0730 100644 --- a/src/EmbeddingSpMDM.cc +++ b/src/EmbeddingSpMDM.cc @@ -1594,7 +1594,7 @@ void compressed_indices_remap( #ifndef NO_AVX512 const inst_set_t isa = fbgemmInstructionSet(); if (isZmm(isa)) { -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM if (weights == nullptr) { internal::compressed_indices_remap_avx512( offsets_len, @@ -1618,7 +1618,7 @@ void compressed_indices_remap( out_weights); return; } -#endif // __HIP_PLATFORM_HCC__ +#endif // USE_ROCM } #endif // NO_AVX512 #endif // CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 From 97e62630fe9a8a7958c70336d0c035d1c330fd9d Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 24 Oct 2023 16:47:38 -0700 Subject: [PATCH 87/94] Remove impl_abstract_pystub for now, while old PyTorch is in support window Reviewed By: zou3519 Differential Revision: D50610665 fbshipit-source-id: 53bff35c79ae11634380d967e737bdf00ab2015e --- 
fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 61e9d4b92b..274df50962 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2686,10 +2686,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "permute_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, int? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def( "permute_2D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, int? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); - m.impl_abstract_pystub( - "permute_2D_sparse_data", - "fbgemm_gpu.operators", - "//deeplearning/fbgemm/fbgemm_gpu:operators"); m.def( "permute_1D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, int? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def("invert_permute(Tensor permute) -> Tensor"); From 9cd8ce8404c696a9cf619c47bd820ac7d9bd2263 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 24 Oct 2023 19:12:40 -0700 Subject: [PATCH 88/94] Migrate split embeddings cache registrations (#2085) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2085 - Migrate split embeddings cache ops registrations into the split_embeddings_cache directory Reviewed By: spcyppt Differential Revision: D50569476 fbshipit-source-id: 838efdf376abfd6f07104a05d59343e122038183 --- fbgemm_gpu/CMakeLists.txt | 2 +- fbgemm_gpu/docs/Doxyfile.in | 2 +- .../split_embeddings_cache_ops.cpp} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename fbgemm_gpu/src/{split_table_batched_embeddings.cpp => split_embeddings_cache/split_embeddings_cache_ops.cpp} (100%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index c193c5b733..29141a5615 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -588,7 +588,7 @@ if(NOT FBGEMM_CPU_ONLY) src/quantize_ops/quantize_ops_gpu.cpp src/sparse_ops/sparse_ops_gpu.cpp src/split_embeddings_utils.cpp - src/split_table_batched_embeddings.cpp + src/split_embeddings_cache/split_embeddings_cache_ops.cpp src/metric_ops_host.cpp src/embedding_inplace_update_gpu.cpp src/input_combine_gpu.cpp diff --git a/fbgemm_gpu/docs/Doxyfile.in b/fbgemm_gpu/docs/Doxyfile.in index 363eb553d8..b823f869c1 100644 --- a/fbgemm_gpu/docs/Doxyfile.in +++ b/fbgemm_gpu/docs/Doxyfile.in @@ -921,7 +921,7 @@ INPUT = "../include/fbgemm_gpu" \ "../include/fbgemm_gpu/sparse_ops.h" \ "../fbgemm_gpu/src/quantize_ops.cu" \ "../src/quantize_ops_cpu.cpp" \ - "../src/split_table_batched_embeddings.cpp" \ + "../src/split_embeddings_cache/split_embeddings_cache_ops.cpp" \ "../src/jagged_tensor_ops.cu" \ "../src/jagged_tensor_ops_cpu.cpp" \ "../src/cumem_utils.h" \ diff --git a/fbgemm_gpu/src/split_table_batched_embeddings.cpp b/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp similarity index 100% rename from fbgemm_gpu/src/split_table_batched_embeddings.cpp rename to fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp From 34f62ad539c3af8cad26ac3e52d4e692caa851ff Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 25 Oct 2023 20:43:25 -0700 Subject: [PATCH 89/94] impl_abstract for permute_1D_sparse_data (#2087) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2087 Reviewed By: zou3519 Differential Revision: D50584541 fbshipit-source-id: 6adcb833e779ba35464bdb3e4ec0f13fbe514f8a 
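For context on this and the following commit: an abstract (meta) implementation registered with torch.library.impl_abstract only computes output shapes, dtypes, and devices, which is what lets FakeTensor tracing and torch.compile reason about an operator without dispatching to a real CPU/CUDA kernel. Below is a minimal sketch of that registration pattern, assuming a PyTorch build that provides torch.library.define and torch.library.impl_abstract; the operator name myns::my_gather is purely hypothetical and not part of FBGEMM.

    import torch
    from torch import Tensor

    # Hypothetical operator, defined here only so the sketch is self-contained.
    torch.library.define("myns::my_gather", "(Tensor src, Tensor index) -> Tensor")

    @torch.library.impl_abstract("myns::my_gather")
    def my_gather_abstract(src: Tensor, index: Tensor) -> Tensor:
        # Only output metadata (shape/dtype/device) is produced here; no kernel
        # runs, so FakeTensor and torch.compile can trace calls symbolically.
        return src.new_empty(index.shape)

The patch below applies the same pattern to existing fbgemm ops such as fbgemm::permute_1D_sparse_data, using ctx.new_dynamic_size() from the custom-op context when an output length is data-dependent rather than derivable from input shapes.
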
--- fbgemm_gpu/fbgemm_gpu/sparse_operators.py | 26 +++++++++++++++++++++++ fbgemm_gpu/test/failures_dict.json | 8 ------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_operators.py b/fbgemm_gpu/fbgemm_gpu/sparse_operators.py index c561857335..091792ac19 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_operators.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_operators.py @@ -45,3 +45,29 @@ def permute_2D_sparse_data_meta( # pyre-fixme permuted_weights = weights.new_empty(permuted_indices_size) return permuted_lengths, permuted_indices, permuted_weights + + +@torch.library.impl_abstract("fbgemm::permute_1D_sparse_data") +def permute_1D_sparse_data_meta( + permute: Tensor, + lengths: Tensor, + values: Tensor, + weights: Optional[Tensor] = None, + permuted_lengths_sum: Optional[int] = None, +) -> Tuple[Tensor, Tensor, Optional[Tensor]]: + indices = values + permuted_lengths_size = permute.numel() + permuted_lengths = lengths.new_empty([permuted_lengths_size]) + permuted_indices_size = 0 + if permuted_lengths_sum is not None: + permuted_indices_size = permuted_lengths_sum + else: + ctx = torch._custom_op.impl.get_ctx() + permuted_indices_size = ctx.new_dynamic_size() + # pyre-fixme + permuted_indices = indices.new_empty(permuted_indices_size) + permuted_weights = None + if weights is not None: + # pyre-fixme + permuted_weights = weights.new_empty(permuted_indices_size) + return permuted_lengths, permuted_indices, permuted_weights diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index de0ae7d4c7..daca769e1e 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -516,18 +516,10 @@ } }, "fbgemm::permute_1D_sparse_data": { - "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_indices": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_aot_dispatch_static__test_permute_indices": { "comment": "", "status": "xfail" }, - "SparseOpsTest.test_faketensor__test_permute_indices": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_schema__test_permute_indices": { "comment": "flaky", "status": "skip" From 3283a686e0dd46a42e8f819a03d668d435f10019 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 25 Oct 2023 20:43:25 -0700 Subject: [PATCH 90/94] impl_abstract expand_into_jagged_permute (#2090) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2090 Reviewed By: zou3519 Differential Revision: D50586828 fbshipit-source-id: 2f92a717877dfaf7b56fbea56df67acc272fd8f5 --- fbgemm_gpu/fbgemm_gpu/sparse_operators.py | 20 ++++++++++++++++++++ fbgemm_gpu/test/failures_dict.json | 15 +-------------- fbgemm_gpu/test/jagged_tensor_ops_test.py | 1 + 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_operators.py b/fbgemm_gpu/fbgemm_gpu/sparse_operators.py index 091792ac19..8107ba09f2 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_operators.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_operators.py @@ -71,3 +71,23 @@ def permute_1D_sparse_data_meta( # pyre-fixme permuted_weights = weights.new_empty(permuted_indices_size) return permuted_lengths, permuted_indices, permuted_weights + + +@torch.library.impl_abstract("fbgemm::expand_into_jagged_permute") +def expand_into_jagged_permute_meta( + permute: Tensor, + input_offsets: Tensor, + output_offsets: Tensor, + output_size: Tuple[int, ...], +) -> Tensor: + torch._check(permute.numel() > 0, lambda: "expected {permute.numel} > 0") + torch._check( + permute.numel() == input_offsets.numel() - 1, + 
lambda: f"expected {permute.numel()} == {input_offsets.numel()} - 1", + ) + torch._check( + permute.numel() == output_offsets.numel() - 1, + lambda: f"expected {permute.numel()} == {output_offsets.numel()} - 1", + ) + output_permute = input_offsets.new_empty(output_size) + return output_permute diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index daca769e1e..b8cbae093b 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -154,20 +154,7 @@ "status": "xfail" } }, - "fbgemm::expand_into_jagged_permute": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_expand_into_jagged_permute": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_expand_into_jagged_permute": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_expand_into_jagged_permute": { - "comment": "", - "status": "xfail" - } - }, + "fbgemm::expand_into_jagged_permute": {}, "fbgemm::generic_histogram_binning_calibration_by_feature": { "SparseOpsTest.test_aot_dispatch_dynamic__test_generic_histogram_binning_calibration_by_feature": { "comment": "", diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index ddd4464a7e..f83bd941bc 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -35,6 +35,7 @@ except Exception: torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") + import fbgemm_gpu.sparse_operators # noqa: F401, E402 from fbgemm_gpu.test.test_utils import ( gpu_available, gpu_unavailable, From bbc5358bb9d5de38301f8e7e29eb295af3615452 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 25 Oct 2023 20:43:25 -0700 Subject: [PATCH 91/94] SymInt'ify all operator schemas in sparse_ops (#2089) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2089 I'm not entirely sure about some of these (e.g., bin_ctr_in_use_after) but it should be harmless enough to accept symbolic SymInt at all these sites. Reviewed By: zou3519 Differential Revision: D50587015 fbshipit-source-id: 65484b2535753cd2732c80e16b7cc590ace9e723 --- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 36 ++++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 274df50962..8d2e781fe8 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2683,43 +2683,43 @@ Tensor bottom_k_per_row( TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( - "permute_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, int? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); + "permute_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def( - "permute_2D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, int? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); + "permute_2D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def( - "permute_1D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, int? 
permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); + "permute_1D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def("invert_permute(Tensor permute) -> Tensor"); m.def( - "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, int output_size) -> Tensor"); + "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, SymInt output_size) -> Tensor"); m.def( - "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, int my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, int max_B= -1) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)"); + "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)"); m.def( - "bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, int my_size, Tensor? weights=None) -> (Tensor, Tensor, Tensor?, Tensor?)"); + "bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, SymInt my_size, Tensor? weights=None) -> (Tensor, Tensor, Tensor?, Tensor?)"); m.def("asynchronous_exclusive_cumsum(Tensor t_in) -> Tensor"); m.def("asynchronous_inclusive_cumsum(Tensor t_in) -> Tensor"); m.def("asynchronous_complete_cumsum(Tensor t_in) -> Tensor"); m.def( - "reorder_batched_ad_lengths(Tensor cat_ad_lengths, Tensor batch_offsets, int num_ads_in_batch, bool broadcast_lengths=False) -> Tensor"); + "reorder_batched_ad_lengths(Tensor cat_ad_lengths, Tensor batch_offsets, SymInt num_ads_in_batch, bool broadcast_lengths=False) -> Tensor"); m.def( - "reorder_batched_ad_indices(Tensor cat_ad_offsets, Tensor cat_ad_indices, Tensor reordered_cat_ad_offsets, Tensor batch_offsets, int num_ads_in_batch, bool broadcast_indices=False, int num_indices_after_broadcast=-1) -> Tensor"); + "reorder_batched_ad_indices(Tensor cat_ad_offsets, Tensor cat_ad_indices, Tensor reordered_cat_ad_offsets, Tensor batch_offsets, SymInt num_ads_in_batch, bool broadcast_indices=False, SymInt num_indices_after_broadcast=-1) -> Tensor"); m.def( - "cat_reorder_batched_ad_indices(Tensor cat_ad_offsets, Tensor[] cat_ad_indices, Tensor reordered_cat_ad_offsets, Tensor batch_offsets, int num_ads_in_batch, bool broadcast_indices, int total_num_indices, bool pinned_memory=False) -> Tensor"); + "cat_reorder_batched_ad_indices(Tensor cat_ad_offsets, Tensor[] cat_ad_indices, Tensor reordered_cat_ad_offsets, Tensor batch_offsets, SymInt num_ads_in_batch, bool broadcast_indices, SymInt total_num_indices, bool pinned_memory=False) -> Tensor"); m.def("offsets_range(Tensor offsets, SymInt range_size) -> Tensor"); m.def( "batched_unary_embeddings(Tensor weight, Tensor table_offsets, Tensor offsets, Tensor indices) -> Tensor"); m.def( - "histogram_binning_calibration(Tensor logit, Tensor bin_num_examples, Tensor bin_num_positives, float positive_weight, float lower_bound, float upper_bound, int bin_ctr_in_use_after, float bin_ctr_weight_value) -> (Tensor, Tensor)"); + "histogram_binning_calibration(Tensor logit, Tensor bin_num_examples, Tensor bin_num_positives, float positive_weight, float lower_bound, float upper_bound, SymInt bin_ctr_in_use_after, float bin_ctr_weight_value) -> (Tensor, Tensor)"); m.def( - 
"histogram_binning_calibration_by_feature(Tensor logit, Tensor segment_value, Tensor segment_lengths, int num_segments, Tensor bin_num_examples, Tensor bin_num_positives, int num_bins, float positive_weight, float lower_bound, float upper_bound, int bin_ctr_in_use_after, float bin_ctr_weight_value) -> (Tensor, Tensor)"); + "histogram_binning_calibration_by_feature(Tensor logit, Tensor segment_value, Tensor segment_lengths, SymInt num_segments, Tensor bin_num_examples, Tensor bin_num_positives, SymInt num_bins, float positive_weight, float lower_bound, float upper_bound, SymInt bin_ctr_in_use_after, float bin_ctr_weight_value) -> (Tensor, Tensor)"); m.def( - "generic_histogram_binning_calibration_by_feature(Tensor logit, Tensor segment_value, Tensor segment_lengths, int num_segments, Tensor bin_num_examples, Tensor bin_num_positives, Tensor bin_boundaries, float positive_weight, int bin_ctr_in_use_after, float bin_ctr_weight_value) -> (Tensor, Tensor)"); + "generic_histogram_binning_calibration_by_feature(Tensor logit, Tensor segment_value, Tensor segment_lengths, SymInt num_segments, Tensor bin_num_examples, Tensor bin_num_positives, Tensor bin_boundaries, float positive_weight, SymInt bin_ctr_in_use_after, float bin_ctr_weight_value) -> (Tensor, Tensor)"); m.def( - "segment_sum_csr(int batch_size, Tensor csr_seg, Tensor values) -> Tensor"); + "segment_sum_csr(SymInt batch_size, Tensor csr_seg, Tensor values) -> Tensor"); m.def( - "embedding_bag_rowwise_prune(Tensor weight, Tensor indicator, float threshold, ScalarType compressed_indices_dtype, bool abs=True, int min_num_rows=0, float? min_save_ratio=1.0) -> (Tensor, Tensor)"); - m.def("lengths_range(Tensor t_in, int[]? shape=None) -> Tensor"); + "embedding_bag_rowwise_prune(Tensor weight, Tensor indicator, float threshold, ScalarType compressed_indices_dtype, bool abs=True, SymInt min_num_rows=0, float? min_save_ratio=1.0) -> (Tensor, Tensor)"); + m.def("lengths_range(Tensor t_in, SymInt[]? shape=None) -> Tensor"); m.def( - "lengths_range_out(Tensor output, Tensor t_in, int[]? shape=None) -> Tensor"); + "lengths_range_out(Tensor output, Tensor t_in, SymInt[]? shape=None) -> Tensor"); m.def( "permute_sparse_features(Tensor permute, Tensor lengths, Tensor indices, Tensor? weights=None) -> (Tensor, Tensor, Tensor?)"); m.def("Bfloat16QuantizedToFloat(Tensor input) -> Tensor"); @@ -2750,7 +2750,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { // // skip_indices_sorting_fwd is for skipping indices sorting in forward m.def( - "index_select_dim0(Tensor input, Tensor indices, int? consecutive_range_start=0, int? consecutive_range_length=0, bool? skip_indices_sorting_fwd=None) -> Tensor"); + "index_select_dim0(Tensor input, Tensor indices, SymInt? consecutive_range_start=0, SymInt? consecutive_range_length=0, bool? skip_indices_sorting_fwd=None) -> Tensor"); m.def( "group_index_select_dim0(Tensor[] input_group, Tensor[] indices_group) -> Tensor[]"); // This is an one-off op to be used in split_embedding_utils.py for zipf @@ -2763,7 +2763,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "bottom_k_per_row(Tensor input, Tensor k_offsets, bool requires_unique) -> Tensor"); m.def( - "keyed_jagged_index_select_dim1(Tensor values, Tensor lengths, Tensor offsets, Tensor indices, int batch_size, Tensor? weights=None) -> Tensor[]"); + "keyed_jagged_index_select_dim1(Tensor values, Tensor lengths, Tensor offsets, Tensor indices, SymInt batch_size, Tensor? 
weights=None) -> Tensor[]"); } TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { From 6c2be8831a67d4ab5c12fc7456e1e4e192c67c38 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 25 Oct 2023 20:43:25 -0700 Subject: [PATCH 92/94] Turn on opcheck for split_table_batched_embeddings_test (#2092) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2092 Reviewed By: zou3519 Differential Revision: D50613728 fbshipit-source-id: 5b3a8d91166a491147c728e5c024de9e118088ef --- fbgemm_gpu/test/failures_dict_fast.json | 810 ++++++++++++++++++ .../split_table_batched_embeddings_test.py | 46 +- fbgemm_gpu/test/test_utils.py | 26 +- 3 files changed, 870 insertions(+), 12 deletions(-) create mode 100644 fbgemm_gpu/test/failures_dict_fast.json diff --git a/fbgemm_gpu/test/failures_dict_fast.json b/fbgemm_gpu/test/failures_dict_fast.json new file mode 100644 index 0000000000..0e7d666814 --- /dev/null +++ b/fbgemm_gpu/test/failures_dict_fast.json @@ -0,0 +1,810 @@ +{ + "_description": "This is a dict containing failures for tests autogenerated by generate_opcheck_tests. For more details, please see https://docs.google.com/document/d/1Pj5HRZvdOq3xpFpbEjUZp2hBovhy7Wnxw14m6lF2154/edit", + "_version": 1, + "data": { + "fbgemm::FloatToFused8BitRowwiseQuantized": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_cpu_int8": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_fused_pooled_emb_quant": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_gpu_no_cache_int8": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_gpu_uvm_cache_int8": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_fused_pooled_emb_quant": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::FloatToFusedNBitRowwiseQuantizedSBHalf": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_fused_pooled_emb_quant": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::FloatToHFP8Quantized": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_cpu": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_cpu_bf16_out": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache_fp8_2048": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::Fused8BitRowwiseQuantizedToFloat": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_cpu_int8": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_fused_pooled_emb_quant": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_gpu_no_cache_int8": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_gpu_uvm_cache_int8": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_fused_pooled_emb_quant": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::HFP8QuantizedToFloat": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_cpu": { + "comment": "", + "status": "xfail" + }, + 
"SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_cpu_bf16_out": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache_fp8_2048": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::bounds_check_indices": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_none": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_none_with_rowwise_adagrad": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adam": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_lamb": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_lars": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_bounds_check": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_miss_counter": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_fused_pooled_emb_quant": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_cache_miss_counter": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": { + "comment": "", + "status": 
"xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_fused_pooled_emb_quant": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_uvm_cache_stats": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_stb_uvm_cache_stats": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::dense_embedding_codegen_lookup_function": { + "SplitTableBatchedEmbeddingsTest.test_autograd_registration__test_backward_dense": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_dense": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::direct_mapped_lru_cache_populate_byte": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::direct_mapped_lxu_cache_lookup": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::emb_inplace_update": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_embedding_inplace_update": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::int_nbit_split_embedding_codegen_lookup_function": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_cache_miss_counter": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_fused_pooled_emb_quant": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_uvm_cache_stats": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::int_nbit_split_embedding_uvm_caching_codegen_lookup_function": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_schema__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::lfu_cache_populate": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_schema__test_cache_pipeline": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::lfu_cache_populate_byte": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::linearize_cache_indices": { + 
"SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_miss_counter": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_linearize_cache_indices": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_cache_miss_counter": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_uvm_cache_stats": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_stb_uvm_cache_stats": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::linearize_cache_indices_from_row_idx": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_linearize_cache_indices_from_row_idx": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::lru_cache_populate": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_miss_counter": { + "comment": "", + "status": "xfail" + }, + 
"SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_stb_uvm_cache_stats": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_schema__test_cache_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_schema__test_cache_prefetch_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_schema__test_cache_prefetch_pipeline_stream_1": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_schema__test_cache_prefetch_pipeline_stream_2": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::lru_cache_populate_byte": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_cache_miss_counter": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_uvm_cache_stats": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::lxu_cache_flush": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::lxu_cache_locking_counter_decrement": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_lxu_cache_locking_counter_decrement": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::lxu_cache_lookup": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { + "comment": "", + "status": "xfail" + }, + 
"SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_miss_counter": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_lxu_cache_lookup": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_cache_miss_counter": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_uvm_cache_stats": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_stb_uvm_cache_stats": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::new_managed_tensor": {}, + "fbgemm::new_unified_tensor": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_cache_miss_counter": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_cache_update_function": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache_fp8_2048": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_uvm_cache_stats": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::pruned_array_lookup": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_pruning": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::pruned_hashmap_insert": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache": { + "comment": "", + "status": "xfail" + }, + 
"SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_pruning": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::pruned_hashmap_lookup": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_pruning": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::reset_weight_momentum": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_reset_embedding_weight_momentum": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_schema__test_reset_embedding_weight_momentum": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::split_embedding_codegen_lookup_adagrad_function": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::split_embedding_codegen_lookup_adagrad_function_cpu": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::split_embedding_codegen_lookup_adam_function": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adam": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::split_embedding_codegen_lookup_lamb_function": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_lamb": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::split_embedding_codegen_lookup_lars_sgd_function": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_lars": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::split_embedding_codegen_lookup_none_function": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_none": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_none_with_rowwise_adagrad": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::split_embedding_codegen_lookup_partial_rowwise_adam_function": {}, + "fbgemm::split_embedding_codegen_lookup_partial_rowwise_lamb_function": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_lamb": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { + "comment": "", + "status": "xfail" + }, + 
"SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_none_with_rowwise_adagrad": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function_cpu": {}, + "fbgemm::split_embedding_codegen_lookup_rowwise_weighted_adagrad_function": {}, + "fbgemm::split_embedding_codegen_lookup_sgd_function": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_miss_counter": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_fused_pooled_emb_quant": { + "comment": "", + "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_stb_uvm_cache_stats": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::split_embedding_codegen_lookup_sgd_function_cpu": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { + "comment": "", + "status": "xfail" + } + } + } +} diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index 7c9afcb506..641b3c1e31 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -13,7 +13,7 @@ import random import unittest from itertools import accumulate -from typing import Any, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import fbgemm_gpu import hypothesis.strategies as st @@ -73,11 +73,19 @@ if open_source: # pyre-ignore[21] - from test_utils import gpu_available, gpu_unavailable, TEST_WITH_ROCM + from test_utils import ( + gpu_available, + gpu_unavailable, + gradcheck, + optests, + TEST_WITH_ROCM, + ) else: from fbgemm_gpu.test.test_utils import ( gpu_available, gpu_unavailable, + gradcheck, + optests, TEST_WITH_ROCM, ) @@ -88,6 +96,10 @@ MAX_EXAMPLES_LONG_RUNNING = 15 +settings.register_profile("derandomize", derandomize=True) +settings.load_profile("derandomize") + + @composite # pyre-ignore def get_nbit_weights_ty(draw) -> Optional[SparseType]: @@ -144,6 +156,29 @@ def format_ref_tensors_in_mixed_B_layout( 
return torch.cat(concat_list, dim=0) +# pyre-ignore +additional_decorators: Dict[str, List[Callable]] = { + "test_schema__test_backward_none_with_rowwise_adagrad": [ + unittest.skip("Cannot access data pointer of Tensor that doesn't have storage") + ], + "test_faketensor__test_backward_none_with_rowwise_adagrad": [ + unittest.skip("Cannot access data pointer of Tensor that doesn't have storage") + ], + "test_autograd_registration__test_backward_none_with_rowwise_adagrad": [ + unittest.skip("Cannot access data pointer of Tensor that doesn't have storage") + ], + "test_faketensor__test_cache_prefetch_pipeline_stream_2": [unittest.skip("OOM")], + "test_faketensor__test_cache_prefetch_pipeline": [unittest.skip("OOM")], + "test_faketensor__test_cache_prefetch_pipeline_stream_1": [ + unittest.skip("IMA on exit") + ], + "test_faketensor__test_cache_pipeline": [ + unittest.skip("OOM when run serially"), + ], +} + + +@optests.generate_opcheck_tests(fast=True, additional_decorators=additional_decorators) class SplitTableBatchedEmbeddingsTest(unittest.TestCase): def execute_forward_( # noqa C901 self, @@ -1497,7 +1532,7 @@ def test_backward_dense( # noqa C901 offsets.requires_grad = False for param in cc.parameters(): param.requires_grad = False - torch.autograd.gradcheck( + gradcheck( cc, (indices, offsets, per_sample_weights), eps=1e-2, atol=1e-3, rtol=1e-3 ) @@ -2527,7 +2562,7 @@ def execute_backward_adagrad_( # noqa C901 offsets.requires_grad = False for param in cc.parameters(): param.requires_grad = False - torch.autograd.gradcheck( + gradcheck( cc, ( indices, @@ -3956,6 +3991,9 @@ def get_wts_from_counter_adagrad( suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], ) @unittest.skipIf(*gpu_unavailable) + @unittest.skip( + "is flaky, see https://www.internalfb.com/intern/test/281475047227145?ref_report_id=0" + ) def test_backward_optimizers_adam( # noqa C901 self, T: int, diff --git a/fbgemm_gpu/test/test_utils.py b/fbgemm_gpu/test/test_utils.py index 17d96e16ae..d45600bbb9 100644 --- a/fbgemm_gpu/test/test_utils.py +++ b/fbgemm_gpu/test/test_utils.py @@ -223,6 +223,7 @@ class optests: def generate_opcheck_tests( test_class: Optional[unittest.TestCase] = None, *, + fast: bool = False, # pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. 
additional_decorators: Optional[Dict[str, Callable]] = None, ): @@ -236,22 +237,31 @@ def decorator(test_class: unittest.TestCase) -> unittest.TestCase: from torch._utils_internal import get_file_path_2 filename = inspect.getfile(test_class) + failures_dict_name = "failures_dict.json" + if fast: + failures_dict_name = "failures_dict_fast.json" failures_dict_path = get_file_path_2( - "", os.path.dirname(filename), "failures_dict.json" + "", os.path.dirname(filename), failures_dict_name ) + tests_to_run = [ + "test_schema", + "test_autograd_registration", + "test_faketensor", + ] + if not fast: + tests_to_run.extend( + [ + "test_aot_dispatch_static", + "test_aot_dispatch_dynamic", + ] + ) optests.generate_opcheck_tests( test_class, ["fb", "fbgemm"], failures_dict_path, # pyre-ignore[6] additional_decorators, - [ - "test_schema", - "test_autograd_registration", - "test_faketensor", - "test_aot_dispatch_static", - "test_aot_dispatch_dynamic", - ], + tests_to_run, ) return test_class From 35580373bd309c534bcf7f88948227bfc69ff9d3 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 26 Oct 2023 01:52:55 -0700 Subject: [PATCH 93/94] Register fake CPU dispatch for split embeddings cache ops (#2095) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2095 - Register fake CPU dispatch for split embeddings cache ops Reviewed By: sryap Differential Revision: D50665599 fbshipit-source-id: aca450e2e501e648ad07fb0ed8738ef5ec7e6bec --- fbgemm_gpu/CMakeLists.txt | 7 +- .../src/split_embeddings_cache/common.cuh | 14 +-- .../src/split_embeddings_cache/common.h | 101 ++++++++++++++++++ .../lfu_cache_populate_byte.cpp | 32 ++++++ .../linearize_cache_indices.cpp | 29 +++++ .../lru_cache_populate_byte.cpp | 54 ++++++++++ .../src/split_embeddings_cache/lxu_cache.cpp | 35 ++++++ .../split_embeddings_cache_ops.cpp | 46 ++++---- .../split_embeddings_cache_ops.cu | 38 +++++++ 9 files changed, 315 insertions(+), 41 deletions(-) create mode 100644 fbgemm_gpu/src/split_embeddings_cache/common.h create mode 100644 fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cpp create mode 100644 fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cpp create mode 100644 fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp create mode 100644 fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp create mode 100644 fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cu diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 29141a5615..11ff4a62ff 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -574,6 +574,11 @@ set(fbgemm_gpu_sources_static_cpu src/sparse_ops/sparse_ops_cpu.cpp src/sparse_ops/sparse_ops_meta.cpp src/embedding_inplace_update_cpu.cpp + src/split_embeddings_cache/linearize_cache_indices.cpp + src/split_embeddings_cache/lfu_cache_populate_byte.cpp + src/split_embeddings_cache/lru_cache_populate_byte.cpp + src/split_embeddings_cache/lxu_cache.cpp + src/split_embeddings_cache/split_embeddings_cache_ops.cpp codegen/batch_index_select_dim0_cpu_host.cpp) if(NOT FBGEMM_CPU_ONLY) @@ -588,7 +593,7 @@ if(NOT FBGEMM_CPU_ONLY) src/quantize_ops/quantize_ops_gpu.cpp src/sparse_ops/sparse_ops_gpu.cpp src/split_embeddings_utils.cpp - src/split_embeddings_cache/split_embeddings_cache_ops.cpp + src/split_embeddings_cache/split_embeddings_cache_ops.cu src/metric_ops_host.cpp src/embedding_inplace_update_gpu.cpp src/input_combine_gpu.cpp diff --git a/fbgemm_gpu/src/split_embeddings_cache/common.cuh 
b/fbgemm_gpu/src/split_embeddings_cache/common.cuh index 7efabb8519..5a1c9c70dd 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/common.cuh +++ b/fbgemm_gpu/src/split_embeddings_cache/common.cuh @@ -8,6 +8,8 @@ #pragma once +#include "common.h" + // clang-format off #include "fbgemm_gpu/cub_namespace_prefix.cuh" #include @@ -17,10 +19,6 @@ #include "fbgemm_gpu/cub_namespace_postfix.cuh" // clang-format on -#include -#include -#include -#include #include #include #include @@ -30,15 +28,7 @@ #include #include #include -#include -#include - -#include "fbgemm_gpu/dispatch_macros.h" -#include "fbgemm_gpu/embedding_common.h" #include "fbgemm_gpu/fbgemm_cuda_utils.cuh" -#include "fbgemm_gpu/fbgemm_tensor_accessor.h" -#include "fbgemm_gpu/ops_utils.h" -#include "fbgemm_gpu/sparse_ops_utils.h" #include "fbgemm_gpu/split_embeddings_cache_cuda.cuh" #include "fbgemm_gpu/split_embeddings_utils.cuh" diff --git a/fbgemm_gpu/src/split_embeddings_cache/common.h b/fbgemm_gpu/src/split_embeddings_cache/common.h new file mode 100644 index 0000000000..9c22f02e40 --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/common.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "fbgemm_gpu/dispatch_macros.h" +#include "fbgemm_gpu/embedding_common.h" +#include "fbgemm_gpu/fbgemm_tensor_accessor.h" +#include "fbgemm_gpu/ops_utils.h" +#include "fbgemm_gpu/sparse_ops_utils.h" + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +Tensor linearize_cache_indices_cpu( + Tensor cache_hash_size_cumsum, + Tensor indices, + Tensor offsets); + +Tensor linearize_cache_indices_from_row_idx_cpu( + Tensor cache_hash_size_cumsum, + Tensor update_table_indices, + Tensor update_row_indices); + +void lru_cache_populate_byte_cpu( + Tensor weights, + Tensor cache_hash_size_cumsum, + int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + int64_t time_stamp, + Tensor lru_state, + int64_t row_alignment, + bool gather_cache_stats, + c10::optional uvm_cache_stats); + +void direct_mapped_lru_cache_populate_byte_cpu( + Tensor weights, + Tensor cache_hash_size_cumsum, + int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + int64_t time_stamp, + Tensor lru_state, + Tensor lxu_cache_miss_timestamp, + int64_t row_alignment, + bool gather_cache_stats, + c10::optional uvm_cache_stats); + +void lfu_cache_populate_byte_cpu( + Tensor weights, + Tensor cache_hash_size_cumsum, + int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + Tensor lfu_state, + int64_t row_alignment); + +Tensor lxu_cache_lookup_cpu( + Tensor linear_cache_indices, + Tensor lxu_cache_state, + int64_t invalid_index, + bool gather_cache_stats, + c10::optional uvm_cache_stats); + +Tensor direct_mapped_lxu_cache_lookup_cpu( + Tensor linear_cache_indices, + Tensor lxu_cache_state, + int64_t invalid_index, + bool 
gather_cache_stats, + c10::optional uvm_cache_stats); + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cpp b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cpp new file mode 100644 index 0000000000..090a80d76e --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "common.h" + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +namespace fbgemm_gpu { + +DLL_PUBLIC void lfu_cache_populate_byte_cpu( + Tensor weights, + Tensor cache_hash_size_cumsum, + int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + Tensor lfu_state, + int64_t row_alignment) { + return; +} + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cpp b/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cpp new file mode 100644 index 0000000000..c27e866175 --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "common.h" + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +DLL_PUBLIC Tensor linearize_cache_indices_cpu( + Tensor cache_hash_size_cumsum, + Tensor indices, + Tensor offsets) { + return at::empty_like(indices); +} + +DLL_PUBLIC Tensor linearize_cache_indices_from_row_idx_cpu( + Tensor cache_hash_size_cumsum, + Tensor update_table_indices, + Tensor update_row_indices) { + return at::empty_like(update_row_indices); +} + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp new file mode 100644 index 0000000000..a7b2bf8b51 --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
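The CPU functions declared in common.h and defined in the new .cpp files above are deliberately shallow: the populate variants return without touching the cache, and the linearize/lookup variants only allocate an output of the right shape and dtype. Registering them gives the dispatcher a CPU path for these ops, which is what fake-tensor and opcheck-style testing needs; they are not a working CPU cache. A minimal sketch of exercising one of them from Python, assuming an fbgemm_gpu build that includes this change; the tensor shapes and values below are placeholders, not a realistic cache configuration:

# Sketch only: calls the CPU dispatch registered in this patch.
import torch
import fbgemm_gpu  # noqa: F401  (loads the fbgemm operator library)

cache_hash_size_cumsum = torch.tensor([0, 100, 250], dtype=torch.int64)
indices = torch.tensor([3, 7, 150], dtype=torch.int64)
offsets = torch.tensor([0, 1, 2, 3], dtype=torch.int64)

# Previously this raised a "could not run ... with arguments from the 'CPU'
# backend" style error; with linearize_cache_indices_cpu registered it
# returns a tensor shaped like `indices` (the values are not meaningful).
out = torch.ops.fbgemm.linearize_cache_indices(
    cache_hash_size_cumsum, indices, offsets
)
print(out.shape, out.dtype)  # torch.Size([3]) torch.int64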
+ */ + +#include "common.h" + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +DLL_PUBLIC void lru_cache_populate_byte_cpu( + Tensor weights, + Tensor cache_hash_size_cumsum, + int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + int64_t time_stamp, + Tensor lru_state, + int64_t row_alignment, + bool gather_cache_stats, + c10::optional uvm_cache_stats) { + return; +} + +DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cpu( + Tensor weights, + Tensor cache_hash_size_cumsum, + int64_t total_cache_hash_size, + Tensor cache_index_table_map, + Tensor weights_offsets, + Tensor weights_tys, + Tensor D_offsets, + Tensor linear_cache_indices, + Tensor lxu_cache_state, + Tensor lxu_cache_weights, + int64_t time_stamp, + Tensor lru_state, + Tensor lxu_cache_miss_timestamp, + int64_t row_alignment, + bool gather_cache_stats, + c10::optional uvm_cache_stats) { + return; +} + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp new file mode 100644 index 0000000000..490da1fb7d --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "common.h" + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +DLL_PUBLIC Tensor lxu_cache_lookup_cpu( + Tensor linear_cache_indices, + Tensor lxu_cache_state, + int64_t invalid_index, + bool gather_cache_stats, + c10::optional uvm_cache_stats) { + return empty_like( + linear_cache_indices, linear_cache_indices.options().dtype(at::kInt)); +} + +DLL_PUBLIC Tensor direct_mapped_lxu_cache_lookup_cpu( + Tensor linear_cache_indices, + Tensor lxu_cache_state, + int64_t invalid_index, + bool gather_cache_stats, + c10::optional uvm_cache_stats) { + return empty_like( + linear_cache_indices, linear_cache_indices.options().dtype(at::kInt)); +} + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp b/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp index 31a330fe73..2565ed219c 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp +++ b/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp @@ -6,65 +6,55 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include -#include - -#include "fbgemm_gpu/sparse_ops_utils.h" -#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh" +#include "common.h" namespace { TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "linearize_cache_indices(Tensor cache_hash_size_cumsum, Tensor indices, Tensor offsets) -> Tensor"); - DISPATCH_TO_CUDA("linearize_cache_indices", linearize_cache_indices_cuda); m.def( "linearize_cache_indices_from_row_idx(Tensor cache_hash_size_cumsum, Tensor update_table_indices, Tensor update_row_indices) -> Tensor"); - DISPATCH_TO_CUDA( - "linearize_cache_indices_from_row_idx", - linearize_cache_indices_from_row_idx_cuda); m.def( "lru_cache_populate(Tensor weights, Tensor hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) 
lxu_cache_weights, int time_stamp, Tensor(c!) lru_state, bool stochastic_rounding, bool gather_cache_stats=False, Tensor(d!)? uvm_cache_stats=None, bool lock_cache_line=False, Tensor(e!)? lxu_cache_locking_counter=None) -> ()"); - DISPATCH_TO_CUDA("lru_cache_populate", lru_cache_populate_cuda); m.def( "lru_cache_populate_byte(Tensor weights, Tensor hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) lxu_cache_weights, int time_stamp, Tensor(c!) lru_state, int row_alignment=16, bool gather_cache_stats=False, Tensor(d!)? uvm_cache_stats=None) -> ()"); - DISPATCH_TO_CUDA("lru_cache_populate_byte", lru_cache_populate_byte_cuda); m.def( "direct_mapped_lru_cache_populate_byte(Tensor weights, Tensor hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) lxu_cache_weights, int time_stamp, Tensor(c!) lru_state, Tensor(d!) lxu_cache_miss_timestamp, int row_alignment=16, bool gather_cache_stats=False, Tensor(e!)? uvm_cache_stats=None) -> ()"); - DISPATCH_TO_CUDA( - "direct_mapped_lru_cache_populate_byte", - direct_mapped_lru_cache_populate_byte_cuda); m.def( "lfu_cache_populate(Tensor weights, Tensor cache_hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) lxu_cache_weights, Tensor(c!) lfu_state, bool stochastic_rounding) -> ()"); - DISPATCH_TO_CUDA("lfu_cache_populate", lfu_cache_populate_cuda); m.def( "lfu_cache_populate_byte(Tensor weights, Tensor cache_hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) lxu_cache_weights, Tensor(c!) lfu_state, int row_alignment=16) -> ()"); - DISPATCH_TO_CUDA("lfu_cache_populate_byte", lfu_cache_populate_byte_cuda); m.def( "lxu_cache_lookup(Tensor linear_cache_indices, Tensor lxu_cache_state, int invalid_index = -1, bool gather_cache_stats=False, Tensor(a!)? uvm_cache_stats=None) -> Tensor"); - DISPATCH_TO_CUDA("lxu_cache_lookup", lxu_cache_lookup_cuda); m.def( "direct_mapped_lxu_cache_lookup(Tensor linear_cache_indices, Tensor lxu_cache_state, int invalid_index = -1, bool gather_cache_stats=False, Tensor(a!)? uvm_cache_stats=None) -> Tensor"); - DISPATCH_TO_CUDA( - "direct_mapped_lxu_cache_lookup", direct_mapped_lxu_cache_lookup_cuda); m.def( "lxu_cache_flush(Tensor(a!) uvm_weights, Tensor cache_hash_size_cumsum, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, int total_D, Tensor(b!) lxu_cache_state, Tensor(c!) 
lxu_cache_weights, bool stochastic_rounding) -> ()"); - DISPATCH_TO_CUDA("lxu_cache_flush", lxu_cache_flush_cuda); m.def("lxu_cache_slot(int h_in, int C) -> int"); - DISPATCH_TO_ALL("lxu_cache_slot", host_lxu_cache_slot); m.def( "reset_weight_momentum(Tensor dev_weights, Tensor uvm_weights, Tensor lxu_cache_weights, Tensor weights_placements, Tensor weights_offsets, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor D_offsets, Tensor pruned_indices, Tensor pruned_indices_offsets, Tensor logical_table_ids, Tensor buffer_ids, Tensor cache_hash_size_cumsum, Tensor lxu_cache_state, int total_cache_hash_size) -> ()"); - DISPATCH_TO_CUDA("reset_weight_momentum", reset_weight_momentum_cuda); m.def( "lxu_cache_locking_counter_decrement(Tensor(a!) lxu_cache_locking_counter, Tensor lxu_cache_locations) -> ()"); - DISPATCH_TO_CUDA( - "lxu_cache_locking_counter_decrement", - lxu_cache_locking_counter_decrement_cuda); m.def( "lxu_cache_locations_update(Tensor(a!) lxu_cache_locations, Tensor lxu_cache_locations_new) -> ()"); - DISPATCH_TO_CUDA( - "lxu_cache_locations_update", lxu_cache_locations_update_cuda); +} + +using namespace fbgemm_gpu; + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + DISPATCH_TO_CPU("linearize_cache_indices", linearize_cache_indices_cpu); + DISPATCH_TO_CPU( + "linearize_cache_indices_from_row_idx", + linearize_cache_indices_from_row_idx_cpu); + DISPATCH_TO_CPU("lru_cache_populate_byte", lru_cache_populate_byte_cpu); + DISPATCH_TO_CPU( + "direct_mapped_lru_cache_populate_byte", + direct_mapped_lru_cache_populate_byte_cpu); + DISPATCH_TO_CPU("lfu_cache_populate_byte", lfu_cache_populate_byte_cpu); + DISPATCH_TO_CPU("lxu_cache_lookup", lxu_cache_lookup_cpu); + DISPATCH_TO_CPU( + "direct_mapped_lxu_cache_lookup", direct_mapped_lxu_cache_lookup_cpu); } } // namespace diff --git a/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cu b/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cu new file mode 100644 index 0000000000..91b4024dc9 --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
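The net effect of this file's change is a separation of concerns: the operator schemas (m.def) stay in split_embeddings_cache_ops.cpp, which is compiled in both CPU-only and GPU builds, the new CPU registrations sit next to them, and the CUDA registrations move to the new split_embeddings_cache_ops.cu shown next, which is only built when FBGEMM_CPU_ONLY is off. As a toy analogue (not FBGEMM code; the "toy_cache" namespace, kernel, and shapes are invented for illustration), the same define-once, register-per-backend structure looks like this through torch.library:

# Toy analogue of the def/impl split used in this patch.
import torch

lib = torch.library.Library("toy_cache", "DEF")
lib.define("lookup(Tensor indices, Tensor state) -> Tensor")

# CPU "fake" kernel: shape/dtype only, mirroring lxu_cache_lookup_cpu.
def lookup_cpu(indices: torch.Tensor, state: torch.Tensor) -> torch.Tensor:
    return torch.empty_like(indices, dtype=torch.int32)

lib.impl("lookup", lookup_cpu, "CPU")
# A real CUDA kernel would be registered the same way from the GPU-only
# translation unit, e.g. lib.impl("lookup", lookup_cuda, "CUDA").

indices = torch.tensor([1, 2, 3], dtype=torch.int64)
state = torch.full((4, 32), -1, dtype=torch.int64)
print(torch.ops.toy_cache.lookup(indices, state).dtype)  # torch.int32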
+ */ + +#include "common.cuh" + +namespace { + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + DISPATCH_TO_CUDA("linearize_cache_indices", linearize_cache_indices_cuda); + DISPATCH_TO_CUDA( + "linearize_cache_indices_from_row_idx", + linearize_cache_indices_from_row_idx_cuda); + DISPATCH_TO_CUDA("lru_cache_populate", lru_cache_populate_cuda); + DISPATCH_TO_CUDA("lru_cache_populate_byte", lru_cache_populate_byte_cuda); + DISPATCH_TO_CUDA( + "direct_mapped_lru_cache_populate_byte", + direct_mapped_lru_cache_populate_byte_cuda); + DISPATCH_TO_CUDA("lfu_cache_populate", lfu_cache_populate_cuda); + DISPATCH_TO_CUDA("lfu_cache_populate_byte", lfu_cache_populate_byte_cuda); + DISPATCH_TO_CUDA("lxu_cache_lookup", lxu_cache_lookup_cuda); + DISPATCH_TO_CUDA( + "direct_mapped_lxu_cache_lookup", direct_mapped_lxu_cache_lookup_cuda); + DISPATCH_TO_CUDA("lxu_cache_flush", lxu_cache_flush_cuda); + DISPATCH_TO_ALL("lxu_cache_slot", host_lxu_cache_slot); + DISPATCH_TO_CUDA("reset_weight_momentum", reset_weight_momentum_cuda); + DISPATCH_TO_CUDA( + "lxu_cache_locking_counter_decrement", + lxu_cache_locking_counter_decrement_cuda); + DISPATCH_TO_CUDA( + "lxu_cache_locations_update", lxu_cache_locations_update_cuda); +} + +} // namespace From 79f38e457182206c09d1b7ed5b296a3924df1921 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 26 Oct 2023 07:45:37 -0700 Subject: [PATCH 94/94] No-op impl_abstract if upstream PyTorch doesn't have it (#2097) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2097 Reviewed By: zou3519 Differential Revision: D50693694 fbshipit-source-id: 70b719d512a1de6f0ec49aca0fe4e794edb667d7 --- fbgemm_gpu/fbgemm_gpu/sparse_operators.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_operators.py b/fbgemm_gpu/fbgemm_gpu/sparse_operators.py index 8107ba09f2..d0e3958090 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_operators.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_operators.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import Optional, Tuple +from typing import Callable, Optional, Tuple import torch from torch import Tensor @@ -17,7 +17,20 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") -@torch.library.impl_abstract("fbgemm::permute_2D_sparse_data") +if hasattr(torch.library, "impl_abstract"): + impl_abstract = torch.library.impl_abstract +else: + # pyre-ignore + def impl_abstract(schema: str) -> Callable[[Callable], Callable]: + # no-op + # pyre-ignore + def wrapper(f: Callable) -> Callable: + return f + + return wrapper + + +@impl_abstract("fbgemm::permute_2D_sparse_data") def permute_2D_sparse_data_meta( permute: Tensor, lengths: Tensor, @@ -47,7 +60,7 @@ def permute_2D_sparse_data_meta( return permuted_lengths, permuted_indices, permuted_weights -@torch.library.impl_abstract("fbgemm::permute_1D_sparse_data") +@impl_abstract("fbgemm::permute_1D_sparse_data") def permute_1D_sparse_data_meta( permute: Tensor, lengths: Tensor, @@ -73,7 +86,7 @@ def permute_1D_sparse_data_meta( return permuted_lengths, permuted_indices, permuted_weights -@torch.library.impl_abstract("fbgemm::expand_into_jagged_permute") +@impl_abstract("fbgemm::expand_into_jagged_permute") def expand_into_jagged_permute_meta( permute: Tensor, input_offsets: Tensor,
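For context not spelled out in the commit message: torch.library.impl_abstract was added in the PyTorch 2.1 release, so without this guard importing sparse_operators.py against an older PyTorch would raise AttributeError at decoration time. The fallback branch simply hands the function back unchanged. A minimal standalone sketch of that behavior, using a hypothetical "toy::op" schema name for illustration:

# Sketch of the fallback shim on an older PyTorch that lacks
# torch.library.impl_abstract: decoration becomes a no-op instead of an error.
from typing import Callable

def impl_abstract(schema: str) -> Callable[[Callable], Callable]:
    def wrapper(f: Callable) -> Callable:
        return f
    return wrapper

@impl_abstract("toy::op")  # hypothetical schema; nothing is registered here
def toy_op_meta(x):
    return x.new_empty(x.shape)

# The decorated function is the original function object, untouched.
assert impl_abstract("toy::op")(toy_op_meta) is toy_op_meta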