From b61ae740e0ce5ece994bf54b7866bba5eddc41d7 Mon Sep 17 00:00:00 2001 From: Ivan Kobzarev Date: Fri, 3 Nov 2023 16:30:27 -0700 Subject: [PATCH 01/73] Meta functions for dynamic_shapes for block_bucketize_sparse_features, int_nbit_split_embedding_codegen_lookup_function (#2102) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2102 Main goal is to make torchrec inference modules dynamo traceable with dynamic shapes. Adding meta function for `int_nbit_split_embedding_codegen_lookup_function`, changing `int` to `SymInt` in schema. Adding meta function for `block_bucketize_sparse_features` (RW sharding) Reviewed By: ezyang Differential Revision: D50741802 Privacy Context Container: L1138451 fbshipit-source-id: d0b9788e57c4551bc6f0c3ce364723d470e70f16 --- .../embedding_forward_quantized_host_cpu.cpp | 4 +- fbgemm_gpu/fbgemm_gpu/sparse_operators.py | 148 +++++++++++++++++- .../src/merge_pooled_embeddings_gpu.cpp | 18 +-- .../test/merge_pooled_embeddings_test.py | 1 + 4 files changed, 151 insertions(+), 20 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp index 66bd594f20..075574224f 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp @@ -244,13 +244,13 @@ Tensor pruned_array_lookup_cpu( TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( - "int_nbit_split_embedding_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int total_D, int max_int2_D, int max_int4_D, int max_int8_D, int max_float16_D, int max_float32_D, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, int output_dtype=1, Tensor? lxu_cache_weights=None, Tensor? lxu_cache_locations=None, int? row_alignment = None, int? max_float8_D=0, int? fp8_exponent_bits=-1, int? fp8_exponent_bias=-1) -> Tensor"); + "int_nbit_split_embedding_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, SymInt total_D, int max_int2_D, int max_int4_D, int max_int8_D, int max_float16_D, int max_float32_D, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, int output_dtype=1, Tensor? lxu_cache_weights=None, Tensor? lxu_cache_locations=None, int? row_alignment = None, int? max_float8_D=0, int? fp8_exponent_bits=-1, int? fp8_exponent_bias=-1) -> Tensor"); DISPATCH_TO_CPU( "int_nbit_split_embedding_codegen_lookup_function", int_nbit_split_embedding_codegen_lookup_function_cpu); m.def( - "int_nbit_split_embedding_uvm_caching_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int total_D, int max_int2_D, int max_int4_D, int max_int8_D, int max_float16_D, int max_float32_D, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights=None, int output_dtype=1, Tensor? lxu_cache_weights=None, Tensor? lxu_cache_locations=None, int? row_alignment=-1, int? max_float8_D=0, int? fp8_exponent_bits=-1, int? fp8_exponent_bias=-1, Tensor? cache_hash_size_cumsum=None, int? total_cache_hash_size=-1, Tensor? cache_index_table_map=None, Tensor? lxu_cache_state=None, Tensor? 
lxu_state=None) -> Tensor"); + "int_nbit_split_embedding_uvm_caching_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, SymInt total_D, int max_int2_D, int max_int4_D, int max_int8_D, int max_float16_D, int max_float32_D, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights=None, int output_dtype=1, Tensor? lxu_cache_weights=None, Tensor? lxu_cache_locations=None, int? row_alignment=-1, int? max_float8_D=0, int? fp8_exponent_bits=-1, int? fp8_exponent_bias=-1, Tensor? cache_hash_size_cumsum=None, int? total_cache_hash_size=-1, Tensor? cache_index_table_map=None, Tensor? lxu_cache_state=None, Tensor? lxu_state=None) -> Tensor"); DISPATCH_TO_CPU( "int_nbit_split_embedding_uvm_caching_codegen_lookup_function", int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu); diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_operators.py b/fbgemm_gpu/fbgemm_gpu/sparse_operators.py index d0e3958090..bb193998fc 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_operators.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_operators.py @@ -4,9 +4,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import Callable, Optional, Tuple +from typing import Callable, List, Optional, Tuple import torch + +from fbgemm_gpu.split_embedding_configs import SparseType +from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode from torch import Tensor try: @@ -15,6 +18,12 @@ except Exception: torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:merge_pooled_embeddings") + torch.ops.load_library( + "//deeplearning/fbgemm/fbgemm_gpu:merge_pooled_embeddings_cpu" + ) + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops") + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops_cpu") if hasattr(torch.library, "impl_abstract"): @@ -104,3 +113,140 @@ def expand_into_jagged_permute_meta( ) output_permute = input_offsets.new_empty(output_size) return output_permute + + +@impl_abstract("fbgemm::int_nbit_split_embedding_codegen_lookup_function") +def int_nbit_split_embedding_codegen_lookup_function_meta( + dev_weights: torch.Tensor, + uvm_weights: torch.Tensor, + weights_placements: torch.Tensor, + weights_offsets: torch.Tensor, + weights_tys: torch.Tensor, + D_offsets: torch.Tensor, + total_D: int, + max_int2_D: int, + max_int4_D: int, + max_int8_D: int, + max_float16_D: int, + max_float32_D: int, + indices: torch.Tensor, + offsets: torch.Tensor, + pooling_mode: int, + indice_weights: Optional[torch.Tensor] = None, + output_dtype_int: Optional[int] = None, + lxu_cache_weights: Optional[torch.Tensor] = None, + lxu_cache_locations: Optional[torch.Tensor] = None, + row_alignment: Optional[int] = None, + max_float8_D: Optional[int] = None, + fp8_exponent_bits: Optional[int] = None, + fp8_exponent_bias: Optional[int] = None, +) -> torch.Tensor: + T = D_offsets.numel() - 1 + B = (offsets.size(0) - 1) // T + output_dtype = torch.float32 + torch._check( + output_dtype_int in (0, 1, 5), + lambda: f"expected output_dtype to be fp32, fp16 or bf16, got {indices.dtype}", + ) + if output_dtype_int == SparseType.FP32.value: + output_dtype = torch.float32 + elif output_dtype_int == SparseType.FP16.value: + output_dtype = torch.float16 
+ elif output_dtype_int == SparseType.BF16.value: + output_dtype = torch.bfloat16 + + if pooling_mode == PoolingMode.NONE: + # pyre-ignore + offsets_last: int = offsets[-1].item() + total_D_T: int = total_D // T + torch._check_is_size(offsets_last) + torch._check_is_size(total_D_T) + torch._check_is_size(B) + return dev_weights.new_empty( + [offsets_last, total_D_T], + dtype=output_dtype, + device=torch.device("meta"), + ) + torch._check_is_size(B) + torch._check_is_size(total_D) + return dev_weights.new_empty( + (B, total_D), + dtype=output_dtype, + device=torch.device("meta"), + ) + + +@impl_abstract("fbgemm::block_bucketize_sparse_features") +def block_bucketize_sparse_features_meta( + lengths: torch.Tensor, + indices: torch.Tensor, + bucketize_pos: bool, + sequence: bool, + block_sizes: torch.Tensor, + my_size: int, + weights: Optional[torch.Tensor] = None, + batch_size_per_feature: Optional[torch.Tensor] = None, + max_B: int = -1, +) -> Tuple[ + torch.Tensor, + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], +]: + # Output: lengths, indices, weights?, pos?, unbucketize_permute? + num_buckets = my_size + num_features = lengths.size(0) + num_values = indices.size(0) + return ( + lengths.new_empty([num_buckets * num_features]), + indices.new_empty([num_values]), + weights.new_empty(weights.shape) if weights is not None else None, + indices.new_empty([num_values]) if bucketize_pos else None, + indices.new_empty([num_values]), + ) + + +@impl_abstract("fbgemm::merge_pooled_embeddings") +def merge_pooled_embeddings( + pooled_embeddings: List[torch.Tensor], + uncat_dim_size: int, + target_device: torch.device, + cat_dim: int = 1, +) -> torch.Tensor: + if len(pooled_embeddings) == 0: + return torch.empty([], device=target_device) + torch._check_is_size(cat_dim) + torch._check(cat_dim >= 0) + torch._check(cat_dim <= 1) + total_cat_dim_size = 0 + for e in pooled_embeddings: + torch._check(e.dim() == 2) + torch._check(e.size(1 - cat_dim) == uncat_dim_size) + total_cat_dim_size += e.size(cat_dim) + torch._check_is_size(total_cat_dim_size) + e = pooled_embeddings[0] + if cat_dim == 0: + return e.new_empty( + [total_cat_dim_size, e.size(1)], + device=torch.device("meta"), + ) + + return e.new_empty( + [e.size(0), total_cat_dim_size], + device=torch.device("meta"), + ) + + +@impl_abstract("fbgemm::bounds_check_indices") +def bounds_check_indices( + rows_per_table: torch.Tensor, + indices: torch.Tensor, + offsets: torch.Tensor, + bounds_check_mode: int, + warning: torch.Tensor, + weights: Optional[torch.Tensor] = None, + B_offsets: Optional[torch.Tensor] = None, + max_B: int = -1, +) -> None: + pass diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index 0977a0d98c..305af91e17 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -642,25 +642,11 @@ Tensor sum_reduce_to_one_device( return sum_reduce_to_one(input_tensors, target_device); } -Tensor merge_pooled_embeddings_meta( - std::vector pooled_embeddings, - int64_t uncat_dim_size, - at::Device /*target_device*/, - int64_t cat_dim) { - if (pooled_embeddings.size() == 0) { - return at::empty({0}, at::TensorOptions().device("meta")); - } - - auto [output_shape, cumulative_dims, total_cat_dim] = - cat_dim_2d_output_shape(pooled_embeddings, uncat_dim_size, cat_dim); - - return at::empty(output_shape, pooled_embeddings.front().options()); -} } // namespace fbgemm_gpu
TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( - "merge_pooled_embeddings(Tensor[] pooled_embeddings, int uncat_dim_size, Device target_device, int cat_dim=1) -> Tensor"); + "merge_pooled_embeddings(Tensor[] pooled_embeddings, SymInt uncat_dim_size, Device target_device, SymInt cat_dim=1) -> Tensor"); DISPATCH_TO_CUDA( "merge_pooled_embeddings", fbgemm_gpu::merge_pooled_embeddings); m.def( @@ -669,6 +655,4 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "sum_reduce_to_one(Tensor[] input_tensors, Device target_device) -> Tensor"); DISPATCH_TO_CUDA("sum_reduce_to_one", fbgemm_gpu::sum_reduce_to_one_device); - DISPATCH_TO_META( - "merge_pooled_embeddings", fbgemm_gpu::merge_pooled_embeddings_meta); } diff --git a/fbgemm_gpu/test/merge_pooled_embeddings_test.py b/fbgemm_gpu/test/merge_pooled_embeddings_test.py index 0e4b7986e7..7f3ab26176 100644 --- a/fbgemm_gpu/test/merge_pooled_embeddings_test.py +++ b/fbgemm_gpu/test/merge_pooled_embeddings_test.py @@ -26,6 +26,7 @@ torch.ops.load_library( "//deeplearning/fbgemm/fbgemm_gpu:merge_pooled_embeddings_cpu" ) + import fbgemm_gpu.sparse_operators # noqa: F401, E402 from fbgemm_gpu.test.test_utils import gpu_unavailable open_source = False From 002d72b9d3d27480a09ac259cf3edef5d498ef39 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Mon, 6 Nov 2023 08:02:11 -0800 Subject: [PATCH 02/73] Namespace doesn't need to be followed by semicolon Summary: Auto-generated with ``` fbgs "}; // namespace" -l | sort | uniq | sed 's/fbsource.//' | xargs -n 50 sed -i 's_}; // namespace_} // namespace_' ``` Reviewed By: dmm-fb Differential Revision: D51029740 fbshipit-source-id: 177e3f6e6b0ab7e986b1147952cd5e2f59d4b1fc --- include/fbgemm/FbgemmFP16.h | 2 +- src/DirectConv.h | 2 +- test/EmbeddingSpMDM8BitTest.cc | 2 +- test/EmbeddingSpMDMNBitTest.cc | 2 +- test/EmbeddingSpMDMTest.cc | 2 +- test/EmbeddingSpMDMTestUtils.h | 2 +- test/Float16ConvertTest.cc | 2 +- test/GConvTest.cc | 2 +- test/Im2ColFusedRequantizeTest.cc | 2 +- test/PackedRequantizeAcc16Test.cc | 2 +- test/PackedRequantizeTest.cc | 2 +- test/RequantizeOnlyTest.cc | 2 +- test/RowWiseSparseAdagradFusedTest.cc | 2 +- test/SparseAdagradTest.cc | 2 +- test/TransposedRequantizeTest.cc | 2 +- test/UniConvTest.cc | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/fbgemm/FbgemmFP16.h b/include/fbgemm/FbgemmFP16.h index 3cd6af9915..4e48331239 100644 --- a/include/fbgemm/FbgemmFP16.h +++ b/include/fbgemm/FbgemmFP16.h @@ -56,4 +56,4 @@ extern template void cblas_gemm_compute( int thread_id, int num_threads); -}; // namespace fbgemm +} // namespace fbgemm diff --git a/src/DirectConv.h b/src/DirectConv.h index ded8c2c62d..e10597e759 100644 --- a/src/DirectConv.h +++ b/src/DirectConv.h @@ -224,4 +224,4 @@ CodeCache< typename DirectConvCodeGenBase::jit_micro_kernel_fp_convT> DirectConvCodeGenBase::codeCacheT_; -}; // namespace fbgemm +} // namespace fbgemm diff --git a/test/EmbeddingSpMDM8BitTest.cc b/test/EmbeddingSpMDM8BitTest.cc index 3bb172ad59..cba4c29488 100644 --- a/test/EmbeddingSpMDM8BitTest.cc +++ b/test/EmbeddingSpMDM8BitTest.cc @@ -56,7 +56,7 @@ class Fused8BitRowwiseEmbeddingLookupTest EmbeddingSpMDMWeightChoice, EmbeddingSpMDMCornerCase, EmbeddingSpMDMOutputDtypeChoice>> {}; -}; // namespace +} // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/EmbeddingSpMDMNBitTest.cc b/test/EmbeddingSpMDMNBitTest.cc index 1106289c18..873c8145da 100644 --- a/test/EmbeddingSpMDMNBitTest.cc +++ b/test/EmbeddingSpMDMNBitTest.cc @@ -59,7 +59,7 @@ class 
FusedNBitRowwiseEmbeddingLookupTest : public testing::TestWithParam> {}; -}; // namespace +} // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/EmbeddingSpMDMTest.cc b/test/EmbeddingSpMDMTest.cc index 68962cff23..8c59266ab9 100644 --- a/test/EmbeddingSpMDMTest.cc +++ b/test/EmbeddingSpMDMTest.cc @@ -62,7 +62,7 @@ class rowwiseSparseEmbeddingSpMDMTest class IndexRemapTest : public testing::TestWithParam> {}; -}; // namespace +} // namespace vector prefetch_distances = {0, 16, 1000000}; diff --git a/test/EmbeddingSpMDMTestUtils.h b/test/EmbeddingSpMDMTestUtils.h index b71b1c9916..082625462f 100644 --- a/test/EmbeddingSpMDMTestUtils.h +++ b/test/EmbeddingSpMDMTestUtils.h @@ -59,4 +59,4 @@ int CreateMappingTableForRowWiseSparsity( int num_rows, float sparsity); -}; // namespace fbgemm +} // namespace fbgemm diff --git a/test/Float16ConvertTest.cc b/test/Float16ConvertTest.cc index e8f78ec601..4408ca016f 100644 --- a/test/Float16ConvertTest.cc +++ b/test/Float16ConvertTest.cc @@ -19,7 +19,7 @@ using namespace fbgemm; namespace { class FBGemmFloat16Test : public testing::TestWithParam {}; -}; // namespace +} // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/GConvTest.cc b/test/GConvTest.cc index ce465d890e..28e6f6501e 100644 --- a/test/GConvTest.cc +++ b/test/GConvTest.cc @@ -47,7 +47,7 @@ class fbgemmGConvAcc32WithQuantGranularityTest bool, bool>> {}; class fbgemmGConvPackTest : public testing::TestWithParam {}; -}; // namespace +} // namespace // INSTANTIATE_TEST_CASE_P( // InstantiationName, diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc index c4c0e3e047..875200f09b 100644 --- a/test/Im2ColFusedRequantizeTest.cc +++ b/test/Im2ColFusedRequantizeTest.cc @@ -34,7 +34,7 @@ vector qGranularityVals{ namespace { class fbgemmIm2colTest : public testing::TestWithParam> {}; -}; // namespace +} // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc index 596dafa71c..ae0ba33d37 100644 --- a/test/PackedRequantizeAcc16Test.cc +++ b/test/PackedRequantizeAcc16Test.cc @@ -46,7 +46,7 @@ class fbgemmu8s8acc16Test : public testing::TestWithParam> {}; class fbgemmPackUnpackAcc16Test : public testing::TestWithParam> {}; -}; // namespace +} // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc index 575d881fe3..446e56840e 100644 --- a/test/PackedRequantizeTest.cc +++ b/test/PackedRequantizeTest.cc @@ -45,7 +45,7 @@ class fbgemmu8s8acc32Test : public testing::TestWithParam> {}; class fbgemmPackUnpackAcc32Test : public testing::TestWithParam> {}; -}; // namespace +} // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/RequantizeOnlyTest.cc b/test/RequantizeOnlyTest.cc index 8bf584e831..6a3302704b 100644 --- a/test/RequantizeOnlyTest.cc +++ b/test/RequantizeOnlyTest.cc @@ -33,7 +33,7 @@ class FloatRequantizeTest : public testing::TestWithParam< tuple> {}; -}; // namespace +} // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/RowWiseSparseAdagradFusedTest.cc b/test/RowWiseSparseAdagradFusedTest.cc index 0e295462ac..af2a7d6f1f 100644 --- a/test/RowWiseSparseAdagradFusedTest.cc +++ b/test/RowWiseSparseAdagradFusedTest.cc @@ -64,7 +64,7 @@ class RowWiseSparseAdagradFusedTest : public testing::TestWithParam> {}; -}; // namespace +} // namespace constexpr float DEFAULT_TOL = 1.0e-6; diff --git a/test/SparseAdagradTest.cc 
b/test/SparseAdagradTest.cc index 3f4b193b34..eab97a1440 100644 --- a/test/SparseAdagradTest.cc +++ b/test/SparseAdagradTest.cc @@ -47,7 +47,7 @@ vector prefetch_distances{0, 16, 1000000}; namespace { class SparseAdagradTest : public testing::TestWithParam> {}; -}; // namespace +} // namespace constexpr float DEFAULT_TOL = 1.0e-6; diff --git a/test/TransposedRequantizeTest.cc b/test/TransposedRequantizeTest.cc index c546c3ffd9..77bda7d2f1 100644 --- a/test/TransposedRequantizeTest.cc +++ b/test/TransposedRequantizeTest.cc @@ -33,7 +33,7 @@ class RequantizeTest : public testing::TestWithParam< tuple> {}; -}; // namespace +} // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/UniConvTest.cc b/test/UniConvTest.cc index cb21d178dc..82ee02d140 100644 --- a/test/UniConvTest.cc +++ b/test/UniConvTest.cc @@ -162,7 +162,7 @@ class UniConvQGranTest : public testing::TestWithParam< tuple> {}; -}; // namespace +} // namespace // Combine only allows at most 10 generators. INSTANTIATE_TEST_CASE_P( From 6000a5ce23e2b83b5dea8f43182f5e0251f480e1 Mon Sep 17 00:00:00 2001 From: Christopher Ponce de Leon Date: Mon, 6 Nov 2023 08:37:37 -0800 Subject: [PATCH 03/73] Revert D51029740: Namespace doesn't need to be followed by semicolon Differential Revision: D51029740 Original commit changeset: 177e3f6e6b0a Original Phabricator Diff: D51029740 fbshipit-source-id: c71ff386342902f2cfa6552d6a834ea3f2475e32 --- include/fbgemm/FbgemmFP16.h | 2 +- src/DirectConv.h | 2 +- test/EmbeddingSpMDM8BitTest.cc | 2 +- test/EmbeddingSpMDMNBitTest.cc | 2 +- test/EmbeddingSpMDMTest.cc | 2 +- test/EmbeddingSpMDMTestUtils.h | 2 +- test/Float16ConvertTest.cc | 2 +- test/GConvTest.cc | 2 +- test/Im2ColFusedRequantizeTest.cc | 2 +- test/PackedRequantizeAcc16Test.cc | 2 +- test/PackedRequantizeTest.cc | 2 +- test/RequantizeOnlyTest.cc | 2 +- test/RowWiseSparseAdagradFusedTest.cc | 2 +- test/SparseAdagradTest.cc | 2 +- test/TransposedRequantizeTest.cc | 2 +- test/UniConvTest.cc | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/fbgemm/FbgemmFP16.h b/include/fbgemm/FbgemmFP16.h index 4e48331239..3cd6af9915 100644 --- a/include/fbgemm/FbgemmFP16.h +++ b/include/fbgemm/FbgemmFP16.h @@ -56,4 +56,4 @@ extern template void cblas_gemm_compute( int thread_id, int num_threads); -} // namespace fbgemm +}; // namespace fbgemm diff --git a/src/DirectConv.h b/src/DirectConv.h index e10597e759..ded8c2c62d 100644 --- a/src/DirectConv.h +++ b/src/DirectConv.h @@ -224,4 +224,4 @@ CodeCache< typename DirectConvCodeGenBase::jit_micro_kernel_fp_convT> DirectConvCodeGenBase::codeCacheT_; -} // namespace fbgemm +}; // namespace fbgemm diff --git a/test/EmbeddingSpMDM8BitTest.cc b/test/EmbeddingSpMDM8BitTest.cc index cba4c29488..3bb172ad59 100644 --- a/test/EmbeddingSpMDM8BitTest.cc +++ b/test/EmbeddingSpMDM8BitTest.cc @@ -56,7 +56,7 @@ class Fused8BitRowwiseEmbeddingLookupTest EmbeddingSpMDMWeightChoice, EmbeddingSpMDMCornerCase, EmbeddingSpMDMOutputDtypeChoice>> {}; -} // namespace +}; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/EmbeddingSpMDMNBitTest.cc b/test/EmbeddingSpMDMNBitTest.cc index 873c8145da..1106289c18 100644 --- a/test/EmbeddingSpMDMNBitTest.cc +++ b/test/EmbeddingSpMDMNBitTest.cc @@ -59,7 +59,7 @@ class FusedNBitRowwiseEmbeddingLookupTest : public testing::TestWithParam> {}; -} // namespace +}; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/EmbeddingSpMDMTest.cc b/test/EmbeddingSpMDMTest.cc index 8c59266ab9..68962cff23 100644 --- 
a/test/EmbeddingSpMDMTest.cc +++ b/test/EmbeddingSpMDMTest.cc @@ -62,7 +62,7 @@ class rowwiseSparseEmbeddingSpMDMTest class IndexRemapTest : public testing::TestWithParam> {}; -} // namespace +}; // namespace vector prefetch_distances = {0, 16, 1000000}; diff --git a/test/EmbeddingSpMDMTestUtils.h b/test/EmbeddingSpMDMTestUtils.h index 082625462f..b71b1c9916 100644 --- a/test/EmbeddingSpMDMTestUtils.h +++ b/test/EmbeddingSpMDMTestUtils.h @@ -59,4 +59,4 @@ int CreateMappingTableForRowWiseSparsity( int num_rows, float sparsity); -} // namespace fbgemm +}; // namespace fbgemm diff --git a/test/Float16ConvertTest.cc b/test/Float16ConvertTest.cc index 4408ca016f..e8f78ec601 100644 --- a/test/Float16ConvertTest.cc +++ b/test/Float16ConvertTest.cc @@ -19,7 +19,7 @@ using namespace fbgemm; namespace { class FBGemmFloat16Test : public testing::TestWithParam {}; -} // namespace +}; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/GConvTest.cc b/test/GConvTest.cc index 28e6f6501e..ce465d890e 100644 --- a/test/GConvTest.cc +++ b/test/GConvTest.cc @@ -47,7 +47,7 @@ class fbgemmGConvAcc32WithQuantGranularityTest bool, bool>> {}; class fbgemmGConvPackTest : public testing::TestWithParam {}; -} // namespace +}; // namespace // INSTANTIATE_TEST_CASE_P( // InstantiationName, diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc index 875200f09b..c4c0e3e047 100644 --- a/test/Im2ColFusedRequantizeTest.cc +++ b/test/Im2ColFusedRequantizeTest.cc @@ -34,7 +34,7 @@ vector qGranularityVals{ namespace { class fbgemmIm2colTest : public testing::TestWithParam> {}; -} // namespace +}; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc index ae0ba33d37..596dafa71c 100644 --- a/test/PackedRequantizeAcc16Test.cc +++ b/test/PackedRequantizeAcc16Test.cc @@ -46,7 +46,7 @@ class fbgemmu8s8acc16Test : public testing::TestWithParam> {}; class fbgemmPackUnpackAcc16Test : public testing::TestWithParam> {}; -} // namespace +}; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc index 446e56840e..575d881fe3 100644 --- a/test/PackedRequantizeTest.cc +++ b/test/PackedRequantizeTest.cc @@ -45,7 +45,7 @@ class fbgemmu8s8acc32Test : public testing::TestWithParam> {}; class fbgemmPackUnpackAcc32Test : public testing::TestWithParam> {}; -} // namespace +}; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/RequantizeOnlyTest.cc b/test/RequantizeOnlyTest.cc index 6a3302704b..8bf584e831 100644 --- a/test/RequantizeOnlyTest.cc +++ b/test/RequantizeOnlyTest.cc @@ -33,7 +33,7 @@ class FloatRequantizeTest : public testing::TestWithParam< tuple> {}; -} // namespace +}; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/RowWiseSparseAdagradFusedTest.cc b/test/RowWiseSparseAdagradFusedTest.cc index af2a7d6f1f..0e295462ac 100644 --- a/test/RowWiseSparseAdagradFusedTest.cc +++ b/test/RowWiseSparseAdagradFusedTest.cc @@ -64,7 +64,7 @@ class RowWiseSparseAdagradFusedTest : public testing::TestWithParam> {}; -} // namespace +}; // namespace constexpr float DEFAULT_TOL = 1.0e-6; diff --git a/test/SparseAdagradTest.cc b/test/SparseAdagradTest.cc index eab97a1440..3f4b193b34 100644 --- a/test/SparseAdagradTest.cc +++ b/test/SparseAdagradTest.cc @@ -47,7 +47,7 @@ vector prefetch_distances{0, 16, 1000000}; namespace { class SparseAdagradTest : public testing::TestWithParam> {}; -} // 
namespace +}; // namespace constexpr float DEFAULT_TOL = 1.0e-6; diff --git a/test/TransposedRequantizeTest.cc b/test/TransposedRequantizeTest.cc index 77bda7d2f1..c546c3ffd9 100644 --- a/test/TransposedRequantizeTest.cc +++ b/test/TransposedRequantizeTest.cc @@ -33,7 +33,7 @@ class RequantizeTest : public testing::TestWithParam< tuple> {}; -} // namespace +}; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, diff --git a/test/UniConvTest.cc b/test/UniConvTest.cc index 82ee02d140..cb21d178dc 100644 --- a/test/UniConvTest.cc +++ b/test/UniConvTest.cc @@ -162,7 +162,7 @@ class UniConvQGranTest : public testing::TestWithParam< tuple> {}; -} // namespace +}; // namespace // Combine only allows at most 10 generators. INSTANTIATE_TEST_CASE_P( From 174d473218c5c6660cc06c061a997dc55773bfe8 Mon Sep 17 00:00:00 2001 From: generatedunixname89002005232357 Date: Mon, 6 Nov 2023 13:05:22 -0800 Subject: [PATCH 04/73] Revert D50941763: Multisect successfully blamed "D50941763: [fbgemm_gpu] Remove test_aot_dispatch_static* tests from opcheck tests" for test or build failures (#2115) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2115 This diff is reverting D50941763. D50941763 ([fbgemm_gpu] Remove test_aot_dispatch_static* tests from opcheck tests, by zou3519) has been identified as causing the following test or build failures: Tests affected: - [deeplearning/fbgemm/fbgemm_gpu:sparse_ops_test - test_aot_dispatch_static__test_permute_indices (deeplearning.fbgemm.fbgemm_gpu.test.sparse_ops_test.SparseOpsTest)](https://www.internalfb.com/intern/test/844425044733284/) Here's the Multisect link: https://www.internalfb.com/multisect/3468682 Here are the tasks that are relevant to this breakage: We're generating a revert to back out the changes in this diff; please note that the backout may land if someone accepts it. If you believe this diff has been generated in error you may Commandeer and Abandon it.
Reviewed By: q10 Differential Revision: D51011831 fbshipit-source-id: e61dda526a7d915eb36bf6a6be8fb8583410a9a6 --- fbgemm_gpu/test/failures_dict.json | 208 +++++++++++++++++++++++++++++ fbgemm_gpu/test/test_utils.py | 1 + 2 files changed, 209 insertions(+) diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index 8e53be56ed..fe762b4450 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -10,6 +10,10 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_batch_index_select_dim0": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_autograd_registration__test_batch_index_select_dim0": { "comment": "", "status": "xfail" @@ -33,6 +37,18 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_block_bucketize_sparse_features": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_block_bucketize_sparse_features_long_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_block_bucketize_sparse_features_with_variable_batch_sizes": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_block_bucketize_sparse_features": { "comment": "", "status": "xfail" @@ -51,6 +67,10 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_bottom_unique_k_per_row": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_bottom_unique_k_per_row": { "comment": "", "status": "xfail" @@ -61,6 +81,10 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_bucketize_sparse_features": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_bucketize_sparse_features": { "comment": "", "status": "xfail" @@ -71,6 +95,10 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_cat_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_cat_reorder_batched_ad_indices_cpu": { "comment": "", "status": "xfail" @@ -93,6 +121,22 @@ "comment": "", "status": "xfail" }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_meta_backend": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_opt": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_opt_large_batch": { + "comment": "", + "status": "xfail" + }, "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged": { "comment": "", "status": "xfail" @@ -120,6 +164,14 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_generic_histogram_binning_calibration_by_feature": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_generic_histogram_binning_calibration_by_feature_cpu_gpu": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_generic_histogram_binning_calibration_by_feature": { "comment": "", "status": "xfail" @@ -144,6 +196,10 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_histogram_binning_calibration": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_histogram_binning_calibration": { "comment": "", 
"status": "xfail" @@ -154,6 +210,10 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_histogram_binning_calibration_by_feature": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_histogram_binning_calibration_by_feature": { "comment": "", "status": "xfail" @@ -164,6 +224,10 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_invert_permute": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_invert_permute": { "comment": "", "status": "xfail" @@ -175,6 +239,10 @@ "comment": "", "status": "xfail" }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_1d_to_truncated_values": { + "comment": "", + "status": "xfail" + }, "JaggedTensorOpsTest.test_faketensor__test_jagged_1d_to_truncated_values": { "comment": "", "status": "xfail" @@ -188,6 +256,10 @@ "comment": "", "status": "xfail" }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + }, "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary": { "comment": "", "status": "xfail" @@ -202,6 +274,14 @@ "comment": "", "status": "xfail" }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary_opt": { + "comment": "", + "status": "xfail" + }, "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary": { "comment": "", "status": "xfail" @@ -221,6 +301,10 @@ "comment": "", "status": "xfail" }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + }, "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices": { "comment": "", "status": "xfail" @@ -239,6 +323,18 @@ "comment": "", "status": "xfail" }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_index_select_2d": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_index_select_2d_in_inference": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + }, "JaggedTensorOpsTest.test_faketensor__test_jagged_index_select_2d": { "comment": "", "status": "xfail" @@ -258,6 +354,10 @@ "comment": "", "status": "xfail" }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_slice": { + "comment": "", + "status": "xfail" + }, "JaggedTensorOpsTest.test_faketensor__test_jagged_slice": { "comment": "", "status": "xfail" @@ -268,6 +368,10 @@ "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_to_padded_dense": { "comment": "seems nondeterministic, but error is real", "status": "skip" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_to_padded_dense": { + "comment": "seems nondeterministic but error is real", + "status": "skip" } }, "fbgemm::jagged_unique_indices": { @@ -283,6 +387,18 @@ "comment": "", "status": "xfail" }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices_empty": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices_multi_keys": { + "comment": "", + "status": "xfail" + }, 
"JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices": { "comment": "", "status": "xfail" @@ -301,6 +417,10 @@ "comment": "", "status": "xfail" }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_keyed_jagged_index_select_dim1": { + "comment": "", + "status": "xfail" + }, "JaggedTensorOpsTest.test_autograd_registration__test_keyed_jagged_index_select_dim1": { "comment": "", "status": "xfail" @@ -315,6 +435,10 @@ "comment": "", "status": "xfail" }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_masked_select_jagged_1d": { + "comment": "", + "status": "xfail" + }, "JaggedTensorOpsTest.test_faketensor__test_masked_select_jagged_1d": { "comment": "", "status": "xfail" @@ -336,12 +460,32 @@ "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_stacked_jagged_2d_to_dense": { "comment": "", "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_1d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_2d_to_dense_truncation": { + "comment": "", + "status": "xfail" + }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" } }, "fbgemm::pack_segments": { "SparseOpsTest.test_aot_dispatch_dynamic__test_pack_segments": { "comment": "", "status": "xsuccess" + }, + "SparseOpsTest.test_aot_dispatch_static__test_pack_segments": { + "comment": "", + "status": "xsuccess" } }, "fbgemm::permute102_baddbmm_permute102": { @@ -349,18 +493,38 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_permute102_baddbmm_permute102": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_permute102_baddbmm_permute102": { "comment": "", "status": "xfail" } }, "fbgemm::permute_1D_sparse_data": { + "SparseOpsTest.test_aot_dispatch_static__test_permute_indices": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_schema__test_permute_indices": { "comment": "flaky", "status": "skip" } }, "fbgemm::permute_2D_sparse_data": { + "SparseOpsTest.test_aot_dispatch_static__test_permute_embeddings": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_permute_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_permute_indices_with_repeats": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_schema__test_permute_indices": { "comment": "flaky", "status": "skip" @@ -371,6 +535,10 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_permute_embeddings": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_permute_embeddings": { "comment": "", "status": "xfail" @@ -385,6 +553,14 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_reorder_batched_ad_indices": { "comment": "", "status": "xfail" @@ -415,6 +591,26 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_cat_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, + 
"SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_indices": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_indices_cpu": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_lengths": { + "comment": "", + "status": "xfail" + }, + "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_lengths_cpu": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_cat_reorder_batched_ad_indices_cpu": { "comment": "", "status": "xfail" @@ -441,6 +637,10 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_static__test_segment_sum_csr": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_segment_sum_csr": { "comment": "", "status": "xfail" @@ -451,6 +651,10 @@ "comment": "", "status": "xfail" }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_1d_to_dense": { + "comment": "", + "status": "xfail" + }, "JaggedTensorOpsTest.test_faketensor__test_stacked_jagged_1d_to_dense": { "comment": "", "status": "xfail" @@ -461,6 +665,10 @@ "comment": "", "status": "xfail" }, + "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_2d_to_dense": { + "comment": "", + "status": "xfail" + }, "JaggedTensorOpsTest.test_autograd_registration__test_stacked_jagged_2d_to_dense": { "comment": "", "status": "xfail" diff --git a/fbgemm_gpu/test/test_utils.py b/fbgemm_gpu/test/test_utils.py index 7f3ad9ae38..275eb2ae8a 100644 --- a/fbgemm_gpu/test/test_utils.py +++ b/fbgemm_gpu/test/test_utils.py @@ -257,6 +257,7 @@ def decorator(test_class: unittest.TestCase) -> unittest.TestCase: if not fast: tests_to_run.extend( [ + "test_aot_dispatch_static", "test_aot_dispatch_dynamic", ] ) From 80990a6c4d6e7d6373cbbfa0802807a50c2b506d Mon Sep 17 00:00:00 2001 From: Tran Le Date: Mon, 6 Nov 2023 14:33:03 -0800 Subject: [PATCH 05/73] Add BF16 support for reorder_batched_ad_indices (#2116) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2116 We use `reorder_batched_ad_indices` to [rebatch id_score_list weights](https://www.internalfb.com/code/fbsource/[e3bbe1eaf65e]/fbcode/caffe2/caffe2/fb/predictor/rebatch/GPURebatchUtils.cpp?lines=305) which is quantized to BFloat 16. However, BFloat16 is currently not supported in `reorder_batched_ad_indices`, see error trace: P868895010 This diff adds this support for BFloat16 dtype. 
Reviewed By: YazhiGao Differential Revision: D50817983 fbshipit-source-id: 4949acac8d1524dc10c7931e28bdfcabd2e94477 --- fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu | 3 ++- fbgemm_gpu/test/sparse_ops_test.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu b/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu index 5243876a1b..dc55104229 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu @@ -243,7 +243,8 @@ DLL_PUBLIC Tensor reorder_batched_ad_indices_gpu( const dim3 threads(32, 32); const dim3 blocks((B * T + 32 - 1) / 32); - AT_DISPATCH_ALL_TYPES( + AT_DISPATCH_ALL_TYPES_AND( + at::ScalarType::BFloat16, cat_ad_indices.scalar_type(), "reorder_batched_ad_indices_gpu_kernel_1", [&] { diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index f3cbb5dbf1..558f3f4990 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -1114,7 +1114,7 @@ def test_reorder_batched_ad_lengths_cpu( T=st.integers(min_value=1, max_value=20), L=st.integers(min_value=2, max_value=20), A=st.integers(min_value=1, max_value=20), - Dtype=st.sampled_from([torch.int32, torch.float, torch.int64]), + Dtype=st.sampled_from([torch.int32, torch.float, torch.int64, torch.bfloat16]), Itype=st.sampled_from([torch.int32, torch.int64]), broadcast_indices=st.booleans(), ) From 21d0c953eece69eadf6b4fd22334f7ee62b090c5 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Mon, 6 Nov 2023 22:45:48 -0800 Subject: [PATCH 06/73] Fix OSS issues with D50741802 (#2117) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2117 - Migrate the merge pooled embedding operator definitions to the cpu source file to avoid undefined ops errors when building the CPU-only variant of OSS FBGEMM_GPU Reviewed By: sryap Differential Revision: D51040487 fbshipit-source-id: 20961b13e0ba10000e693267fb8324161cd62831 --- fbgemm_gpu/CMakeLists.txt | 4 ++-- fbgemm_gpu/fbgemm_gpu/__init__.py | 2 +- .../{sparse_operators.py => sparse_ops.py} | 1 - .../merge_pooled_embedding_ops_cpu.cpp} | 17 +++++++++++++---- .../merge_pooled_embedding_ops_gpu.cpp} | 10 ++-------- fbgemm_gpu/test/jagged_tensor_ops_test.py | 2 +- fbgemm_gpu/test/merge_pooled_embeddings_test.py | 2 +- fbgemm_gpu/test/sparse_ops_test.py | 2 +- 8 files changed, 21 insertions(+), 19 deletions(-) rename fbgemm_gpu/fbgemm_gpu/{sparse_operators.py => sparse_ops.py} (99%) rename fbgemm_gpu/src/{merge_pooled_embeddings_cpu.cpp => merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp} (71%) rename fbgemm_gpu/src/{merge_pooled_embeddings_gpu.cpp => merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp} (98%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 076961a827..fd2dbfbc82 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -561,6 +561,7 @@ set(fbgemm_gpu_sources_static_cpu codegen/embedding_forward_quantized_host_cpu.cpp codegen/embedding_backward_dense_host_cpu.cpp codegen/embedding_bounds_check_host_cpu.cpp + src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp @@ -607,8 +608,7 @@ if(NOT FBGEMM_CPU_ONLY) if(NVML_LIB_PATH OR USE_ROCM) message(STATUS "Adding 
merge_pooled_embeddings sources") list(APPEND fbgemm_gpu_sources_static_cpu - src/merge_pooled_embeddings_cpu.cpp - src/merge_pooled_embeddings_gpu.cpp + src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp src/topology_utils.cpp) else() message(STATUS "Skipping merge_pooled_embeddings sources") diff --git a/fbgemm_gpu/fbgemm_gpu/__init__.py b/fbgemm_gpu/fbgemm_gpu/__init__.py index f63fde22d2..3ca97566b7 100644 --- a/fbgemm_gpu/fbgemm_gpu/__init__.py +++ b/fbgemm_gpu/fbgemm_gpu/__init__.py @@ -20,7 +20,7 @@ # Re-export docs # Trigger meta registrations -from . import _fbgemm_gpu_docs, sparse_operators # noqa: F401, E402 # noqa: F401, E402 +from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 # Re-export the version string from the auto-generated version file from ._fbgemm_gpu_version import __version__ # noqa: F401, E402 diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_operators.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py similarity index 99% rename from fbgemm_gpu/fbgemm_gpu/sparse_operators.py rename to fbgemm_gpu/fbgemm_gpu/sparse_ops.py index bb193998fc..42b0079b2c 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_operators.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py @@ -7,7 +7,6 @@ from typing import Callable, List, Optional, Tuple import torch - from fbgemm_gpu.split_embedding_configs import SparseType from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode from torch import Tensor diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_cpu.cpp b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp similarity index 71% rename from fbgemm_gpu/src/merge_pooled_embeddings_cpu.cpp rename to fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp index f3db379f93..cd974d132b 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_cpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp @@ -10,7 +10,7 @@ #include #include #include -#include "fbgemm_gpu/sparse_ops_utils.h" +#include "fbgemm_gpu/ops_utils.h" using Tensor = at::Tensor; @@ -49,7 +49,16 @@ Tensor merge_pooled_embeddings_cpu( } // namespace fbgemm_gpu -TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { - DISPATCH_TO_CPU( - "merge_pooled_embeddings", fbgemm_gpu::merge_pooled_embeddings_cpu); +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + m.def( + "merge_pooled_embeddings(Tensor[] pooled_embeddings, SymInt uncat_dim_size, Device target_device, SymInt cat_dim=1) -> Tensor"); + m.def( + "all_to_one_device(Tensor[] input_tensors, Device target_device) -> Tensor[]"); + m.def( + "sum_reduce_to_one(Tensor[] input_tensors, Device target_device) -> Tensor"); } + +FBGEMM_OP_DISPATCH( + CPU, + "merge_pooled_embeddings", + fbgemm_gpu::merge_pooled_embeddings_cpu); diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp similarity index 98% rename from fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp rename to fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp index 305af91e17..37d6c5a440 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp @@ -11,7 +11,7 @@ #include #include #include -#include + #include #include #include @@ -19,8 +19,8 @@ #include #include #include - #include "fbgemm_gpu/merge_pooled_embeddings.h" + #include "fbgemm_gpu/sparse_ops_utils.h" #include "fbgemm_gpu/topology_utils.h" @@ -645,14 +645,8 @@ Tensor sum_reduce_to_one_device( } // 
namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { - m.def( - "merge_pooled_embeddings(Tensor[] pooled_embeddings, SymInt uncat_dim_size, Device target_device, SymInt cat_dim=1) -> Tensor"); DISPATCH_TO_CUDA( "merge_pooled_embeddings", fbgemm_gpu::merge_pooled_embeddings); - m.def( - "all_to_one_device(Tensor[] input_tensors, Device target_device) -> Tensor[]"); DISPATCH_TO_CUDA("all_to_one_device", fbgemm_gpu::all_to_one_device); - m.def( - "sum_reduce_to_one(Tensor[] input_tensors, Device target_device) -> Tensor"); DISPATCH_TO_CUDA("sum_reduce_to_one", fbgemm_gpu::sum_reduce_to_one_device); } diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index f83bd941bc..62a1ffa1cf 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -35,7 +35,7 @@ except Exception: torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") - import fbgemm_gpu.sparse_operators # noqa: F401, E402 + import fbgemm_gpu.sparse_ops # noqa: F401, E402 from fbgemm_gpu.test.test_utils import ( gpu_available, gpu_unavailable, diff --git a/fbgemm_gpu/test/merge_pooled_embeddings_test.py b/fbgemm_gpu/test/merge_pooled_embeddings_test.py index 7f3ab26176..78d62fb416 100644 --- a/fbgemm_gpu/test/merge_pooled_embeddings_test.py +++ b/fbgemm_gpu/test/merge_pooled_embeddings_test.py @@ -26,7 +26,7 @@ torch.ops.load_library( "//deeplearning/fbgemm/fbgemm_gpu:merge_pooled_embeddings_cpu" ) - import fbgemm_gpu.sparse_operators # noqa: F401, E402 + import fbgemm_gpu.sparse_ops # noqa: F401, E402 from fbgemm_gpu.test.test_utils import gpu_unavailable open_source = False diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index 558f3f4990..b9f486fc10 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -38,7 +38,7 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:index_select_ops") - import fbgemm_gpu.sparse_operators # noqa: F401, E402 + import fbgemm_gpu.sparse_ops # noqa: F401, E402 from fbgemm_gpu.test.test_utils import gpu_available, gpu_unavailable, skipIfRocm suppressed_list: List[HealthCheck] = ( From b6bdf047abea0f86c38208a35b9aceff9b694cfe Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 7 Nov 2023 07:58:38 -0800 Subject: [PATCH 07/73] Update impl_abstract_pystub to be less boilerplatey Summary: X-link: https://github.com/pytorch/pytorch/pull/112851 We've made the following changes: - The new way to use the API is `m.impl_abstract_pystub(module, context)`. Every subsequent m.def of an op inside the TORCH_LIBRARY block gives the op the `impl_abstract_pystub`. - Added a mechanism to determine if an operator was defined in Python or C++. Library.define in Python appends the op to a global set, which is analogous to what we do for tracking Library.impl. - If someone does `torch.library.impl_abstract` in Python for an operator, then we require that it has an `impl_abstract_pystub` specified and we also check that the module in the `impl_abstract_pystub` is the same as the module where the call to `torch.library.impl_abstract` exists. - Unfortunately we can't check the "context" (which is the buck target on buck-based systems) because buck sits above us. 
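For illustration, a sketch of the Python half of this contract, modeled on this repo's fbgemm_gpu/sparse_ops.py (the exact torch.library surface depends on the PyTorch version, so treat this as an assumption rather than the authoritative API):

```python
# fbgemm_gpu/sparse_ops.py -- the module that the C++ side names via
# m.impl_abstract_pystub("fbgemm_gpu.sparse_ops",
#                        "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py").
from typing import List

import torch


# With this change, PyTorch verifies that the module containing this
# registration matches the module recorded by impl_abstract_pystub in C++.
@torch.library.impl_abstract("fbgemm::merge_pooled_embeddings")
def merge_pooled_embeddings_meta(
    pooled_embeddings: List[torch.Tensor],
    uncat_dim_size: int,
    target_device: torch.device,
    cat_dim: int = 1,
) -> torch.Tensor:
    # Output-shape logic elided; see the meta function earlier in this series.
    ...
```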
bypass-github-export-checks Reviewed By: ezyang Differential Revision: D50972148 fbshipit-source-id: 34ab31493d9bccd0d351b463cfb508ba0b05eef4 --- .../merge_pooled_embedding_ops_cpu.cpp | 5 +++++ fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp index cd974d132b..db3ee9b35a 100644 --- a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp @@ -50,6 +50,11 @@ Tensor merge_pooled_embeddings_cpu( } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { +#ifdef HAS_IMPL_ABSTRACT_PYSTUB + m.impl_abstract_pystub( + "fbgemm_gpu.sparse_ops", + "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py"); +#endif m.def( "merge_pooled_embeddings(Tensor[] pooled_embeddings, SymInt uncat_dim_size, Device target_device, SymInt cat_dim=1) -> Tensor"); m.def( diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 7d2a98d5cd..54e8b205e8 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2686,6 +2686,11 @@ Tensor bottom_k_per_row( } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { +#ifdef HAS_IMPL_ABSTRACT_PYSTUB + m.impl_abstract_pystub( + "fbgemm_gpu.sparse_ops", + "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py"); +#endif m.def( "permute_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def( From b8f6beb785038e5ee9180010273aa5419e2a3cf9 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 7 Nov 2023 08:05:24 -0800 Subject: [PATCH 08/73] Remove test_aot_dispatch_static* tests from opcheck tests (#2118) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2118 We missed a line in the original diff. After Ed's D50886564, test_aot_dispatch_static and test_aot_dispatch_dynamic are redundant, so we're just going to go with one of them to reduce complexity and overall test times. 
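For reference, a rough sketch of the resulting opcheck test selection (variable names follow the test_utils.py hunk below; the surrounding harness code is assumed):

```python
# After this change only the dynamic AOT-dispatch variant is generated;
# per D50886564 the static variant checks the same property.
tests_to_run = [
    "test_schema",
    "test_autograd_registration",
    "test_faketensor",
]
if not fast:
    tests_to_run.extend(["test_aot_dispatch_dynamic"])
```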
Reviewed By: williamwen42 Differential Revision: D51051732 fbshipit-source-id: 8be518dd562264f5933eb2cf01d58db425af4204 --- fbgemm_gpu/test/failures_dict.json | 208 ----------------------------- fbgemm_gpu/test/sparse_ops_test.py | 1 - fbgemm_gpu/test/test_utils.py | 1 - 3 files changed, 210 deletions(-) diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index fe762b4450..8e53be56ed 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -10,10 +10,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_batch_index_select_dim0": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_autograd_registration__test_batch_index_select_dim0": { "comment": "", "status": "xfail" @@ -37,18 +33,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_block_bucketize_sparse_features": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_aot_dispatch_static__test_block_bucketize_sparse_features_long_indices": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_aot_dispatch_static__test_block_bucketize_sparse_features_with_variable_batch_sizes": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_faketensor__test_block_bucketize_sparse_features": { "comment": "", "status": "xfail" @@ -67,10 +51,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_bottom_unique_k_per_row": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_faketensor__test_bottom_unique_k_per_row": { "comment": "", "status": "xfail" @@ -81,10 +61,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_bucketize_sparse_features": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_faketensor__test_bucketize_sparse_features": { "comment": "", "status": "xfail" @@ -95,10 +71,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_cat_reorder_batched_ad_indices_cpu": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_faketensor__test_cat_reorder_batched_ad_indices_cpu": { "comment": "", "status": "xfail" @@ -121,22 +93,6 @@ "comment": "", "status": "xfail" }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_meta_backend": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_opt": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_dense_to_jagged_opt_large_batch": { - "comment": "", - "status": "xfail" - }, "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged": { "comment": "", "status": "xfail" @@ -164,14 +120,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_generic_histogram_binning_calibration_by_feature": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_aot_dispatch_static__test_generic_histogram_binning_calibration_by_feature_cpu_gpu": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_faketensor__test_generic_histogram_binning_calibration_by_feature": { "comment": "", "status": "xfail" @@ -196,10 +144,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_histogram_binning_calibration": { - "comment": "", - "status": "xfail" - }, 
"SparseOpsTest.test_faketensor__test_histogram_binning_calibration": { "comment": "", "status": "xfail" @@ -210,10 +154,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_histogram_binning_calibration_by_feature": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_faketensor__test_histogram_binning_calibration_by_feature": { "comment": "", "status": "xfail" @@ -224,10 +164,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_invert_permute": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_faketensor__test_invert_permute": { "comment": "", "status": "xfail" @@ -239,10 +175,6 @@ "comment": "", "status": "xfail" }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_1d_to_truncated_values": { - "comment": "", - "status": "xfail" - }, "JaggedTensorOpsTest.test_faketensor__test_jagged_1d_to_truncated_values": { "comment": "", "status": "xfail" @@ -256,10 +188,6 @@ "comment": "", "status": "xfail" }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary": { - "comment": "", - "status": "xfail" - }, "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary": { "comment": "", "status": "xfail" @@ -274,14 +202,6 @@ "comment": "", "status": "xfail" }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_elementwise_binary_opt": { - "comment": "", - "status": "xfail" - }, "JaggedTensorOpsTest.test_autograd_registration__test_jagged_elementwise_binary": { "comment": "", "status": "xfail" @@ -301,10 +221,6 @@ "comment": "", "status": "xfail" }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices": { - "comment": "", - "status": "xfail" - }, "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices": { "comment": "", "status": "xfail" @@ -323,18 +239,6 @@ "comment": "", "status": "xfail" }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_index_select_2d": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_index_select_2d_in_inference": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_keyed_jagged_index_select_dim1": { - "comment": "", - "status": "xfail" - }, "JaggedTensorOpsTest.test_faketensor__test_jagged_index_select_2d": { "comment": "", "status": "xfail" @@ -354,10 +258,6 @@ "comment": "", "status": "xfail" }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_slice": { - "comment": "", - "status": "xfail" - }, "JaggedTensorOpsTest.test_faketensor__test_jagged_slice": { "comment": "", "status": "xfail" @@ -368,10 +268,6 @@ "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_to_padded_dense": { "comment": "seems nondeterministic, but error is real", "status": "skip" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_to_padded_dense": { - "comment": "seems nondeterministic but error is real", - "status": "skip" } }, "fbgemm::jagged_unique_indices": { @@ -387,18 +283,6 @@ "comment": "", "status": "xfail" }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices_empty": { - "comment": "", - "status": "xfail" - }, - 
"JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_unique_indices_multi_keys": { - "comment": "", - "status": "xfail" - }, "JaggedTensorOpsTest.test_faketensor__test_jagged_unique_indices": { "comment": "", "status": "xfail" @@ -417,10 +301,6 @@ "comment": "", "status": "xfail" }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_keyed_jagged_index_select_dim1": { - "comment": "", - "status": "xfail" - }, "JaggedTensorOpsTest.test_autograd_registration__test_keyed_jagged_index_select_dim1": { "comment": "", "status": "xfail" @@ -435,10 +315,6 @@ "comment": "", "status": "xfail" }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_masked_select_jagged_1d": { - "comment": "", - "status": "xfail" - }, "JaggedTensorOpsTest.test_faketensor__test_masked_select_jagged_1d": { "comment": "", "status": "xfail" @@ -460,32 +336,12 @@ "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_stacked_jagged_2d_to_dense": { "comment": "", "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_1d_to_dense": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_2d_to_dense": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_jagged_2d_to_dense_truncation": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_2d_to_dense": { - "comment": "", - "status": "xfail" } }, "fbgemm::pack_segments": { "SparseOpsTest.test_aot_dispatch_dynamic__test_pack_segments": { "comment": "", "status": "xsuccess" - }, - "SparseOpsTest.test_aot_dispatch_static__test_pack_segments": { - "comment": "", - "status": "xsuccess" } }, "fbgemm::permute102_baddbmm_permute102": { @@ -493,38 +349,18 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_permute102_baddbmm_permute102": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_faketensor__test_permute102_baddbmm_permute102": { "comment": "", "status": "xfail" } }, "fbgemm::permute_1D_sparse_data": { - "SparseOpsTest.test_aot_dispatch_static__test_permute_indices": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_schema__test_permute_indices": { "comment": "flaky", "status": "skip" } }, "fbgemm::permute_2D_sparse_data": { - "SparseOpsTest.test_aot_dispatch_static__test_permute_embeddings": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_aot_dispatch_static__test_permute_indices": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_aot_dispatch_static__test_permute_indices_with_repeats": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_schema__test_permute_indices": { "comment": "flaky", "status": "skip" @@ -535,10 +371,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_permute_embeddings": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_faketensor__test_permute_embeddings": { "comment": "", "status": "xfail" @@ -553,14 +385,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_indices": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_indices_cpu": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_faketensor__test_reorder_batched_ad_indices": { "comment": "", "status": "xfail" @@ -591,26 +415,6 @@ "comment": "", "status": "xfail" }, - 
"SparseOpsTest.test_aot_dispatch_static__test_cat_reorder_batched_ad_indices_cpu": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_indices": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_indices_cpu": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_lengths": { - "comment": "", - "status": "xfail" - }, - "SparseOpsTest.test_aot_dispatch_static__test_reorder_batched_ad_lengths_cpu": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_faketensor__test_cat_reorder_batched_ad_indices_cpu": { "comment": "", "status": "xfail" @@ -637,10 +441,6 @@ "comment": "", "status": "xfail" }, - "SparseOpsTest.test_aot_dispatch_static__test_segment_sum_csr": { - "comment": "", - "status": "xfail" - }, "SparseOpsTest.test_faketensor__test_segment_sum_csr": { "comment": "", "status": "xfail" @@ -651,10 +451,6 @@ "comment": "", "status": "xfail" }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_1d_to_dense": { - "comment": "", - "status": "xfail" - }, "JaggedTensorOpsTest.test_faketensor__test_stacked_jagged_1d_to_dense": { "comment": "", "status": "xfail" @@ -665,10 +461,6 @@ "comment": "", "status": "xfail" }, - "JaggedTensorOpsTest.test_aot_dispatch_static__test_stacked_jagged_2d_to_dense": { - "comment": "", - "status": "xfail" - }, "JaggedTensorOpsTest.test_autograd_registration__test_stacked_jagged_2d_to_dense": { "comment": "", "status": "xfail" diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index b9f486fc10..2fe6aa80aa 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -2436,7 +2436,6 @@ def validate( "test_schema", "test_autograd_registration", "test_faketensor", - "test_aot_dispatch_static", "test_aot_dispatch_dynamic", ], ) diff --git a/fbgemm_gpu/test/test_utils.py b/fbgemm_gpu/test/test_utils.py index 275eb2ae8a..7f3ad9ae38 100644 --- a/fbgemm_gpu/test/test_utils.py +++ b/fbgemm_gpu/test/test_utils.py @@ -257,7 +257,6 @@ def decorator(test_class: unittest.TestCase) -> unittest.TestCase: if not fast: tests_to_run.extend( [ - "test_aot_dispatch_static", "test_aot_dispatch_dynamic", ] ) From 308ffdc6bf17c10f45fc278b2ba8bf7b27449a6b Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 7 Nov 2023 10:46:02 -0800 Subject: [PATCH 09/73] Revert D50972148: Update impl_abstract_pystub to be less boilerplatey Differential Revision: D50972148 Original commit changeset: 34ab31493d9b Original Phabricator Diff: D50972148 fbshipit-source-id: 102516e7d0defe2049d4758ef54f32fd1a4a499b --- .../merge_pooled_embedding_ops_cpu.cpp | 5 ----- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 5 ----- 2 files changed, 10 deletions(-) diff --git a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp index db3ee9b35a..cd974d132b 100644 --- a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp @@ -50,11 +50,6 @@ Tensor merge_pooled_embeddings_cpu( } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { -#ifdef HAS_IMPL_ABSTRACT_PYSTUB - m.impl_abstract_pystub( - "fbgemm_gpu.sparse_ops", - "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py"); -#endif m.def( "merge_pooled_embeddings(Tensor[] pooled_embeddings, SymInt 
uncat_dim_size, Device target_device, SymInt cat_dim=1) -> Tensor"); m.def( diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 54e8b205e8..7d2a98d5cd 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2686,11 +2686,6 @@ Tensor bottom_k_per_row( } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { -#ifdef HAS_IMPL_ABSTRACT_PYSTUB - m.impl_abstract_pystub( - "fbgemm_gpu.sparse_ops", - "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py"); -#endif m.def( "permute_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def( From 0457bb7bcb0640cd99476231fac952c8c21b335f Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 7 Nov 2023 14:18:44 -0800 Subject: [PATCH 10/73] Derandomize all tests in fbgemm_gpu Reviewed By: zou3519 Differential Revision: D50895307 fbshipit-source-id: a5c0709fd1e8d96c38f446e1c922bb2ff54909d0 --- fbgemm_gpu/test/test_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fbgemm_gpu/test/test_utils.py b/fbgemm_gpu/test/test_utils.py index 7f3ad9ae38..cd5dca4105 100644 --- a/fbgemm_gpu/test/test_utils.py +++ b/fbgemm_gpu/test/test_utils.py @@ -18,6 +18,11 @@ import hypothesis.strategies as st import numpy as np import torch +from hypothesis import settings + + +settings.register_profile("derandomize", derandomize=True) +settings.load_profile("derandomize") TEST_WITH_ROCM: bool = os.getenv("FBGEMM_TEST_WITH_ROCM", "0") == "1" From 0def4d8f19735caad93d36afa489f1988c2993ac Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 7 Nov 2023 16:29:47 -0800 Subject: [PATCH 11/73] Update impl_abstract_pystub to be less boilerplatey Summary: X-link: https://github.com/pytorch/pytorch/pull/113182 We've made the following changes: - The new way to use the API is `m.impl_abstract_pystub(module, context)`. Every subsequent m.def of an op inside the TORCH_LIBRARY block gives the op the `impl_abstract_pystub`. - Added a mechanism to determine if an operator was defined in Python or C++. Library.define in Python appends the op to a global set, which is analogous to what we do for tracking Library.impl. - If someone does `torch.library.impl_abstract` in Python for an operator, then we require that it has an `impl_abstract_pystub` specified and we also check that the module in the `impl_abstract_pystub` is the same as the module where the call to `torch.library.impl_abstract` exists. - Unfortunately we can't check the "context" (which is the buck target on buck-based systems) because buck sits above us. 
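For illustration, a minimal sketch of the new registration pattern (the op names below are hypothetical placeholders; the module string, buck target, and `HAS_IMPL_ABSTRACT_PYSTUB` guard are the ones this diff actually uses):

```cpp
#include <torch/library.h>

// Sketch only: a single impl_abstract_pystub call at the top of the fragment
// now applies to every op defined later in the same TORCH_LIBRARY block.
TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
#ifdef HAS_IMPL_ABSTRACT_PYSTUB
  m.impl_abstract_pystub(
      "fbgemm_gpu.sparse_ops",                           // Python module holding the abstract impls
      "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py"); // "context" (the buck target)
#endif
  // Hypothetical ops: both inherit the pystub registered above, so no
  // per-op boilerplate is needed.
  m.def("example_op_a(Tensor t) -> Tensor");
  m.def("example_op_b(Tensor t) -> Tensor");
}
```

On the Python side, the corresponding `torch.library.impl_abstract("fbgemm::example_op_a")` call must then live in the `fbgemm_gpu.sparse_ops` module named above, or registration fails the module check described earlier.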
bypass-github-export-checks Reviewed By: ezyang Differential Revision: D51080493 fbshipit-source-id: 703e349b8071799a6218b2e03369d1b02c150505 --- .../merge_pooled_embedding_ops_cpu.cpp | 5 +++++ fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp index cd974d132b..db3ee9b35a 100644 --- a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp @@ -50,6 +50,11 @@ Tensor merge_pooled_embeddings_cpu( } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { +#ifdef HAS_IMPL_ABSTRACT_PYSTUB + m.impl_abstract_pystub( + "fbgemm_gpu.sparse_ops", + "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py"); +#endif m.def( "merge_pooled_embeddings(Tensor[] pooled_embeddings, SymInt uncat_dim_size, Device target_device, SymInt cat_dim=1) -> Tensor"); m.def( diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 7d2a98d5cd..54e8b205e8 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2686,6 +2686,11 @@ Tensor bottom_k_per_row( } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { +#ifdef HAS_IMPL_ABSTRACT_PYSTUB + m.impl_abstract_pystub( + "fbgemm_gpu.sparse_ops", + "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py"); +#endif m.def( "permute_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def( From 9b13b5acf826ee77d517631b02f2be63210929aa Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 8 Nov 2023 06:35:22 -0800 Subject: [PATCH 12/73] SymIntify {}_embedding{}_codegen_forward_{}{}_cuda meta function Reviewed By: zou3519 Differential Revision: D50893098 fbshipit-source-id: 0ef9d0fa47ad94bb0be62191f966e829fac0d17d --- .../embedding_forward_split_meta_template.cpp | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp b/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp index f46d56c096..0c8c930ecd 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp @@ -85,22 +85,20 @@ Tensor ) { // NB: omitted the device tests TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL - // TODO: SymIntify - {%- if not nobag %} - int32_t T = D_offsets.numel() - 1; + auto T = D_offsets.sym_numel() - 1; {%- else %} - int32_t total_L = indices.numel(); - int32_t T = weights_offsets.numel(); + auto total_L = indices.sym_numel(); + auto T = weights_offsets.sym_numel(); {%- endif %} TORCH_CHECK_GT(T, 0); // offsets = [B x T + 1] {%- if is_index_select %} const auto total_B = num_warps_per_feature * T; - const int32_t B = num_warps_per_feature; + const auto B = num_warps_per_feature; {%- else %} - const auto total_B = offsets.size(0) - 1; - const int32_t B = total_B / T; + const auto total_B = offsets.sym_size(0) - 1; + const auto B = total_B / T; {%- endif %} TORCH_CHECK_GE(B, 0); {%- if not nobag or is_index_select %} @@ -114,7 +112,7 @@ Tensor TORCH_CHECK_EQ(D % 4, 0); {%- endif %} {%- if vbe %} - TORCH_CHECK_EQ(vbe_row_output_offsets.numel(), total_B); + TORCH_CHECK_EQ(vbe_row_output_offsets.sym_numel(), total_B); 
TENSORS_HAVE_SAME_NUMEL(vbe_row_output_offsets, vbe_b_t_map); TORCH_CHECK_GE(vbe_output_size, 0); @@ -133,40 +131,41 @@ Tensor TORCH_CHECK_GT(num_warps_per_feature, 0); if (!permute_output_dim_0_1) { TORCH_CHECK_GE(output_size, 0); - TORCH_CHECK_GT(output_offsets.numel(), 0); + TORCH_CHECK_GT(output_offsets.sym_numel(), 0); } // If permute_output_dim_0_1 is true, output shape is (batch_size * total_D) // Else, output shape is (output_size) - output = at::empty({output_size}, dev_weights.options().dtype(getScalarType(o_dtype))); + output = at::empty_symint({output_size}, dev_weights.options().dtype(getScalarType(o_dtype))); {%- else %} TORCH_CHECK(o_dtype == SparseType::FP32 || o_dtype == SparseType::FP16 || o_dtype == SparseType::BF16 || o_dtype == SparseType::INT8); - int64_t adjusted_D = D; + c10::SymInt adjusted_D = D; if (o_dtype == SparseType::INT8) { - adjusted_D += T * kINT8QparamsBytes; + adjusted_D += T * int64_t(kINT8QparamsBytes); } - output = at::empty({total_L, adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype))); + output = at::empty_symint({total_L, adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype))); {%- endif %} {%- else %} SparseType o_dtype = static_cast<SparseType>(output_dtype); TORCH_CHECK(o_dtype == SparseType::FP32 || o_dtype == SparseType::FP16 || o_dtype == SparseType::BF16 || o_dtype == SparseType::INT8); - int64_t total_adjusted_D = total_D; + c10::SymInt total_adjusted_D = total_D; if (o_dtype == SparseType::INT8) { - total_adjusted_D += T * kINT8QparamsBytes; + // TODO: Why is kINT8QparamsBytes a float + total_adjusted_D += T * int64_t(kINT8QparamsBytes); } {%- if vbe %} // Use a 2D tensor to make it compatible with 2D PackedTensorsAccessor of other output - output = at::empty( + output = at::empty_symint( {1, vbe_output_size}, dev_weights.options().dtype(getScalarType(o_dtype)) ); {%- else %} - output = at::empty( + output = at::empty_symint( {B, total_adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype)) ); From e5236c8d22ad4d18f90adae92dd44d605350e594 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 8 Nov 2023 06:35:22 -0800 Subject: [PATCH 13/73] SymIntify {}_embedding{}_codegen_forward_{}{}_cuda autograd function Reviewed By: zou3519 Differential Revision: D50895693 fbshipit-source-id: e6b774369446661486667252ee28aad2a3318cc2 --- .../codegen/embedding_backward_split_host_template.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp index 63fdf6e684..82b345365e 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp @@ -179,18 +179,19 @@ class {{ autograd_func }} : const bool is_experimental, {{ args.split_function_args | join(", ") }}) { - const auto T = weights_offsets.numel(); + const auto T = weights_offsets.sym_numel(); {%- if vbe %} const auto B_offsets_ = B_offsets.value_or(Tensor()); const auto vbe_output_offsets_feature_rank_ = vbe_output_offsets_feature_rank.value_or(Tensor()); const auto vbe_B_offsets_rank_per_feature_ = vbe_B_offsets_rank_per_feature.value_or(Tensor()); - const auto max_B_ = max_B; + const c10::SymInt max_B_ = max_B; {%- else %} - const auto max_B_ = offsets.size(0) / T; + const auto max_B_ = offsets.sym_size(0) / T; {%- endif %} - auto [info_B_num_bits, info_B_mask] = adjust_info_B_num_bits(max_B_, T); + // TODO: don't guard here + auto [info_B_num_bits,
info_B_mask] = adjust_info_B_num_bits(max_B_.guard_int(__FILE__, __LINE__), T.guard_int(__FILE__, __LINE__)); {%- if vbe %} static auto generate_vbe_metadata_op = From bd2d5fcb953dc64e3d07102de363e2c76fe25fbd Mon Sep 17 00:00:00 2001 From: Ivan Kobzarev Date: Wed, 8 Nov 2023 14:46:57 -0800 Subject: [PATCH 14/73] Fix meta function for merge_pooled_embeddings Summary: Meta fn should contain device information (FakeTensor) for merge_pooled_embeddings. BEFORE dynamo export fails with ``` File "/data/sandcastle/boxes/fbsource/buck-out/v2/gen/fbcode/5408a275d581d7c2/scripts/ads_pt2_inference/__pt2_cli__/pt2_cli#link-tree/torch/_subclasses/fake_tensor.py", line 1264, in merge_devices raise RuntimeError( torch._dynamo.exc.TorchRuntimeError: Failed running call_function fbgemm.permute_pooled_embs_auto_grad(*(FakeTensor(..., device='meta', size=(10, 5124)), FakeTensor(..., device='cuda:0', size=(40,), dtype=torch.int64), FakeTensor(..., device='cuda:0', size=(39,), dtype=torch.int64), FakeTensor(..., device='cuda:0', size=(40,), dtype=torch.int64), FakeTensor(..., device='cuda:0', size=(39,), dtype=torch.int64)), **{}): Unhandled FakeTensor Device Propagation for fbgemm.permute_pooled_embs.default, found two different devices meta, cuda:0 ``` Reviewed By: ezyang Differential Revision: D51121648 Privacy Context Container: L1138451 fbshipit-source-id: cb9c8d320072a4e0e266f4752602d5635cbf43bc --- fbgemm_gpu/fbgemm_gpu/sparse_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py index 42b0079b2c..92c44120a4 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py @@ -228,12 +228,12 @@ def merge_pooled_embeddings( if cat_dim == 0: return e.new_empty( [total_cat_dim_size, e.size(1)], - device=torch.device("meta"), + device=target_device, ) return e.new_empty( [e.size(0), total_cat_dim_size], - device=torch.device("meta"), + device=target_device, ) From 79d1729452c16a939fb04edeb56c7d86e5809c87 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 9 Nov 2023 01:56:11 -0800 Subject: [PATCH 15/73] Refactor `embedding_inplace_update` (#2112) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2112 Move all source files related to `embedding_inplace_update` into its own directory Reviewed By: spcyppt Differential Revision: D50993688 fbshipit-source-id: 50bb307c1d7164667005b525d0b318adf9e3c695 --- fbgemm_gpu/CMakeLists.txt | 6 +++--- .../split_embedding_codegen_lookup_invoker.template | 5 ++--- fbgemm_gpu/fbgemm_gpu/sparse_ops.py | 4 +++- .../embedding_inplace_update.cu | 0 .../embedding_inplace_update_cpu.cpp | 12 ++---------- .../embedding_inplace_update_gpu.cpp | 3 --- .../embedding_inplace_update_test.cpp | 7 +++++-- 7 files changed, 15 insertions(+), 22 deletions(-) rename fbgemm_gpu/src/{ => embedding_inplace_ops}/embedding_inplace_update.cu (100%) rename fbgemm_gpu/src/{ => embedding_inplace_ops}/embedding_inplace_update_cpu.cpp (98%) rename fbgemm_gpu/src/{ => embedding_inplace_ops}/embedding_inplace_update_gpu.cpp (94%) rename fbgemm_gpu/{test => src/embedding_inplace_ops}/embedding_inplace_update_test.cpp (96%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index fd2dbfbc82..9b1a118837 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -574,7 +574,7 @@ set(fbgemm_gpu_sources_static_cpu src/quantize_ops/quantize_ops_meta.cpp src/sparse_ops/sparse_ops_cpu.cpp src/sparse_ops/sparse_ops_meta.cpp - 
src/embedding_inplace_update_cpu.cpp + src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp src/split_embeddings_cache/linearize_cache_indices.cpp src/split_embeddings_cache/lfu_cache_populate_byte.cpp src/split_embeddings_cache/lru_cache_populate_byte.cpp @@ -597,7 +597,7 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_utils.cpp src/split_embeddings_cache/split_embeddings_cache_ops.cu src/metric_ops_host.cpp - src/embedding_inplace_update_gpu.cpp + src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp src/input_combine_gpu.cpp codegen/batch_index_select_dim0_host.cpp) @@ -631,7 +631,7 @@ if(NOT FBGEMM_CPU_ONLY) codegen/embedding_forward_quantized_split_lookup.cu src/memory_utils/memory_utils.cu src/memory_utils/memory_utils_ops.cu - src/embedding_inplace_update.cu + src/embedding_inplace_ops/embedding_inplace_update.cu src/histogram_binning_calibration_ops.cu src/input_combine.cu src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu diff --git a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template index 5af6e320f0..b546089c8b 100644 --- a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template +++ b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template @@ -13,6 +13,7 @@ from .lookup_args import * {%- if is_fbcode %} + # Provide compatibility to downstream packages for eventual migration to the split training / inference packages try: torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops_cuda_training") @@ -25,12 +26,10 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:cumem_utils") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:split_table_batched_embeddings") + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:embedding_inplace_update") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:embedding_inplace_update_cpu") -{%- else %} -# import os -# torch.ops.load_library(os.path.join(os.path.join(os.path.dirname(os.path.dirname(__file__)), "fbgemm_gpu_py.so"))) {%- endif %} diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py index 92c44120a4..7db295f642 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py @@ -7,9 +7,9 @@ from typing import Callable, List, Optional, Tuple import torch + from fbgemm_gpu.split_embedding_configs import SparseType from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode -from torch import Tensor try: # pyre-ignore @@ -24,6 +24,8 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops_cpu") +from torch import Tensor + if hasattr(torch.library, "impl_abstract"): impl_abstract = torch.library.impl_abstract diff --git a/fbgemm_gpu/src/embedding_inplace_update.cu b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update.cu similarity index 100% rename from fbgemm_gpu/src/embedding_inplace_update.cu rename to fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update.cu diff --git a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp similarity index 98% rename from fbgemm_gpu/src/embedding_inplace_update_cpu.cpp rename to 
fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp index 92c7428d71..1738d61a68 100644 --- a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp @@ -167,19 +167,11 @@ Tensor pruned_array_lookup_from_row_idx_cpu( TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "emb_inplace_update(Tensor(a!) dev_weights, Tensor(b!) uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor update_weights, Tensor update_table_indices, Tensor update_row_indices, Tensor update_offsets, int row_alignment=1, Tensor(c!)? lxu_cache_weights=None, Tensor? lxu_cache_locations=None) -> ()"); -} - -TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { - DISPATCH_TO_CPU( - "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cpu); -} - -TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "pruned_array_lookup_from_row_idx(Tensor update_row_indices, Tensor update_table_indices, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); -} -TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { + DISPATCH_TO_CPU( + "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cpu); DISPATCH_TO_CPU( "pruned_array_lookup_from_row_idx", fbgemm_gpu::pruned_array_lookup_from_row_idx_cpu); diff --git a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp similarity index 94% rename from fbgemm_gpu/src/embedding_inplace_update_gpu.cpp rename to fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp index 14ec1229d2..2605827a6d 100644 --- a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp @@ -15,9 +15,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cuda); -} - -TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "pruned_array_lookup_from_row_idx", fbgemm_gpu::pruned_array_lookup_from_row_idx_cuda); diff --git a/fbgemm_gpu/test/embedding_inplace_update_test.cpp b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_test.cpp similarity index 96% rename from fbgemm_gpu/test/embedding_inplace_update_test.cpp rename to fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_test.cpp index 853da8115b..712315f7f7 100644 --- a/fbgemm_gpu/test/embedding_inplace_update_test.cpp +++ b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_test.cpp @@ -177,7 +177,10 @@ void test_embedding_inplace_update() { } } -TEST(embedding_inplace_update_test, random_update) { - test_embedding_inplace_update(); +TEST(EmbeddingInplaceUpdateTest, random_update) { + // TODO: Skipping test_embedding_inplace_update because it is + // unreliable and crashes occasionally. This should be fixed and re-enabled. 
+ // + // test_embedding_inplace_update(); test_embedding_inplace_update(); } From 5bddcaccc0a589ef1822a87680c901d0ad2fff15 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 9 Nov 2023 13:49:42 -0800 Subject: [PATCH 16/73] Back out "Refactor `embedding_inplace_update`" (#2125) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2125 Original commit changeset: 50bb307c1d71 Original Phabricator Diff: D50993688 Reviewed By: drdarshan Differential Revision: D51165476 fbshipit-source-id: 6382e32b2683bf65571442d1d16b5a799f661c68 --- fbgemm_gpu/CMakeLists.txt | 6 +++--- .../split_embedding_codegen_lookup_invoker.template | 5 +++-- fbgemm_gpu/fbgemm_gpu/sparse_ops.py | 4 +--- .../embedding_inplace_update.cu | 0 .../embedding_inplace_update_cpu.cpp | 12 ++++++++++-- .../embedding_inplace_update_gpu.cpp | 3 +++ .../embedding_inplace_update_test.cpp | 7 ++----- 7 files changed, 22 insertions(+), 15 deletions(-) rename fbgemm_gpu/src/{embedding_inplace_ops => }/embedding_inplace_update.cu (100%) rename fbgemm_gpu/src/{embedding_inplace_ops => }/embedding_inplace_update_cpu.cpp (98%) rename fbgemm_gpu/src/{embedding_inplace_ops => }/embedding_inplace_update_gpu.cpp (94%) rename fbgemm_gpu/{src/embedding_inplace_ops => test}/embedding_inplace_update_test.cpp (96%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 9b1a118837..fd2dbfbc82 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -574,7 +574,7 @@ set(fbgemm_gpu_sources_static_cpu src/quantize_ops/quantize_ops_meta.cpp src/sparse_ops/sparse_ops_cpu.cpp src/sparse_ops/sparse_ops_meta.cpp - src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp + src/embedding_inplace_update_cpu.cpp src/split_embeddings_cache/linearize_cache_indices.cpp src/split_embeddings_cache/lfu_cache_populate_byte.cpp src/split_embeddings_cache/lru_cache_populate_byte.cpp @@ -597,7 +597,7 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_utils.cpp src/split_embeddings_cache/split_embeddings_cache_ops.cu src/metric_ops_host.cpp - src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp + src/embedding_inplace_update_gpu.cpp src/input_combine_gpu.cpp codegen/batch_index_select_dim0_host.cpp) @@ -631,7 +631,7 @@ if(NOT FBGEMM_CPU_ONLY) codegen/embedding_forward_quantized_split_lookup.cu src/memory_utils/memory_utils.cu src/memory_utils/memory_utils_ops.cu - src/embedding_inplace_ops/embedding_inplace_update.cu + src/embedding_inplace_update.cu src/histogram_binning_calibration_ops.cu src/input_combine.cu src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu diff --git a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template index b546089c8b..5af6e320f0 100644 --- a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template +++ b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template @@ -13,7 +13,6 @@ from .lookup_args import * {%- if is_fbcode %} - # Provide compatibility to downstream packages for eventual migration to the split training / inference packages try: torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops_cuda_training") @@ -26,10 +25,12 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:cumem_utils") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:split_table_batched_embeddings") - 
torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:embedding_inplace_update") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:embedding_inplace_update_cpu") +{%- else %} +# import os +# torch.ops.load_library(os.path.join(os.path.join(os.path.dirname(os.path.dirname(__file__)), "fbgemm_gpu_py.so"))) {%- endif %} diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py index 7db295f642..92c44120a4 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py @@ -7,9 +7,9 @@ from typing import Callable, List, Optional, Tuple import torch - from fbgemm_gpu.split_embedding_configs import SparseType from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode +from torch import Tensor try: # pyre-ignore @@ -24,8 +24,6 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops_cpu") -from torch import Tensor - if hasattr(torch.library, "impl_abstract"): impl_abstract = torch.library.impl_abstract diff --git a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update.cu b/fbgemm_gpu/src/embedding_inplace_update.cu similarity index 100% rename from fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update.cu rename to fbgemm_gpu/src/embedding_inplace_update.cu diff --git a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp b/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp similarity index 98% rename from fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp rename to fbgemm_gpu/src/embedding_inplace_update_cpu.cpp index 1738d61a68..92c7428d71 100644 --- a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp @@ -167,11 +167,19 @@ Tensor pruned_array_lookup_from_row_idx_cpu( TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "emb_inplace_update(Tensor(a!) dev_weights, Tensor(b!) uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor update_weights, Tensor update_table_indices, Tensor update_row_indices, Tensor update_offsets, int row_alignment=1, Tensor(c!)? lxu_cache_weights=None, Tensor? 
lxu_cache_locations=None) -> ()"); - m.def( - "pruned_array_lookup_from_row_idx(Tensor update_row_indices, Tensor update_table_indices, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); +} +TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { DISPATCH_TO_CPU( "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cpu); +} + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + m.def( + "pruned_array_lookup_from_row_idx(Tensor update_row_indices, Tensor update_table_indices, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); +} + +TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { DISPATCH_TO_CPU( "pruned_array_lookup_from_row_idx", fbgemm_gpu::pruned_array_lookup_from_row_idx_cpu); diff --git a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp b/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp similarity index 94% rename from fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp rename to fbgemm_gpu/src/embedding_inplace_update_gpu.cpp index 2605827a6d..14ec1229d2 100644 --- a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp @@ -15,6 +15,9 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cuda); +} + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "pruned_array_lookup_from_row_idx", fbgemm_gpu::pruned_array_lookup_from_row_idx_cuda); diff --git a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_test.cpp b/fbgemm_gpu/test/embedding_inplace_update_test.cpp similarity index 96% rename from fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_test.cpp rename to fbgemm_gpu/test/embedding_inplace_update_test.cpp index 712315f7f7..853da8115b 100644 --- a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_test.cpp +++ b/fbgemm_gpu/test/embedding_inplace_update_test.cpp @@ -177,10 +177,7 @@ void test_embedding_inplace_update() { } } -TEST(EmbeddingInplaceUpdateTest, random_update) { - // TODO: Skipping test_embedding_inplace_update because it is - // unreliable and crashes occasionally. This should be fixed and re-enabled. 
- // - // test_embedding_inplace_update(); +TEST(embedding_inplace_update_test, random_update) { + test_embedding_inplace_update(); test_embedding_inplace_update(); } From 293e500d2dffa5f942f457272c660ce00f6794b8 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 9 Nov 2023 15:26:08 -0800 Subject: [PATCH 17/73] Refactor `embedding_inplace_update` (#2127) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2127 Move all source files related to `embedding_inplace_update` into its own directory Reviewed By: spcyppt Differential Revision: D51170862 fbshipit-source-id: 726ae5df567e99e861cc47180ce11ffd87a71519 --- fbgemm_gpu/CMakeLists.txt | 6 +++--- .../split_embedding_codegen_lookup_invoker.template | 4 +--- fbgemm_gpu/fbgemm_gpu/sparse_ops.py | 4 +++- .../embedding_inplace_update.cu | 0 .../embedding_inplace_update_cpu.cpp | 12 ++---------- .../embedding_inplace_update_gpu.cpp | 3 --- .../embedding_inplace_update_test.cpp | 7 +++++-- 7 files changed, 14 insertions(+), 22 deletions(-) rename fbgemm_gpu/src/{ => embedding_inplace_ops}/embedding_inplace_update.cu (100%) rename fbgemm_gpu/src/{ => embedding_inplace_ops}/embedding_inplace_update_cpu.cpp (98%) rename fbgemm_gpu/src/{ => embedding_inplace_ops}/embedding_inplace_update_gpu.cpp (94%) rename fbgemm_gpu/{test => src/embedding_inplace_ops}/embedding_inplace_update_test.cpp (96%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index fd2dbfbc82..9b1a118837 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -574,7 +574,7 @@ set(fbgemm_gpu_sources_static_cpu src/quantize_ops/quantize_ops_meta.cpp src/sparse_ops/sparse_ops_cpu.cpp src/sparse_ops/sparse_ops_meta.cpp - src/embedding_inplace_update_cpu.cpp + src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp src/split_embeddings_cache/linearize_cache_indices.cpp src/split_embeddings_cache/lfu_cache_populate_byte.cpp src/split_embeddings_cache/lru_cache_populate_byte.cpp @@ -597,7 +597,7 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_utils.cpp src/split_embeddings_cache/split_embeddings_cache_ops.cu src/metric_ops_host.cpp - src/embedding_inplace_update_gpu.cpp + src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp src/input_combine_gpu.cpp codegen/batch_index_select_dim0_host.cpp) @@ -631,7 +631,7 @@ if(NOT FBGEMM_CPU_ONLY) codegen/embedding_forward_quantized_split_lookup.cu src/memory_utils/memory_utils.cu src/memory_utils/memory_utils_ops.cu - src/embedding_inplace_update.cu + src/embedding_inplace_ops/embedding_inplace_update.cu src/histogram_binning_calibration_ops.cu src/input_combine.cu src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu diff --git a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template index 5af6e320f0..3a00d7d1fe 100644 --- a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template +++ b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template @@ -13,6 +13,7 @@ from .lookup_args import * {%- if is_fbcode %} + # Provide compatibility to downstream packages for eventual migration to the split training / inference packages try: torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops_cuda_training") @@ -28,9 +29,6 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:split_table_batched_emb torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:embedding_inplace_update") 
torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:embedding_inplace_update_cpu") -{%- else %} -# import os -# torch.ops.load_library(os.path.join(os.path.join(os.path.dirname(os.path.dirname(__file__)), "fbgemm_gpu_py.so"))) {%- endif %} diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py index 92c44120a4..7db295f642 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py @@ -7,9 +7,9 @@ from typing import Callable, List, Optional, Tuple import torch + from fbgemm_gpu.split_embedding_configs import SparseType from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode -from torch import Tensor try: # pyre-ignore @@ -24,6 +24,8 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops_cpu") +from torch import Tensor + if hasattr(torch.library, "impl_abstract"): impl_abstract = torch.library.impl_abstract diff --git a/fbgemm_gpu/src/embedding_inplace_update.cu b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update.cu similarity index 100% rename from fbgemm_gpu/src/embedding_inplace_update.cu rename to fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update.cu diff --git a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp similarity index 98% rename from fbgemm_gpu/src/embedding_inplace_update_cpu.cpp rename to fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp index 92c7428d71..1738d61a68 100644 --- a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp @@ -167,19 +167,11 @@ Tensor pruned_array_lookup_from_row_idx_cpu( TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "emb_inplace_update(Tensor(a!) dev_weights, Tensor(b!) uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor update_weights, Tensor update_table_indices, Tensor update_row_indices, Tensor update_offsets, int row_alignment=1, Tensor(c!)? lxu_cache_weights=None, Tensor? 
lxu_cache_locations=None) -> ()"); -} - -TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { - DISPATCH_TO_CPU( - "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cpu); -} - -TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "pruned_array_lookup_from_row_idx(Tensor update_row_indices, Tensor update_table_indices, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); -} -TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { + DISPATCH_TO_CPU( + "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cpu); DISPATCH_TO_CPU( "pruned_array_lookup_from_row_idx", fbgemm_gpu::pruned_array_lookup_from_row_idx_cpu); diff --git a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp similarity index 94% rename from fbgemm_gpu/src/embedding_inplace_update_gpu.cpp rename to fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp index 14ec1229d2..2605827a6d 100644 --- a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp @@ -15,9 +15,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cuda); -} - -TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "pruned_array_lookup_from_row_idx", fbgemm_gpu::pruned_array_lookup_from_row_idx_cuda); diff --git a/fbgemm_gpu/test/embedding_inplace_update_test.cpp b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_test.cpp similarity index 96% rename from fbgemm_gpu/test/embedding_inplace_update_test.cpp rename to fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_test.cpp index 853da8115b..712315f7f7 100644 --- a/fbgemm_gpu/test/embedding_inplace_update_test.cpp +++ b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_test.cpp @@ -177,7 +177,10 @@ void test_embedding_inplace_update() { } } -TEST(embedding_inplace_update_test, random_update) { - test_embedding_inplace_update(); +TEST(EmbeddingInplaceUpdateTest, random_update) { + // TODO: Skipping test_embedding_inplace_update because it is + // unreliable and crashes occasionally. This should be fixed and re-enabled. + // + // test_embedding_inplace_update(); test_embedding_inplace_update(); } From 2117dd33d4c4fd53cfadbf037b5fb3c0824cb00e Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Thu, 9 Nov 2023 16:23:01 -0800 Subject: [PATCH 18/73] add pt2_compliant tag to some ops (#2119) Summary: X-link: https://github.com/pytorch/pytorch/pull/113201 Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2119 Logs show these ops are being used with PT2, so we are grandfathering in these ops to the pt2_compliant tag. Most of these ops are tested, some aren't. 
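A rough sketch of the tagging mechanism this diff introduces (the op schema below is a hypothetical placeholder; the macro is the one added to `fbgemm_gpu/dispatch_macros.h` in this change):

```cpp
#include <torch/library.h>

// The tag only exists on new enough PyTorch, so it is gated behind
// HAS_PT2_COMPLIANT_TAG and the fallback expands to nothing (an empty
// tag list). The #ifdef can be cleaned up once fbgemm is on PyTorch 2.2.
#ifdef HAS_PT2_COMPLIANT_TAG
#define PT2_COMPLIANT_TAG at::Tag::pt2_compliant_tag
#else
#define PT2_COMPLIANT_TAG
#endif

TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
  // Passing the tag at definition time marks the op as safe for PT2 to
  // trace through, which is what the m.def changes below do per op.
  m.def("example_op(Tensor t) -> Tensor", {PT2_COMPLIANT_TAG});
}
```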
bypass-github-export-checks Reviewed By: ezyang Differential Revision: D51076460 fbshipit-source-id: b08efb10fef0a0437a6c09cf0ac7f374f3b308ab --- .../include/fbgemm_gpu/dispatch_macros.h | 7 +++ .../jagged_tensor_ops_cpu.cpp | 60 ++++++++++++------- .../merge_pooled_embedding_ops_cpu.cpp | 4 +- .../permute_pooled_embedding_ops_cpu.cpp | 4 +- .../src/quantize_ops/quantize_ops_cpu.cpp | 3 +- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 19 ++++-- fbgemm_gpu/test/jagged_tensor_ops_test.py | 24 +++++++- fbgemm_gpu/test/sparse_ops_test.py | 12 ++++ 8 files changed, 103 insertions(+), 30 deletions(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h b/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h index f268735d04..834a226ce4 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h +++ b/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h @@ -201,3 +201,10 @@ #define FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH( \ TYPE, NAME, FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16_CASE(__VA_ARGS__)) + +// We can cleanup the following once fbgemm uses PyTorch 2.2 in January 2024. +#ifdef HAS_PT2_COMPLIANT_TAG +#define PT2_COMPLIANT_TAG at::Tag::pt2_compliant_tag +#else +#define PT2_COMPLIANT_TAG +#endif diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index 7ae207adb8..0b8525773e 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -1574,13 +1574,17 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { // details at https://pytorch.org/get-started/pytorch-2.0/#dynamic-shapes. If // you find it doesn't compile, please pull the new PyTorch 2.0 code m.def( - "dense_to_jagged(Tensor dense, Tensor[] x_offsets, SymInt? total_L=None) -> (Tensor, Tensor[])"); + "dense_to_jagged(Tensor dense, Tensor[] x_offsets, SymInt? total_L=None) -> (Tensor, Tensor[])", + {PT2_COMPLIANT_TAG}); m.def( - "dense_to_jagged_forward(Tensor dense, Tensor[] x_offsets, SymInt? total_L=None) -> Tensor"); + "dense_to_jagged_forward(Tensor dense, Tensor[] x_offsets, SymInt? 
total_L=None) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( - "jagged_2d_to_dense(Tensor values, Tensor offsets, SymInt max_sequence_length) -> Tensor"); + "jagged_2d_to_dense(Tensor values, Tensor offsets, SymInt max_sequence_length) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( - "jagged_1d_to_dense(Tensor values, Tensor offsets, SymInt max_sequence_length, int padding_value) -> Tensor"); + "jagged_1d_to_dense(Tensor values, Tensor offsets, SymInt max_sequence_length, int padding_value) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "stacked_jagged_2d_to_dense_forward(Tensor values, Tensor lengths, int[] offset_per_key, int[] max_lengths_per_key, int padding_value = 0) -> (Tensor[], Tensor[])"); m.def( @@ -1590,35 +1594,48 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "stacked_jagged_2d_to_dense(Tensor values, Tensor lengths, int[] offset_per_key, int[] max_lengths_per_key, int padding_value = 0) -> Tensor[]"); m.def( - "jagged_to_padded_dense(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value = 0) -> Tensor"); + "jagged_to_padded_dense(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value = 0) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( - "jagged_to_padded_dense_forward(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value = 0) -> Tensor"); + "jagged_to_padded_dense_forward(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value = 0) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( - "jagged_to_padded_dense_backward(Tensor grad_output, Tensor[] offsets, SymInt total_L) -> Tensor"); + "jagged_to_padded_dense_backward(Tensor grad_output, Tensor[] offsets, SymInt total_L) -> Tensor", + {PT2_COMPLIANT_TAG}); // jagged + dense -> dense m.def( - "jagged_dense_elementwise_add(Tensor x_values, Tensor[] x_offsets, Tensor y) -> Tensor"); + "jagged_dense_elementwise_add(Tensor x_values, Tensor[] x_offsets, Tensor y) -> Tensor", + {PT2_COMPLIANT_TAG}); // jagged + dense -> jagged (treat "zeros" in the jagged tensor as unknowns. 
// output offsets is same as x_offsets) m.def( - "jagged_dense_elementwise_add_jagged_output(Tensor x_values, Tensor[] x_offsets, Tensor y) -> (Tensor, Tensor[])"); + "jagged_dense_elementwise_add_jagged_output(Tensor x_values, Tensor[] x_offsets, Tensor y) -> (Tensor, Tensor[])", + {PT2_COMPLIANT_TAG}); m.def( - "jagged_dense_dense_elementwise_add_jagged_output_forward(Tensor x_values, Tensor[] x_offsets, Tensor y_0, Tensor y_1) -> Tensor"); + "jagged_dense_dense_elementwise_add_jagged_output_forward(Tensor x_values, Tensor[] x_offsets, Tensor y_0, Tensor y_1) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( - "jagged_dense_dense_elementwise_add_jagged_output(Tensor x_values, Tensor[] x_offsets, Tensor y_0, Tensor y_1) -> (Tensor, Tensor[])"); + "jagged_dense_dense_elementwise_add_jagged_output(Tensor x_values, Tensor[] x_offsets, Tensor y_0, Tensor y_1) -> (Tensor, Tensor[])", + {PT2_COMPLIANT_TAG}); // jagged * dense -> jagged (its offsets is same as x_offsets) m.def( - "jagged_dense_elementwise_mul(Tensor x_values, Tensor[] x_offsets, Tensor y) -> (Tensor, Tensor[])"); + "jagged_dense_elementwise_mul(Tensor x_values, Tensor[] x_offsets, Tensor y) -> (Tensor, Tensor[])", + {PT2_COMPLIANT_TAG}); m.def( - "jagged_dense_elementwise_mul_forward(Tensor x_values, Tensor[] x_offsets, Tensor y) -> Tensor"); + "jagged_dense_elementwise_mul_forward(Tensor x_values, Tensor[] x_offsets, Tensor y) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( - "jagged_dense_elementwise_mul_backward(Tensor grad_output, Tensor[] x_offsets, Tensor y, Tensor x_values) -> (Tensor, Tensor)"); + "jagged_dense_elementwise_mul_backward(Tensor grad_output, Tensor[] x_offsets, Tensor y, Tensor x_values) -> (Tensor, Tensor)", + {PT2_COMPLIANT_TAG}); m.def( - "batched_dense_vec_jagged_2d_mul(Tensor v, Tensor a_values, Tensor a_offsets) -> Tensor"); + "batched_dense_vec_jagged_2d_mul(Tensor v, Tensor a_values, Tensor a_offsets) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( - "batched_dense_vec_jagged_2d_mul_forward(Tensor v, Tensor a_values, Tensor a_offsets) -> Tensor"); + "batched_dense_vec_jagged_2d_mul_forward(Tensor v, Tensor a_values, Tensor a_offsets) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( - "batched_dense_vec_jagged_2d_mul_backward(Tensor grad_output, Tensor v, Tensor a_values, Tensor a_offsets) -> (Tensor, Tensor)"); + "batched_dense_vec_jagged_2d_mul_backward(Tensor grad_output, Tensor v, Tensor a_values, Tensor a_offsets) -> (Tensor, Tensor)", + {PT2_COMPLIANT_TAG}); m.def( "jagged_index_select(Tensor values, Tensor lengths, Tensor indices) -> Tensor[]"); m.def( @@ -1630,17 +1647,20 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "masked_select_jagged_1d(Tensor values, Tensor lengths, Tensor mask) -> (Tensor, Tensor)"); m.def( - "jagged_softmax(Tensor values, Tensor x_offsets, int max_L) -> (Tensor, Tensor)"); + "jagged_softmax(Tensor values, Tensor x_offsets, int max_L) -> (Tensor, Tensor)", + {PT2_COMPLIANT_TAG}); m.def( "jagged_softmax_forward(Tensor values, Tensor x_offsets, int max_L) -> Tensor"); m.def( "jagged_softmax_backward(Tensor grad_output, Tensor output, Tensor x_offsets, int max_L) -> Tensor"); m.def( - "jagged_jagged_bmm(Tensor x_values, Tensor y_values, Tensor x_offsets, int max_L) -> Tensor"); + "jagged_jagged_bmm(Tensor x_values, Tensor y_values, Tensor x_offsets, int max_L) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "jagged_jagged_bmm_forward(Tensor x_values, Tensor y_values, Tensor x_offsets, int max_L) -> Tensor"); m.def( - "jagged_dense_bmm(Tensor x_values, Tensor x_offsets, Tensor y, int max_L) -> 
(Tensor, Tensor)"); + "jagged_dense_bmm(Tensor x_values, Tensor x_offsets, Tensor y, int max_L) -> (Tensor, Tensor)", + {PT2_COMPLIANT_TAG}); m.def( "jagged_dense_bmm_forward(Tensor x_values, Tensor x_offsets, Tensor y, int max_L) -> Tensor"); // jagged -> jagged diff --git a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp index db3ee9b35a..b7dc57ea63 100644 --- a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp @@ -10,6 +10,7 @@ #include #include #include +#include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/ops_utils.h" using Tensor = at::Tensor; @@ -56,7 +57,8 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py"); #endif m.def( - "merge_pooled_embeddings(Tensor[] pooled_embeddings, SymInt uncat_dim_size, Device target_device, SymInt cat_dim=1) -> Tensor"); + "merge_pooled_embeddings(Tensor[] pooled_embeddings, SymInt uncat_dim_size, Device target_device, SymInt cat_dim=1) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "all_to_one_device(Tensor[] input_tensors, Device target_device) -> Tensor[]"); m.def( diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp index 2e884f3a3c..a6ff0d5dce 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp @@ -8,6 +8,7 @@ #include #include +#include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/permute_pooled_embedding_ops.h" using Tensor = at::Tensor; @@ -149,7 +150,8 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "permute_pooled_embs(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); m.def( - "permute_pooled_embs_auto_grad(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); + "permute_pooled_embs_auto_grad(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "permute_duplicate_pooled_embs(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); m.def( diff --git a/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp b/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp index 9521f5a4c4..63988b4b0c 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp +++ b/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp @@ -417,7 +417,8 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def("FloatOrHalfToFused8BitRowwiseQuantized(Tensor t) -> Tensor"); m.def("Fused8BitRowwiseQuantizedToFloat(Tensor input) -> Tensor"); m.def( - "FP8RowwiseQuantizedToFloat(Tensor input, bool forward, int output_dtype=0) -> Tensor"); + "FP8RowwiseQuantizedToFloat(Tensor input, bool forward, int output_dtype=0) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def("Fused8BitRowwiseQuantizedToHalf(Tensor input) -> Tensor"); m.def( "Fused8BitRowwiseQuantizedToFloatOrHalf(Tensor input, int output_dtype=0) -> Tensor"); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 54e8b205e8..889082884a 100644 --- 
a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -18,6 +18,7 @@ #include #include #include "c10/util/MaybeOwned.h" +#include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/sparse_ops.h" #include "fbgemm_gpu/sparse_ops_utils.h" @@ -2704,9 +2705,15 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)"); m.def( "bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, SymInt my_size, Tensor? weights=None) -> (Tensor, Tensor, Tensor?, Tensor?)"); - m.def("asynchronous_exclusive_cumsum(Tensor t_in) -> Tensor"); - m.def("asynchronous_inclusive_cumsum(Tensor t_in) -> Tensor"); - m.def("asynchronous_complete_cumsum(Tensor t_in) -> Tensor"); + m.def( + "asynchronous_exclusive_cumsum(Tensor t_in) -> Tensor", + {PT2_COMPLIANT_TAG}); + m.def( + "asynchronous_inclusive_cumsum(Tensor t_in) -> Tensor", + {PT2_COMPLIANT_TAG}); + m.def( + "asynchronous_complete_cumsum(Tensor t_in) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "reorder_batched_ad_lengths(Tensor cat_ad_lengths, Tensor batch_offsets, SymInt num_ads_in_batch, bool broadcast_lengths=False) -> Tensor"); m.def( @@ -2715,7 +2722,8 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "cat_reorder_batched_ad_indices(Tensor cat_ad_offsets, Tensor[] cat_ad_indices, Tensor reordered_cat_ad_offsets, Tensor batch_offsets, SymInt num_ads_in_batch, bool broadcast_indices, SymInt total_num_indices, bool pinned_memory=False) -> Tensor"); m.def("offsets_range(Tensor offsets, SymInt range_size) -> Tensor"); m.def( - "batched_unary_embeddings(Tensor weight, Tensor table_offsets, Tensor offsets, Tensor indices) -> Tensor"); + "batched_unary_embeddings(Tensor weight, Tensor table_offsets, Tensor offsets, Tensor indices) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "histogram_binning_calibration(Tensor logit, Tensor bin_num_examples, Tensor bin_num_positives, float positive_weight, float lower_bound, float upper_bound, SymInt bin_ctr_in_use_after, float bin_ctr_weight_value) -> (Tensor, Tensor)"); m.def( @@ -2738,7 +2746,8 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "permute_sequence_embeddings(Tensor permute, Tensor lengths, Tensor embeddings) -> (Tensor, Tensor)"); m.def( - "pack_segments(Tensor t_in, Tensor lengths, SymInt max_length) -> Tensor"); + "pack_segments(Tensor t_in, Tensor lengths, SymInt max_length) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "pack_segments_backward(Tensor data, Tensor lengths, SymInt total_length, SymInt max_length) -> Tensor"); // A specialization of at::index_select for selecting dim 0 diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index 62a1ffa1cf..8465490282 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -10,7 +10,7 @@ import itertools import random import unittest -from typing import List, Tuple +from typing import Callable, Dict, List, Tuple import hypothesis.strategies as st import numpy as np @@ -127,7 +127,27 @@ def hash_size_cumsum_to_offsets(hash_size_cum_sum_list: List[int]) -> List[int]: return hash_size_offsets_list -@optests.generate_opcheck_tests +# e.g. 
"test_faketensor__test_cumsum": [unittest.expectedFailure] +# Please avoid putting tests here, you should put operator-specific +# skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json +# pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. +additional_decorators: Dict[str, List[Callable]] = { + "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [ + # This operator has been grandfathered in. We need to fix this test failure. + unittest.expectedFailure, + ], + "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [ + # This operator has been grandfathered in. We need to fix this test failure. + unittest.expectedFailure, + ], + "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add_jagged_output": [ + # This operator has been grandfathered in. We need to fix this test failure. + unittest.expectedFailure, + ], +} + + +@optests.generate_opcheck_tests(additional_decorators=additional_decorators) class JaggedTensorOpsTest(unittest.TestCase): def setUp(self) -> None: if symint_vector_unsupported()[0]: diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index 2fe6aa80aa..5e0fed0504 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -2417,6 +2417,18 @@ def validate( "test_faketensor__test_index_select_dim0": [unittest.skip("hangs")], "test_autograd_registration__test_index_select_dim0": [unittest.skip("hangs")], "test_schema__test_index_select_dim0": [unittest.skip("hangs")], + "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [ + # This operator has been grandfathered in. We need to fix this test failure. + unittest.expectedFailure, + ], + "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [ + # This operator has been grandfathered in. We need to fix this test failure. + unittest.expectedFailure, + ], + "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add_jagged_output": [ + # This operator has been grandfathered in. We need to fix this test failure. 
+ unittest.expectedFailure, + ], } # only generate tests on nightly pytorch (current release version is 2.1) From 09ab470c7eca954f3314494d8e6b102b18ae6f3b Mon Sep 17 00:00:00 2001 From: William Wen Date: Mon, 13 Nov 2023 19:48:56 -0800 Subject: [PATCH 19/73] Move permute_sparse_features tests and abstract impl to fbgemm (#2129) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2129 Title Reviewed By: zou3519 Differential Revision: D51211981 fbshipit-source-id: 14d4201670b9998288bc2bcf9b476730cd66c6c1 --- fbgemm_gpu/fbgemm_gpu/sparse_ops.py | 24 +++ fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 3 +- fbgemm_gpu/test/sparse_ops_test.py | 163 ++++++++++++++++++- 3 files changed, 188 insertions(+), 2 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py index 7db295f642..05d4112bae 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py @@ -251,3 +251,27 @@ def bounds_check_indices( max_B: int = -1, ) -> None: pass + + +@impl_abstract("fbgemm::permute_sparse_features") +def permute_sparse_features_abstract( + permute: Tensor, lengths: Tensor, indices: Tensor, weights: Optional[Tensor] = None +) -> Tuple[Tensor, Tensor, Optional[Tensor]]: + torch._check(lengths.dtype == indices.dtype) + torch._check(permute.device == lengths.device) + torch._check(permute.device == indices.device) + if weights is not None: + torch._check(permute.device == weights.device) + num_output_features = permute.numel() + B = lengths.size(1) + permuted_lengths = lengths.new_empty(num_output_features, B) + output_size = torch.library.get_ctx().new_dynamic_size() + # pyre-fixme[6]: In call `torch._C.TensorBase.new_empty`, for 1st positional argument, + # expected `Sequence[Union[int, types.SymInt]]` but got `Union[int, torch.SymInt]` + permuted_indices = indices.new_empty(output_size) + permuted_weights = None + if weights is not None: + # pyre-fixme[6]: In call `torch._C.TensorBase.new_empty`, for 1st positional argument, + # expected `Sequence[Union[int, types.SymInt]]` but got `Union[int, torch.SymInt]` + permuted_weights = weights.new_empty(output_size) + return (permuted_lengths, permuted_indices, permuted_weights) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 889082884a..efdac1385e 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2738,7 +2738,8 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "lengths_range_out(Tensor output, Tensor t_in, SymInt[]? shape=None) -> Tensor"); m.def( - "permute_sparse_features(Tensor permute, Tensor lengths, Tensor indices, Tensor? weights=None) -> (Tensor, Tensor, Tensor?)"); + "permute_sparse_features(Tensor permute, Tensor lengths, Tensor indices, Tensor? 
weights=None) -> (Tensor, Tensor, Tensor?)", + {PT2_COMPLIANT_TAG}); m.def("Bfloat16QuantizedToFloat(Tensor input) -> Tensor"); m.def("FloatToBfloat16Quantized(Tensor input) -> Tensor"); m.def( diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index 5e0fed0504..d1f810bc6f 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -15,7 +15,7 @@ import random import unittest from itertools import accumulate -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type, Union import fbgemm_gpu @@ -2402,6 +2402,167 @@ def validate( "grad", ) + def permute_sparse_features_ref_( + self, + lengths: torch.Tensor, + indices: torch.Tensor, + weights: Optional[torch.Tensor], + permute: torch.LongTensor, + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + T = lengths.size(0) + B = lengths.size(1) + permuted_lengths = torch.index_select(lengths.view(T, B), 0, permute) + + original_segment_lengths = lengths.view(T, B).sum(dim=1, dtype=torch.int32) + original_segment_start = torch.ops.fbgemm.asynchronous_exclusive_cumsum( + original_segment_lengths.view(-1) + ) + + permuted_indices = [] + permuted_weights = [] + for i in range(permute.size(0)): + start = original_segment_start[permute[i]] + end = start + original_segment_lengths[permute[i]] + permuted_indices.append(indices[start:end]) + if weights is not None: + permuted_weights.append(weights[start:end]) + + permuted_indices = torch.cat(permuted_indices, dim=0).flatten() + + if weights is None: + permuted_weights = None + else: + permuted_weights = torch.cat(permuted_weights, dim=0).flatten() + + return permuted_lengths, permuted_indices, permuted_weights + + @given( + B=st.integers(min_value=1, max_value=20), + T=st.integers(min_value=1, max_value=20), + L=st.integers(min_value=2, max_value=20), + long_index=st.booleans(), + has_weight=st.booleans(), + ) + @settings(max_examples=20, deadline=None) + def test_permute_sparse_features( + self, B: int, T: int, L: int, long_index: bool, has_weight: bool + ) -> None: + index_dtype = torch.int64 if long_index else torch.int32 + lengths = torch.randint(low=1, high=L, size=(T, B)).type(index_dtype) + weights = torch.rand(int(lengths.sum().item())).float() if has_weight else None + indices = torch.randint( + low=1, + high=int(1e5), + size=cast(Tuple[int, ...], (lengths.sum().item(),)), + ).type(index_dtype) + permute_list = list(range(T)) + random.shuffle(permute_list) + permute = torch.IntTensor(permute_list) + + ( + permuted_lengths_cpu, + permuted_indices_cpu, + permuted_weights_cpu, + ) = torch.ops.fbgemm.permute_sparse_features(permute, lengths, indices, weights) + ( + permuted_lengths_ref, + permuted_indices_ref, + permuted_weights_ref, + # pyre-fixme[6]: For 4th param expected `LongTensor` but got `Tensor`. 
+ ) = self.permute_indices_ref_(lengths, indices, weights, permute.long()) + torch.testing.assert_close(permuted_indices_cpu, permuted_indices_ref) + torch.testing.assert_close(permuted_lengths_cpu, permuted_lengths_ref) + if has_weight: + torch.testing.assert_close(permuted_weights_cpu, permuted_weights_ref) + else: + assert permuted_weights_cpu is None and permuted_weights_ref is None + + if gpu_available: + ( + permuted_lengths_gpu, + permuted_indices_gpu, + permuted_weights_gpu, + ) = torch.ops.fbgemm.permute_sparse_features( + permute.cuda(), + lengths.cuda(), + indices.cuda(), + weights.cuda() if has_weight and weights is not None else None, + ) + torch.testing.assert_close(permuted_indices_gpu.cpu(), permuted_indices_cpu) + torch.testing.assert_close(permuted_lengths_gpu.cpu(), permuted_lengths_cpu) + if has_weight: + torch.testing.assert_close( + permuted_weights_gpu.cpu(), permuted_weights_cpu + ) + else: + assert permuted_weights_gpu is None + + @given( + B=st.integers(min_value=1, max_value=20), + T=st.integers(min_value=1, max_value=20), + L=st.integers(min_value=2, max_value=20), + long_index=st.booleans(), + has_weight=st.booleans(), + ) + @settings(max_examples=20, deadline=None) + def test_permute_sparse_features_with_repeats( + self, B: int, T: int, L: int, long_index: bool, has_weight: bool + ) -> None: + index_dtype = torch.int64 if long_index else torch.int32 + lengths = torch.randint(low=1, high=L, size=(T, B)).type(index_dtype) + weights = torch.rand(int(lengths.sum().item())).float() if has_weight else None + indices = torch.randint( + low=1, + high=int(1e5), + size=cast(Tuple[int, ...], (lengths.sum().item(),)), + ).type(index_dtype) + permute_list = list(range(T)) + + num_repeats = random.randint(0, T) + for _ in range(num_repeats): + permute_list.append(random.randint(0, T - 1)) + + random.shuffle(permute_list) + permute = torch.IntTensor(permute_list) + + ( + permuted_lengths_cpu, + permuted_indices_cpu, + permuted_weights_cpu, + ) = torch.ops.fbgemm.permute_sparse_features(permute, lengths, indices, weights) + ( + permuted_lengths_ref, + permuted_indices_ref, + permuted_weights_ref, + # pyre-fixme[6]: For 4th param expected `LongTensor` but got `Tensor`. 
+ ) = self.permute_indices_ref_(lengths, indices, weights, permute.long()) + torch.testing.assert_close(permuted_indices_cpu, permuted_indices_ref) + torch.testing.assert_close(permuted_lengths_cpu, permuted_lengths_ref) + if has_weight: + torch.testing.assert_close(permuted_weights_cpu, permuted_weights_ref) + else: + assert permuted_weights_cpu is None and permuted_weights_ref is None + + if gpu_available: + ( + permuted_lengths_gpu, + permuted_indices_gpu, + permuted_weights_gpu, + ) = torch.ops.fbgemm.permute_sparse_features( + permute.cuda(), + lengths.cuda(), + indices.cuda(), + weights.cuda() if has_weight and weights is not None else None, + ) + torch.testing.assert_close(permuted_indices_gpu.cpu(), permuted_indices_cpu) + torch.testing.assert_close(permuted_lengths_gpu.cpu(), permuted_lengths_cpu) + if has_weight: + torch.testing.assert_close( + permuted_weights_gpu.cpu(), permuted_weights_cpu + ) + else: + assert permuted_weights_cpu is None + failures_dict_path: str = get_file_path_2( "", os.path.dirname(__file__), "failures_dict.json" From 975cb0156db3f3b12280ef2198c9e972678bbb61 Mon Sep 17 00:00:00 2001 From: Shuai Yang Date: Tue, 14 Nov 2023 17:48:56 -0800 Subject: [PATCH 20/73] Add impl_abstract to segment_sum_csr (#2132) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2132 Add FakeTensor support for segment_sum_csr by adding impl_abstract, following: https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit#heading=h.ptttacy8y1u9 Reviewed By: bdhirsh Differential Revision: D51296192 fbshipit-source-id: 8918ddc45e1ba570c8148c3ac172a4d96240e010 --- fbgemm_gpu/fbgemm_gpu/sparse_ops.py | 9 +++++++++ fbgemm_gpu/test/failures_dict.json | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py index 05d4112bae..e4101269b5 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py @@ -275,3 +275,12 @@ def permute_sparse_features_abstract( # expected `Sequence[Union[int, types.SymInt]]` but got `Union[int, torch.SymInt]` permuted_weights = weights.new_empty(output_size) return (permuted_lengths, permuted_indices, permuted_weights) + + +@torch.library.impl_abstract("fbgemm::segment_sum_csr") +def segment_sum_csr_abstract( + batch_size: int, csr_seg: Tensor, values: Tensor +) -> Tensor: + output_size = csr_seg.numel() - 1 + output = values.new_empty(output_size) + return output diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index 8e53be56ed..7a7ecb3334 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -439,11 +439,11 @@ "fbgemm::segment_sum_csr": { "SparseOpsTest.test_aot_dispatch_dynamic__test_segment_sum_csr": { "comment": "", - "status": "xfail" + "status": "xsuccess" }, "SparseOpsTest.test_faketensor__test_segment_sum_csr": { "comment": "", - "status": "xfail" + "status": "xsuccess" } }, "fbgemm::stacked_jagged_1d_to_dense": { From abb59a33e5490bfc8053bf4f23cd3a4135c71fd6 Mon Sep 17 00:00:00 2001 From: "Zhijing Li (Accelerator Enablement)" Date: Wed, 15 Nov 2023 14:25:28 -0800 Subject: [PATCH 21/73] Fix fbgemm CI for segment_sum_csr (#2137) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2137 D51296192 breaks OSS CI, e.g. https://github.com/pytorch/FBGEMM/actions/runs/6873908944/job/18695145904?pr=2107 We have to support PyTorch 2.1, which doesn't have this attribute defined.
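The workaround described in the next sentence amounts to guarding on the attribute's presence. A minimal sketch of that pattern, assuming a plain hasattr check (names illustrative; the exact FBGEMM shim may differ in detail):

import torch

# Prefer the official registration API when present; otherwise fall back to
# a no-op decorator so the module still imports on PyTorch 2.1.
if hasattr(torch.library, "impl_abstract"):
    impl_abstract = torch.library.impl_abstract
else:
    def impl_abstract(op_name: str):
        def decorator(fn):
            return fn  # abstract-impl registration is skipped on old PyTorch
        return decorator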
Fortunately, there is a workaround available, where we can replace torch.library.impl_abstract with the locally-defined impl_abstract Reviewed By: q10 Differential Revision: D51362600 fbshipit-source-id: fa1424e495f19cf494e29595439a062d63139dee --- fbgemm_gpu/fbgemm_gpu/sparse_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py index e4101269b5..b394d4c2d1 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py @@ -277,7 +277,7 @@ def permute_sparse_features_abstract( return (permuted_lengths, permuted_indices, permuted_weights) -@torch.library.impl_abstract("fbgemm::segment_sum_csr") +@impl_abstract("fbgemm::segment_sum_csr") def segment_sum_csr_abstract( batch_size: int, csr_seg: Tensor, values: Tensor ) -> Tensor: From 2eb3eb4d0823c9f5cbbb3ff124dbc736003538ee Mon Sep 17 00:00:00 2001 From: "Zhijing Li (Accelerator Enablement)" Date: Wed, 15 Nov 2023 23:18:04 -0800 Subject: [PATCH 22/73] Support variable bucket size for block_bucketize_sparse_features (#2107) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2107 This diff add support for variable bucket size for block bucketize_sparse features for RW sharding. E.g. Given bucket_sizes_pos as [[0,5,15], [0,10,13]] For batch 0, indices in [0,5) will be assigned to bucket 0, indices in [5,15) will be assigned to bucket 1. For batch 1, indices in [0,10) will be assigned to bucket 0, indices in [10,13) will be assigned to bucket 1. The new index will be original index - bucket_sizes_pos[new_bucket_id-1] i.e. for batch = 0, index = 12, it will be assigned to bucket 1 and the new index is 12 - 5 = 7. Reviewed By: jiayisuse Differential Revision: D50868649 fbshipit-source-id: b1901bce5c539acc4d6a8e11eb74d1ad4ec86195 --- fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h | 6 +- .../sparse_block_bucketize_features.cu | 5 +- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 72 ++++++++++++++----- 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h index 162c4b9ecc..1552dd1be2 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h @@ -149,7 +149,8 @@ block_bucketize_sparse_features_cuda( const int64_t my_size, const c10::optional& weights, const c10::optional& batch_size_per_feature, - const int64_t max_batch_size); + const int64_t max_batch_size, + const c10::optional>& block_bucketize_pos); std::tuple< at::Tensor, @@ -168,7 +169,8 @@ block_bucketize_sparse_features_cpu( const int64_t my_size, const c10::optional& weights, const c10::optional& batch_size_per_feature, - const int64_t max_batch_size); + const int64_t max_batch_size, + const c10::optional>& block_bucketize_pos); std::tuple< at::Tensor, diff --git a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu index d2f281fd49..7e18862538 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu @@ -147,7 +147,10 @@ block_bucketize_sparse_features_cuda( const int64_t my_size, const c10::optional& weights, const c10::optional& batch_size_per_feature, - const int64_t max_B) { + const int64_t max_B, + const c10::optional>& /*block_bucketize_pos*/ // Only used in CPU variant +) { TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(lengths, indices); 
at::cuda::OptionalCUDAGuard device_guard; diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index efdac1385e..8291927f45 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -284,7 +284,8 @@ void _block_bucketize_sparse_features_cpu( c10::optional new_weights, c10::optional new_pos, const c10::optional& unbucketize_permute, - const c10::optional& batch_size_per_feature) { + const c10::optional& batch_size_per_feature, + const c10::optional>& block_bucketize_pos) { // allocate tensors and buffers const auto lengths_size = lengths.numel(); const auto new_lengths_size = lengths_size * my_size; @@ -305,9 +306,10 @@ void _block_bucketize_sparse_features_cpu( const index_t* const block_sizes_data = block_sizes.data_ptr(); offset_t* batch_sizes_data = nullptr; const auto variable_batch_size = batch_size_per_feature.has_value(); - + const auto variable_bucket_sizes = block_bucketize_pos.has_value(); using uindex_t = std::make_unsigned_t; using uoffset_t = std::make_unsigned_t; + std::vector lower_bounds(indices.numel(), 0); if constexpr (sequence) { unbucketize_permute_data = unbucketize_permute.value().data_ptr(); @@ -331,6 +333,12 @@ void _block_bucketize_sparse_features_cpu( for (const auto t : c10::irange(T)) { const auto blk_size = block_sizes_data[t]; const auto cur_batch_size = variable_batch_size ? batch_sizes_data[t] : B; + const index_t* bucketize_offset = nullptr; + int64_t bucket_size = 0; + if (variable_bucket_sizes) { + bucketize_offset = block_bucketize_pos.value()[t].data_ptr(); + bucket_size = block_bucketize_pos.value()[t].numel(); + } for (const auto b : c10::irange(cur_batch_size)) { const auto b_t = (variable_batch_size ? cur_offset : t * B) + b; const offset_t rowstart = offsets_data[b_t]; @@ -343,10 +351,21 @@ void _block_bucketize_sparse_features_cpu( // range of blk_size, we expect the later embedding module to take care // of hashing indices calculation. uindex_t idx = static_cast(indices_data[i]); - uindex_t p = idx < static_cast(blk_size * my_size) - ? idx / blk_size - : idx % my_size; - new_lengths_data[p * lengths_size + b_t]++; + if (variable_bucket_sizes) { + int64_t lb = std::upper_bound( + bucketize_offset, + bucketize_offset + bucket_size, + indices_data[i]) - + bucketize_offset - 1; + lower_bounds[i] = lb; + uindex_t p = lb < my_size ? lb : idx % my_size; + new_lengths_data[p * lengths_size + b_t]++; + } else { + uindex_t p = idx < static_cast(blk_size * my_size) + ? idx / blk_size + : idx % my_size; + new_lengths_data[p * lengths_size + b_t]++; + } } } cur_offset += cur_batch_size; @@ -359,6 +378,10 @@ void _block_bucketize_sparse_features_cpu( for (const auto t : c10::irange(T)) { const auto blk_size = block_sizes_data[t]; const auto cur_batch_size = variable_batch_size ? batch_sizes_data[t] : B; + const index_t* bucketize_offset = nullptr; + if (variable_bucket_sizes) { + bucketize_offset = block_bucketize_pos.value()[t].data_ptr(); + } for (const auto b : c10::irange(cur_batch_size)) { const auto b_t = (variable_batch_size ? cur_offset : t * B) + b; const offset_t rowstart = offsets_data[b_t]; @@ -371,12 +394,19 @@ void _block_bucketize_sparse_features_cpu( // range of blk_size, we expect the later embedding module to take care // of hashing indices calculation. const uindex_t idx = static_cast(indices_data[i]); - const uindex_t p = idx < static_cast(blk_size * my_size) - ? 
idx / blk_size - : idx % my_size; - const uindex_t new_idx = idx < static_cast(blk_size * my_size) - ? idx % blk_size - : idx / my_size; + uindex_t p, new_idx; + if (variable_bucket_sizes) { + int64_t lb = lower_bounds[i]; + p = lb < my_size ? lb : idx % my_size; + new_idx = lb < my_size ? idx - bucketize_offset[lb] : idx / my_size; + + } else { + p = idx < static_cast(blk_size * my_size) ? idx / blk_size + : idx % my_size; + new_idx = idx < static_cast(blk_size * my_size) + ? idx % blk_size + : idx / my_size; + } const uoffset_t pos = new_offsets_data[p * lengths_size + b_t]; new_indices_data[pos] = new_idx; if (sequence) { @@ -911,8 +941,8 @@ block_bucketize_sparse_features_cpu( const int64_t my_size, const c10::optional& weights, const c10::optional& batch_size_per_feature, - const int64_t /* max_batch_size */ // Only used in GPU variant -) { + const int64_t /* max_batch_size */, // Only used in GPU variant + const c10::optional>& block_bucketize_pos) { const auto lengths_size = lengths.numel(); const auto new_lengths_size = lengths_size * my_size; auto new_lengths = at::zeros({new_lengths_size}, lengths.options()); @@ -959,7 +989,8 @@ block_bucketize_sparse_features_cpu( new_weights, new_pos, unbucketize_permute, - batch_size_per_feature); + batch_size_per_feature, + block_bucketize_pos); }); }); }); @@ -994,7 +1025,8 @@ block_bucketize_sparse_features_cpu( new_weights, new_pos, unbucketize_permute, - batch_size_per_feature); + batch_size_per_feature, + block_bucketize_pos); }); }); }); @@ -1027,7 +1059,8 @@ block_bucketize_sparse_features_cpu( new_weights, new_pos, unbucketize_permute, - batch_size_per_feature); + batch_size_per_feature, + block_bucketize_pos); }); }); } else { @@ -1055,7 +1088,8 @@ block_bucketize_sparse_features_cpu( new_weights, new_pos, unbucketize_permute, - batch_size_per_feature); + batch_size_per_feature, + block_bucketize_pos); }); }); } @@ -2702,7 +2736,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, SymInt output_size) -> Tensor"); m.def( - "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)"); + "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1, Tensor[]? block_bucketize_pos=None) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)"); m.def( "bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, SymInt my_size, Tensor? weights=None) -> (Tensor, Tensor, Tensor?, Tensor?)"); m.def( From 528f24d9188065e9589dca50260fa35eddc038c6 Mon Sep 17 00:00:00 2001 From: "Zhijing Li (Accelerator Enablement)" Date: Wed, 15 Nov 2023 23:33:26 -0800 Subject: [PATCH 23/73] Add test for fbgemm ops. (#2136) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2136 Add test for D50868649. Have to land the backend changes first. D50868649 put CUDA/C++ changes in one diff. Then put the python changes in this diff. 
Otherwise we will have compatibility check failure: https://www.internalfb.com/servicelab/experiment/2501798166/trial/2502750974 Reviewed By: jingsh Differential Revision: D51314019 fbshipit-source-id: 4bd9ebd09b0f2a4201f6fc87cfc0c6fdcb1e2392 --- fbgemm_gpu/test/failures_dict.json | 8 +++ fbgemm_gpu/test/sparse_ops_test.py | 110 +++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index 7a7ecb3334..fd909674ee 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -29,6 +29,10 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_aot_dispatch_dynamic__test_block_bucketize_sparse_features_with_block_bucketize_pos": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_aot_dispatch_dynamic__test_block_bucketize_sparse_features_with_variable_batch_sizes": { "comment": "", "status": "xfail" @@ -41,6 +45,10 @@ "comment": "", "status": "xfail" }, + "SparseOpsTest.test_faketensor__test_block_bucketize_sparse_features_with_block_bucketize_pos": { + "comment": "", + "status": "xfail" + }, "SparseOpsTest.test_faketensor__test_block_bucketize_sparse_features_with_variable_batch_sizes": { "comment": "", "status": "xfail" diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index d1f810bc6f..27730f418b 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -942,6 +942,116 @@ def test_block_bucketize_sparse_features_with_variable_batch_sizes( new_indices_gpu.cpu(), new_indices_ref, rtol=0, atol=0 ) + @given( + index_type=st.sampled_from([torch.int, torch.long]), + has_weight=st.booleans(), + bucketize_pos=st.booleans(), + sequence=st.booleans(), + ) + @settings(verbosity=Verbosity.verbose, max_examples=16, deadline=None) + def test_block_bucketize_sparse_features_with_block_bucketize_pos( + self, + index_type: Optional[torch.dtype], + has_weight: bool, + bucketize_pos: bool, + sequence: bool, + ) -> None: + """ + Test variable bucket size for block bucketize_sparse features for RW sharding. + E.g. Given bucket_sizes_pos as [[0,5,15], [0,10,13]] + For batch 0, indices in [0,5) will be assigned to bucket 0, indices in [5,15) will be assigned to bucket 1. + For batch 1, indices in [0,10) will be assigned to bucket 0, indices in [10,13) will be assigned to bucket 1. + The new index will be original index - bucket_sizes_pos[new_bucket_id-1] + i.e. for batch = 0, index = 12, it will be assigned to bucket 1 and the new index is 12 - 5 = 7. 
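# A runnable pure-Python restatement of the rule above (an illustrative
# sketch, not the fbgemm kernel; assign_bucket is a hypothetical helper).
# With pos = block_bucketize_pos[t], index i lands in the bucket b that
# satisfies pos[b] <= i < pos[b + 1] and gets new index i - pos[b]; indices
# past the last boundary fall back to (i % my_size, i // my_size), matching
# the C++ fallback in the kernel.
import bisect

def assign_bucket(i: int, pos: list, my_size: int) -> tuple:
    b = bisect.bisect_right(pos, i) - 1  # std::upper_bound(...) - 1
    if b < my_size:
        return b, i - pos[b]
    return i % my_size, i // my_size

assert assign_bucket(12, [0, 5, 15], 2) == (1, 7)  # the batch-0 example above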
+ """ + # For the following test case, we have + # batch 0: 2 (1,7), 1 (2), 1 (6) + # 1: bucket 0, new_idx 1 + # 7: bucket 1, new_idx 5 + # 2: bucket 1, new_idx 0 + # 6: bucket 1, new_idx 4 + + # batch 1: 2 (7,8) + # 7: bucket 1, new_idx 2 + # 8: bucket 1, new_idx 3 + + # batch 2: 0, 2 (8,4) + # 8: bucket 1, new_idx 1 + # 4: bucket 0, new_idx 4 + + # new_lengths for 0: 1, 0, 0, 0, 0, 1 + # new_indices for 0: 1| | | | | 4 + # new_lengths for 1: 1, 1, 1, 2, 0, 1 + # new_indices for 1: 5| 0| 4| 2,3| |1 + lengths = torch.tensor([2, 1, 1, 2, 0, 2], dtype=index_type) + indices = torch.tensor( + [1, 7, 2, 6, 7, 8, 8, 4], + dtype=index_type, + ) + batch_sizes = torch.tensor([3, 1, 2], dtype=index_type) + weights = ( + torch.tensor( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + dtype=torch.float, + ) + if has_weight + else None + ) + + block_sizes = torch.tensor([5, 10, 8], dtype=index_type) + my_size = 2 + max_B = batch_sizes.max().item() # unused + + block_bucketize_pos = [ + torch.tensor([0, 2, 8], dtype=index_type), + torch.tensor([0, 5, 10], dtype=index_type), + torch.tensor([0, 7, 12], dtype=index_type), + ] + + new_lengths_ref = torch.tensor( + [1, 0, 0, 0, 0, 1, 1, 1, 1, 2, 0, 1], + dtype=index_type, + ) + new_indices_ref = torch.tensor( + [1, 4, 5, 0, 4, 2, 3, 1], + dtype=index_type, + ) + new_weights_ref = torch.tensor( + [ + 1.0, + 8.0, + 2.0, + 3.0, + 4.0, + 5.0, + 6.0, + 7.0, + ], + dtype=torch.float, + ) + ( + new_lengths_cpu, + new_indices_cpu, + new_weights_cpu, + new_pos_cpu, + unbucketize_permute, + ) = torch.ops.fbgemm.block_bucketize_sparse_features( + lengths, + indices, + bucketize_pos, + sequence, + block_sizes, + my_size, + weights, + batch_sizes, + max_B, + block_bucketize_pos, + ) + torch.testing.assert_close(new_lengths_cpu, new_lengths_ref, rtol=0, atol=0) + torch.testing.assert_close(new_indices_cpu, new_indices_ref, rtol=0, atol=0) + if has_weight: + torch.testing.assert_close(new_weights_cpu, new_weights_ref) + @given( index_type=st.sampled_from([torch.int, torch.long]), has_weight=st.booleans(), From 5da5a1626e7b34d1dfdb82ee1f9936fdc177b553 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 16 Nov 2023 03:40:04 -0800 Subject: [PATCH 24/73] Re-organize layout_transform_ops (#2133) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2133 - Migrate layout_transform_ops into its own directory Reviewed By: spcyppt Differential Revision: D51290598 fbshipit-source-id: 99ff3f170b29b999a761f327f909053b3bf9f2a6 --- fbgemm_gpu/CMakeLists.txt | 16 ++++++++-------- .../src/{ => input_combine_ops}/input_combine.cu | 0 .../input_combine_cpu.cpp | 0 .../input_combine_gpu.cpp | 0 .../layout_transform_ops.cu | 9 +++------ .../layout_transform_ops_cpu.cpp | 0 .../layout_transform_ops_gpu.cpp | 5 ++--- fbgemm_gpu/src/{ => metric_ops}/metric_ops.cu | 0 fbgemm_gpu/src/{ => metric_ops}/metric_ops.h | 0 .../src/{ => metric_ops}/metric_ops_host.cpp | 0 10 files changed, 13 insertions(+), 17 deletions(-) rename fbgemm_gpu/src/{ => input_combine_ops}/input_combine.cu (100%) rename fbgemm_gpu/src/{ => input_combine_ops}/input_combine_cpu.cpp (100%) rename fbgemm_gpu/src/{ => input_combine_ops}/input_combine_gpu.cpp (100%) rename fbgemm_gpu/src/{ => layout_transform_ops}/layout_transform_ops.cu (99%) rename fbgemm_gpu/src/{ => layout_transform_ops}/layout_transform_ops_cpu.cpp (100%) rename fbgemm_gpu/src/{ => layout_transform_ops}/layout_transform_ops_gpu.cpp (99%) rename fbgemm_gpu/src/{ => metric_ops}/metric_ops.cu (100%) rename fbgemm_gpu/src/{ => metric_ops}/metric_ops.h 
(100%) rename fbgemm_gpu/src/{ => metric_ops}/metric_ops_host.cpp (100%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 9b1a118837..95ae263617 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -568,8 +568,8 @@ set(fbgemm_gpu_sources_static_cpu src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp - src/input_combine_cpu.cpp - src/layout_transform_ops_cpu.cpp + src/input_combine_ops/input_combine_cpu.cpp + src/layout_transform_ops/layout_transform_ops_cpu.cpp src/quantize_ops/quantize_ops_cpu.cpp src/quantize_ops/quantize_ops_meta.cpp src/sparse_ops/sparse_ops_cpu.cpp @@ -589,16 +589,16 @@ if(NOT FBGEMM_CPU_ONLY) codegen/embedding_bounds_check_host.cpp src/memory_utils/memory_utils.cpp src/memory_utils/memory_utils_ops.cpp - src/layout_transform_ops_gpu.cpp + src/layout_transform_ops/layout_transform_ops_gpu.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp src/quantize_ops/quantize_ops_gpu.cpp src/sparse_ops/sparse_ops_gpu.cpp src/split_embeddings_utils.cpp src/split_embeddings_cache/split_embeddings_cache_ops.cu - src/metric_ops_host.cpp + src/metric_ops/metric_ops_host.cpp src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp - src/input_combine_gpu.cpp + src/input_combine_ops/input_combine_gpu.cpp codegen/batch_index_select_dim0_host.cpp) if(NVML_LIB_PATH) @@ -633,7 +633,7 @@ if(NOT FBGEMM_CPU_ONLY) src/memory_utils/memory_utils_ops.cu src/embedding_inplace_ops/embedding_inplace_update.cu src/histogram_binning_calibration_ops.cu - src/input_combine.cu + src/input_combine_ops/input_combine.cu src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu src/jagged_tensor_ops/dense_to_jagged_forward.cu @@ -651,8 +651,8 @@ if(NOT FBGEMM_CPU_ONLY) src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu src/jagged_tensor_ops/jagged_unique_indices.cu src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu - src/layout_transform_ops.cu - src/metric_ops.cu + src/layout_transform_ops/layout_transform_ops.cu + src/metric_ops/metric_ops.cu src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu src/quantize_ops/quantize_bfloat16.cu diff --git a/fbgemm_gpu/src/input_combine.cu b/fbgemm_gpu/src/input_combine_ops/input_combine.cu similarity index 100% rename from fbgemm_gpu/src/input_combine.cu rename to fbgemm_gpu/src/input_combine_ops/input_combine.cu diff --git a/fbgemm_gpu/src/input_combine_cpu.cpp b/fbgemm_gpu/src/input_combine_ops/input_combine_cpu.cpp similarity index 100% rename from fbgemm_gpu/src/input_combine_cpu.cpp rename to fbgemm_gpu/src/input_combine_ops/input_combine_cpu.cpp diff --git a/fbgemm_gpu/src/input_combine_gpu.cpp b/fbgemm_gpu/src/input_combine_ops/input_combine_gpu.cpp similarity index 100% rename from fbgemm_gpu/src/input_combine_gpu.cpp rename to fbgemm_gpu/src/input_combine_ops/input_combine_gpu.cpp diff --git a/fbgemm_gpu/src/layout_transform_ops.cu b/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops.cu similarity index 99% rename from fbgemm_gpu/src/layout_transform_ops.cu rename to fbgemm_gpu/src/layout_transform_ops/layout_transform_ops.cu index 3b76e5b32c..40b796d23f 100644 --- a/fbgemm_gpu/src/layout_transform_ops.cu +++ 
b/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops.cu @@ -13,19 +13,16 @@ #include "fbgemm_gpu/cub_namespace_postfix.cuh" // clang-format on -#include "fbgemm_gpu/layout_transform_ops.cuh" -#include "fbgemm_gpu/sparse_ops.h" -#include "fbgemm_gpu/sparse_ops_utils.h" - #include #include #include #include #include - #include - #include "ATen/Parallel.h" +#include "fbgemm_gpu/layout_transform_ops.cuh" +#include "fbgemm_gpu/sparse_ops.h" +#include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; diff --git a/fbgemm_gpu/src/layout_transform_ops_cpu.cpp b/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops_cpu.cpp similarity index 100% rename from fbgemm_gpu/src/layout_transform_ops_cpu.cpp rename to fbgemm_gpu/src/layout_transform_ops/layout_transform_ops_cpu.cpp diff --git a/fbgemm_gpu/src/layout_transform_ops_gpu.cpp b/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops_gpu.cpp similarity index 99% rename from fbgemm_gpu/src/layout_transform_ops_gpu.cpp rename to fbgemm_gpu/src/layout_transform_ops/layout_transform_ops_gpu.cpp index 986b479bd3..19e757875f 100644 --- a/fbgemm_gpu/src/layout_transform_ops_gpu.cpp +++ b/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops_gpu.cpp @@ -6,12 +6,11 @@ * LICENSE file in the root directory of this source tree. */ -#include "fbgemm_gpu/sparse_ops.h" -#include "fbgemm_gpu/sparse_ops_utils.h" - #include #include #include +#include "fbgemm_gpu/sparse_ops.h" +#include "fbgemm_gpu/sparse_ops_utils.h" TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) { DISPATCH_TO_CUDA( diff --git a/fbgemm_gpu/src/metric_ops.cu b/fbgemm_gpu/src/metric_ops/metric_ops.cu similarity index 100% rename from fbgemm_gpu/src/metric_ops.cu rename to fbgemm_gpu/src/metric_ops/metric_ops.cu diff --git a/fbgemm_gpu/src/metric_ops.h b/fbgemm_gpu/src/metric_ops/metric_ops.h similarity index 100% rename from fbgemm_gpu/src/metric_ops.h rename to fbgemm_gpu/src/metric_ops/metric_ops.h diff --git a/fbgemm_gpu/src/metric_ops_host.cpp b/fbgemm_gpu/src/metric_ops/metric_ops_host.cpp similarity index 100% rename from fbgemm_gpu/src/metric_ops_host.cpp rename to fbgemm_gpu/src/metric_ops/metric_ops_host.cpp From f1bbb608d217f8089f0e678b17ee4c6b7d749d7f Mon Sep 17 00:00:00 2001 From: William Wen Date: Thu, 16 Nov 2023 11:08:39 -0800 Subject: [PATCH 25/73] Add opcheck tests to parts of quantize_ops_test.py (#2139) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2139 Add generated opcheck tests to `TestFP8RowwiseQuantizationConversion` in quantize_ops_test.py in order to fix `fbgemm::FloatToFP8RowwiseQuantized` and `fbgemm::FP8RowwiseQuantizedToFloat` Reviewed By: zou3519 Differential Revision: D51375136 fbshipit-source-id: 2683253b4b6f5e18e2792a27e19cad3fb1719833 --- fbgemm_gpu/test/failures_dict.json | 31 ++++++++++++++++++++++++++++ fbgemm_gpu/test/quantize_ops_test.py | 25 +++++++++++++++++++++- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index fd909674ee..964038bb68 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -2,6 +2,37 @@ "_description": "This is a dict containing failures for tests autogenerated by generate_opcheck_tests. 
For more details, please see https://docs.google.com/document/d/1Pj5HRZvdOq3xpFpbEjUZp2hBovhy7Wnxw14m6lF2154/edit", "_version": 1, "data": { + "fbgemm::FP8RowwiseQuantizedToFloat": {}, + "fbgemm::FloatToFP8RowwiseQuantized": { + "TestFP8RowwiseQuantizationConversion.test_aot_dispatch_dynamic__test_quantize_and_dequantize_op_fp8_rowwise": { + "comment": "", + "status": "xfail" + }, + "TestFP8RowwiseQuantizationConversion.test_faketensor__test_quantize_and_dequantize_op_fp8_rowwise": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::FloatToPaddedFP8RowwiseQuantized": { + "TestFP8RowwiseQuantizationConversion.test_aot_dispatch_dynamic__test_quantize_and_dequantize_op_padded_fp8_rowwise": { + "comment": "", + "status": "xfail" + }, + "TestFP8RowwiseQuantizationConversion.test_faketensor__test_quantize_and_dequantize_op_padded_fp8_rowwise": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::PaddedFP8RowwiseQuantizedToFloat": { + "TestFP8RowwiseQuantizationConversion.test_aot_dispatch_dynamic__test_quantize_and_dequantize_op_padded_fp8_rowwise": { + "comment": "", + "status": "xfail" + }, + "TestFP8RowwiseQuantizationConversion.test_faketensor__test_quantize_and_dequantize_op_padded_fp8_rowwise": { + "comment": "", + "status": "xfail" + } + }, "fbgemm::asynchronous_complete_cumsum": {}, "fbgemm::asynchronous_exclusive_cumsum": {}, "fbgemm::asynchronous_inclusive_cumsum": {}, diff --git a/fbgemm_gpu/test/quantize_ops_test.py b/fbgemm_gpu/test/quantize_ops_test.py index b2dac18a7e..493f35f9f8 100644 --- a/fbgemm_gpu/test/quantize_ops_test.py +++ b/fbgemm_gpu/test/quantize_ops_test.py @@ -9,7 +9,7 @@ import random import unittest from ctypes import c_float, c_int32, cast, POINTER, pointer -from typing import Dict, Tuple +from typing import Callable, Dict, List, Tuple import hypothesis.strategies as st import numpy as np @@ -32,6 +32,7 @@ fused_rowwise_nbit_quantize_reference, gpu_available, gpu_unavailable, + optests, symint_vector_unsupported, ) except Exception: @@ -45,6 +46,7 @@ fused_rowwise_nbit_quantize_reference, gpu_available, gpu_unavailable, + optests, symint_vector_unsupported, ) @@ -999,6 +1001,27 @@ def test_quantize_and_dequantize_op_cuda_large_nrows_bf16( torch.testing.assert_close(dequantized_data_gpu.cpu(), dequantized_data) +# e.g. "test_faketensor__test_cumsum": [unittest.expectedFailure] +# Please avoid putting tests here, you should put operator-specific +# skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json +# pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. +additional_decorators: Dict[str, List[Callable]] = { + "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [ + # This operator has been grandfathered in. We need to fix this test failure. + unittest.expectedFailure, + ], + "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [ + # This operator has been grandfathered in. We need to fix this test failure. + unittest.expectedFailure, + ], + "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add_jagged_output": [ + # This operator has been grandfathered in. We need to fix this test failure. 
+ unittest.expectedFailure, + ], +} + + +@optests.generate_opcheck_tests(additional_decorators=additional_decorators) class TestFP8RowwiseQuantizationConversion(unittest.TestCase): enable_logging: bool = False max_examples: int = 40 From a142e2064de172cb128ca086ca6438f92f99484e Mon Sep 17 00:00:00 2001 From: Wei Su Date: Fri, 17 Nov 2023 12:11:41 -0800 Subject: [PATCH 26/73] Add an auto-vectorization implementation for int4 CPU TBE kernel (#2077) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2077 Add an auto-vectorization implementation for int4 CPU-TBE kernel to accelerate ARM platform. Also adds relevant unit tests and benchmarks. This is based on my auto-vec tries D50142928 and bangshengtang's further optimizations including D50212552, D50213049, D50243725 and D50303751 Reviewed By: bangshengtang Differential Revision: D50289383 fbshipit-source-id: 11320ef18883dabd0739852ac8b083fb1bc2883b --- CMakeLists.txt | 23 ++- bench/EmbeddingSpMDMNBitBenchmark.cc | 179 ++++++++++++++++------- defs.bzl | 5 + fbgemm_gpu/CMakeLists.txt | 1 + include/fbgemm/Types.h | 41 +++++- include/fbgemm/Utils.h | 5 + src/EmbeddingSpMDMAutovec.cc | 204 +++++++++++++++++++++++++++ src/EmbeddingSpMDMAutovec.h | 49 +++++++ src/EmbeddingSpMDMNBit.cc | 29 ++++ src/Utils.cc | 4 + third_party/cpuinfo | 2 +- 11 files changed, 489 insertions(+), 53 deletions(-) create mode 100644 src/EmbeddingSpMDMAutovec.cc create mode 100644 src/EmbeddingSpMDMAutovec.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 134523e7d7..0b510717a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,18 +152,21 @@ get_filelist("get_fbgemm_inline_avx2_srcs(msvc=${MSVC_BOOL})" get_filelist("get_fbgemm_avx512_srcs(msvc=${MSVC_BOOL})" FBGEMM_AVX512_SRCS) get_filelist("get_fbgemm_inline_avx512_srcs(msvc=${MSVC_BOOL})" FBGEMM_AVX512_INLINE_SRCS) +get_filelist("get_fbgemm_autovec_srcs()" FBGEMM_AUTOVEC_SRCS) get_filelist("get_fbgemm_public_headers()" FBGEMM_PUBLIC_HEADERS) add_library(fbgemm_generic OBJECT ${FBGEMM_GENERIC_SRCS}) add_library(fbgemm_avx2 OBJECT ${FBGEMM_AVX2_SRCS} ${FBGEMM_AVX2_INLINE_SRCS}) add_library(fbgemm_avx512 OBJECT ${FBGEMM_AVX512_SRCS} ${FBGEMM_AVX512_INLINE_SRCS}) +add_library(fbgemm_autovec OBJECT ${FBGEMM_AUTOVEC_SRCS}) # Make libraries depend on defs.bzl add_custom_target(defs.bzl DEPENDS defs.bzl) add_dependencies(fbgemm_generic defs.bzl) add_dependencies(fbgemm_avx2 defs.bzl) add_dependencies(fbgemm_avx512 defs.bzl) +add_dependencies(fbgemm_autovec defs.bzl) # On Windows: # 1) Adding definition of ASMJIT_STATIC to avoid generating asmjit function @@ -186,6 +189,9 @@ if(MSVC) endif() target_compile_options(fbgemm_avx2 PRIVATE "/arch:AVX2") target_compile_options(fbgemm_avx512 PRIVATE "/arch:AVX512") + if(OpenMP_FOUND) + target_compile_options(fbgemm_autovec PRIVATE "/openmp:experimental") + endif() else(MSVC) string(APPEND CMAKE_CXX_FLAGS " -Wall") string(APPEND CMAKE_CXX_FLAGS " -Wextra") @@ -300,29 +306,40 @@ target_include_directories(fbgemm_avx512 BEFORE PRIVATE "${ASMJIT_SRC_DIR}/src" PRIVATE "${CPUINFO_SOURCE_DIR}/include") +target_include_directories(fbgemm_autovec BEFORE + PUBLIC $ + PUBLIC $ + PRIVATE "${ASMJIT_SRC_DIR}/src" + PRIVATE "${CPUINFO_SOURCE_DIR}/include") + if(FBGEMM_LIBRARY_TYPE STREQUAL "default") add_library(fbgemm $ $ - $) + $ + $) elseif(FBGEMM_LIBRARY_TYPE STREQUAL "shared") add_library(fbgemm SHARED $ $ - $) + $ + $) set_property(TARGET fbgemm_generic PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm_avx2 PROPERTY POSITION_INDEPENDENT_CODE ON) 
set_property(TARGET fbgemm_avx512 PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET fbgemm_autovec PROPERTY POSITION_INDEPENDENT_CODE ON) elseif(FBGEMM_LIBRARY_TYPE STREQUAL "static") add_library(fbgemm STATIC $ $ - $) + $ + $) #MSVC need to define FBGEMM_STATIC for fbgemm_generic also to #avoid generating _dllimport functions. target_compile_definitions(fbgemm_generic PRIVATE FBGEMM_STATIC) target_compile_definitions(fbgemm_avx2 PRIVATE FBGEMM_STATIC) target_compile_definitions(fbgemm_avx512 PRIVATE FBGEMM_STATIC) + target_compile_definitions(fbgemm_autovec PRIVATE FBGEMM_STATIC) target_compile_definitions(fbgemm PRIVATE FBGEMM_STATIC) else() message(FATAL_ERROR "Unsupported library type ${FBGEMM_LIBRARY_TYPE}") diff --git a/bench/EmbeddingSpMDMNBitBenchmark.cc b/bench/EmbeddingSpMDMNBitBenchmark.cc index 336fd6b565..d3bd928c70 100644 --- a/bench/EmbeddingSpMDMNBitBenchmark.cc +++ b/bench/EmbeddingSpMDMNBitBenchmark.cc @@ -26,6 +26,7 @@ #include "./BenchUtils.h" #include "fbgemm/Fbgemm.h" #include "fbgemm/FbgemmConvert.h" +#include "src/EmbeddingSpMDMAutovec.h" #include "src/RefImplementations.h" using namespace std; @@ -133,11 +134,13 @@ int run_benchmark( } vector output_sls_ref(batch_size * embedding_dim); + vector output_sls_autovec(output_sls_ref.size()), + output_slws_autovec(output_sls_ref.size()); vector output_slws_ref(output_sls_ref.size()), output_sls(output_sls_ref.size()), output_slws(output_sls_ref.size()); - constexpr int NUM_WARMUP = 4; - constexpr int NUM_ITER = 10; + constexpr int NUM_WARMUP = 10; + constexpr int NUM_ITER = 100; // Only counts the number of bytes for reading embedding table and ignore // others. Should be good enough as long as embdding_dim is big enough. double bytes = lengths_sum * fused_embedding_dim; @@ -148,36 +151,10 @@ int run_benchmark( for (bool has_weight : {false, true}) { vector& output_ref = has_weight ? output_slws_ref : output_sls_ref; + vector& output_autovec = + has_weight ? output_slws_autovec : output_sls_autovec; - bool success = false, success_ref = false; - - if (use_32_bit_indices) { - success_ref = EmbeddingSpMDMNBit_ref( - bit_rate, - embedding_dim, - batch_size, - lengths_sum, - num_rows, - fused_embedding_table.data(), - indices_32.data(), - offsets.data(), - has_weight ? weights.data() : nullptr, - normalize_by_lengths, - output_ref.data()); - } else { - success = EmbeddingSpMDMNBit_ref( - bit_rate, - embedding_dim, - batch_size, - lengths_sum, - num_rows, - fused_embedding_table.data(), - indices.data(), - offsets.data(), - has_weight ? weights.data() : nullptr, - normalize_by_lengths, - output_ref.data()); - } + bool success = false, success_ref = false, success_autovec = false; auto kernel_32 = GenerateEmbeddingSpMDMNBit( bit_rate, @@ -194,6 +171,95 @@ int run_benchmark( vector& output = has_weight ? output_slws : output_sls; for (bool flush_cache : {false, true}) { + // Reference implementation + double t_ref = measureWithWarmup( + [&]() { + if (use_32_bit_indices) { + success_ref = EmbeddingSpMDMNBit_ref( + bit_rate, + embedding_dim, + batch_size, + lengths_sum, + num_rows, + fused_embedding_table.data(), + indices_32.data(), + offsets.data(), + has_weight ? weights.data() : nullptr, + normalize_by_lengths, + output_ref.data()); + } else { + success_ref = EmbeddingSpMDMNBit_ref( + bit_rate, + embedding_dim, + batch_size, + lengths_sum, + num_rows, + fused_embedding_table.data(), + indices.data(), + offsets.data(), + has_weight ? 
weights.data() : nullptr, + normalize_by_lengths, + output_ref.data()); + } + }, + NUM_WARMUP, + NUM_ITER, + [&]() { + if (flush_cache) { + cache_evict(fused_embedding_table); + cache_evict(indices); + cache_evict(indices_32); + cache_evict(offsets); + cache_evict(weights); + cache_evict(output); + } + }); + + // Auto-vectorization implementation + double t_autovec = measureWithWarmup( + [&]() { + if (use_32_bit_indices) { + success_autovec = EmbeddingSpMDMNBit_autovec( + bit_rate, + embedding_dim, + batch_size, + lengths_sum, + num_rows, + fused_embedding_table.data(), + indices_32.data(), + offsets.data(), + has_weight ? weights.data() : nullptr, + normalize_by_lengths, + output_autovec.data()); + } else { + success_autovec = EmbeddingSpMDMNBit_autovec( + bit_rate, + embedding_dim, + batch_size, + lengths_sum, + num_rows, + fused_embedding_table.data(), + indices.data(), + offsets.data(), + has_weight ? weights.data() : nullptr, + normalize_by_lengths, + output_autovec.data()); + } + }, + NUM_WARMUP, + NUM_ITER, + [&]() { + if (flush_cache) { + cache_evict(fused_embedding_table); + cache_evict(indices); + cache_evict(indices_32); + cache_evict(offsets); + cache_evict(weights); + cache_evict(output); + } + }); + + // Hand-written AVX2/AVX512 implementation double t = measureWithWarmup( [&]() { if (use_32_bit_indices) { @@ -251,37 +317,55 @@ int run_benchmark( // has_weight ? output_slws_ref : output_sls_ref; if (success != success_ref) { assert( - false && "ERROR: refernce impl and JIT imp did not both succeed"); + false && + "ERROR: reference impl and JIT imp did not both succeed"); + } else if (success != success_autovec) { + assert( + false && + "ERROR: reference impl and auto-vec impl did not both succeed"); } else if (success) { for (size_t i = 0; i < output.size(); ++i) { assert(fabs(output[i] - output_ref[i]) < 1e-3); + assert(fabs(output_autovec[i] - output_ref[i]) < 1e-3); if (fabs(output[i] - output_ref[i]) >= 1e-3) { - cout << i << " " << output[i] << " " << output_ref[i] << endl; + cout << "asmjit vs ref : " << i << " " << output[i] << " " + << output_ref[i] << endl; + } + if (fabs(output_autovec[i] - output_ref[i]) >= 1e-3) { + cout << "autovec vec ref: " << i << " " << output_autovec[i] + << " " << output_ref[i] << endl; } } } } if (has_weight) { - cout << setw(16) << "SLW(WEIGHTED) "; + cout << "SLW(WEIGHTED), "; } else { - cout << setw(16) << "SLS "; + cout << "SLS, "; } if (flush_cache) { - cout << setw(20) << "cache flushed"; + cout << "cache flushed, "; } else { - cout << setw(20) << "cache not flushed"; + cout << "cache not flushed, "; } if (prefetch) { - cout << setw(16) << "prefetch on"; + cout << "prefetch on, "; } else { - cout << setw(16) << "prefetch off"; + cout << "prefetch off, "; } - cout << setw(8) << "b/w" << setw(10) << bytes / 1e9 / t << " GB/s" - << setw(20) << "effective b/w: " << setw(16) - << bytes_padded / 1e9 / t << "GB/s" << setw(8) << " time " - << setw(16) << t << endl; + cout << "b/w, " << bytes / 1e9 / t << ", GB/s, " + << "effective b/w, " << bytes_padded / 1e9 / t << ", GB/s, " + << "time, " << t << ", autovec b/w, " << bytes / 1e9 / t_autovec + << ", GB/s, " + << "autovec eff. b/w, " << bytes_padded / 1e9 / t_autovec + << ", GB/s, " + << "autovec time, " << t_autovec << ", ref b/w, " + << bytes / 1e9 / t_ref << ", GB/s, " + << "ref eff. 
b/w, " << bytes_padded / 1e9 / t_ref << ", GB/s, " + << "ref time, " << t_ref << ", autovec speedup, " + << t_ref / t_autovec << ", asmjit speedup, " << t_ref / t << endl; } // flush_cache } // has_weight return 0; @@ -295,7 +379,7 @@ int main() { vector> inputs(GetInputs_()); - for (int bit_rate : {2, 4}) { + for (int bit_rate : {4, 2}) { for (auto& input : inputs) { assert(input.size() > 3); batch_size = input[0]; @@ -303,10 +387,9 @@ int main() { embedding_dim = input[2]; average_len = input[3]; - cout << "bit_rate" << setw(6) << bit_rate << "batch size" << setw(6) - << batch_size << setw(10) << "num rows" << setw(16) << num_rows - << setw(10) << "emb dim" << setw(6) << embedding_dim << setw(16) - << "avg length" << setw(6) << average_len << endl; + cout << "bit_rate, " << bit_rate << ", batch size, " << batch_size + << ", num rows, " << num_rows << ", emb dim, " << embedding_dim + << ", avg length, " << average_len << endl; // args: batch sz, num rows, emb dim, avg len, normalize, use 32b, // prefetch cout << "64 bit indices, "; diff --git a/defs.bzl b/defs.bzl index d7dcbd30ec..6aadcc25b8 100644 --- a/defs.bzl +++ b/defs.bzl @@ -138,5 +138,10 @@ def get_fbgemm_inline_avx512_srcs(msvc = False, buck = False): }) return asm_srcs if not msvc else intrinsics_srcs +def get_fbgemm_autovec_srcs(): + return [ + "src/EmbeddingSpMDMAutovec.cc", + ] + def get_fbgemm_tests(skip_tests = []): return native.glob(["test/*Test.cc"], exclude = skip_tests) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 95ae263617..0bbd8a40f8 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -487,6 +487,7 @@ endif() set(fbgemm_sources_normal "${FBGEMM}/src/EmbeddingSpMDM.cc" + "${FBGEMM}/src/EmbeddingSpMDMAutovec.cc" "${FBGEMM}/src/EmbeddingSpMDMNBit.cc" "${FBGEMM}/src/QuantUtils.cc" "${FBGEMM}/src/RefImplementations.cc" diff --git a/include/fbgemm/Types.h b/include/fbgemm/Types.h index e32a3cdc0d..c6ca8a9281 100644 --- a/include/fbgemm/Types.h +++ b/include/fbgemm/Types.h @@ -12,6 +12,25 @@ #include #include +#ifndef __is_identifier +#define __is_identifier(x) 1 +#endif + +#define __has_keyword(__x) !(__is_identifier(__x)) + +// TODO: we're disabling native fp16 on Windows to workaround test failures +// due to "undefined symbol __gnu_h2f_ieee" error. We should follup on this +// later. 
+#if __has_keyword(__fp16) && !defined(_WIN32) +#define HAS_NATIVE_FP16_TYPE +typedef __fp16 native_fp16_t; +#elif __has_keyword(_Float16) && !defined(_WIN32) +#define HAS_NATIVE_FP16_TYPE +typedef _Float16 native_fp16_t; +#else +typedef void native_fp16_t; +#endif + namespace fbgemm { using float16 = std::uint16_t; @@ -161,7 +180,7 @@ static inline float16 cpu_float2half_rz(float f) { // Converts a 16-bit unsigned integer representation of a IEEE754 half-precision // float into an IEEE754 32-bit single-precision float -static inline float cpu_half2float(const float16 h) { +inline float cpu_half2float_ref(const float16 h) { // Get sign and exponent alone by themselves uint32_t sign_bit = (h >> f16_num_non_sign_bits) & 1; uint32_t exponent = (h >> f16_num_mantissa_bits) & f16_exponent_mask; @@ -198,6 +217,18 @@ static inline float cpu_half2float(const float16 h) { return ret; } +// Same as the previous function, but use the built-in fp16 to fp32 +// conversion provided by the compiler +inline float cpu_half2float(const float16 h) { +#ifdef HAS_NATIVE_FP16_TYPE + __fp16 h_fp16; + std::memcpy(&h_fp16, &h, sizeof(__fp16)); + return h_fp16; +#else + return cpu_half2float_ref(h); +#endif +} + static inline float cpu_bf162float(bfloat16 src) { float ret; uint32_t val_fp32 = @@ -212,4 +243,12 @@ static inline bfloat16 cpu_float2bfloat16(float src) { return (temp + (1u << 15)) >> 16; } +inline int64_t round_up(int64_t val, int64_t unit) { + return (val + unit - 1) / unit * unit; +} + +inline int64_t div_up(int64_t val, int64_t unit) { + return (val + unit - 1) / unit; +} + } // namespace fbgemm diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h index a9a4f13021..5dc7a5d791 100644 --- a/include/fbgemm/Utils.h +++ b/include/fbgemm/Utils.h @@ -146,6 +146,11 @@ FBGEMM_API bool fbgemmHasAvx512VnniSupport(); */ FBGEMM_API bool fbgemmHasArmNeonSupport(); +/** + * @brief Are we running on a ARM SVE2 supported cpu? + */ +FBGEMM_API bool fbgemmHasArmSve2Support(); + /** * @brief Retrieve current CPU instruction set */ diff --git a/src/EmbeddingSpMDMAutovec.cc b/src/EmbeddingSpMDMAutovec.cc new file mode 100644 index 0000000000..d060d65604 --- /dev/null +++ b/src/EmbeddingSpMDMAutovec.cc @@ -0,0 +1,204 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#define FBGEMM_EXPORTS +#include "./EmbeddingSpMDMAutovec.h" + +#include "fbgemm/FbgemmBuild.h" + +#include +#include +#include +#include +#include +#include + +using std::vector; + +namespace fbgemm { + +template +bool EmbeddingSpMDMNBit_autovec( + const int bit_rate, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, + const uint8_t* input, + const IndexType* indices, + const OffsetType* offsets_or_lengths, + const float* weights, // optional, can be null for non-weighted sum + const bool normalize_by_lengths, + OutType* out, + const bool is_weight_positional /*=false*/, + const bool use_offsets /*=true*/, + int64_t output_stride /*=-1*/, + int64_t input_stride /*=-1*/, + const bool scale_bias_last /*=true*/, + const bool is_bf16_out /*=false*/) { + assert((bit_rate == 2 || bit_rate == 4) && "bit_rate must be 2 or 4"); + const int num_elem_per_byte = 8 / bit_rate; + + if (output_stride == -1) { + output_stride = block_size; + } + + // block_size is the number of elements and fused_block_size is the size of + // an entire row, including scale and bias. + const auto scale_bias_offset = 2 * sizeof(float16); + if (input_stride == -1) { + input_stride = div_up(block_size, num_elem_per_byte) + scale_bias_offset; + } + int64_t current = 0; + const int64_t rounded_bs = round_up(block_size, num_elem_per_byte); + vector buf(rounded_bs); + for (int m = 0; m < output_size; ++m) { + memset(buf.data(), 0, sizeof(float) * rounded_bs); + int len = use_offsets ? offsets_or_lengths[m + 1] - offsets_or_lengths[m] + : offsets_or_lengths[m]; + if (current + len > index_size) { + return false; + } + constexpr int tile_size = 4; +#if _OPENMP >= 202011 +#pragma omp tile sizes(tile_size) +#endif + for (int i = 0; i < len; ++i) { + int64_t idx = indices[current]; + int64_t prefetch_idx = + indices[std::min(current + tile_size, index_size - 1)]; + do_prefetch( + reinterpret_cast(input + input_stride * prefetch_idx), + 1); + if (idx < 0 || idx >= data_size) { + return false; + } + + const float16* scale_bias = reinterpret_cast( + input + input_stride * idx + + (scale_bias_last ? div_up(block_size, num_elem_per_byte) : 0)); + + float weight = 1.0f; + if (weights) { + weight = weights[is_weight_positional ? i : current]; + } + const float scale = weight * cpu_half2float(scale_bias[0]); + const float bias = weight * cpu_half2float(scale_bias[1]); + + const int64_t offset = + input_stride * idx + (scale_bias_last ? 
+      if (bit_rate == 4) {
+        const size_t halfbufsz = (block_size + 1) / 2;
+        assert(halfbufsz > 0);
+        thread_local vector<float> buf1;
+        thread_local vector<float> buf2;
+        buf1.resize(halfbufsz);
+        buf2.resize(halfbufsz);
+#pragma omp simd
+        for (size_t j = 0; j < halfbufsz; ++j) {
+          uint8_t quantized1 = input[offset + j] & 0xf;
+          uint8_t quantized2 = input[offset + j] & 0xf0;
+          quantized2 = quantized2 >> 4;
+          buf1[j] = quantized1;
+          buf2[j] = quantized2;
+        }
+#pragma omp simd
+        for (size_t j = 0; j < halfbufsz; ++j) {
+          buf[j * 2] = std::fma(scale, buf1[j], buf[j * 2] + bias);
+          buf[j * 2 + 1] = std::fma(scale, buf2[j], buf[j * 2 + 1] + bias);
+        }
+      } else if (bit_rate == 2) {
+        size_t qbufsz = (block_size + 3) / 4;
+        vector<vector<float>> qbufs(4);
+        for (int k = 0; k < 4; ++k) {
+          qbufs[k].resize(qbufsz);
+        }
+        vector<uint8_t> masks{0x3, 0xC, 0x30, 0xC0};
+#pragma omp simd collapse(2)
+        for (size_t j = 0; j < qbufsz; ++j) {
+          for (size_t k = 0; k < 4; ++k) {
+            uint8_t quantized = input[offset + j] & masks[k];
+            quantized = quantized >> (k * 2);
+            qbufs[k][j] = quantized;
+          }
+        }
+#pragma omp simd collapse(2)
+        for (size_t j = 0; j < qbufsz; ++j) {
+          for (size_t k = 0; k < 4; ++k) {
+            buf[j * 4 + k] =
+                std::fma(scale, qbufs[k][j], buf[j * 4 + k] + bias);
+          }
+        }
+      }
+      ++current;
+    }
+
+    if (normalize_by_lengths && len) {
+      float scale = 1.f / len;
+#pragma omp simd
+      for (int j = 0; j < block_size; ++j) {
+        buf[j] *= scale;
+      }
+    }
+    if (std::is_same<OutType, float>::value) {
+#pragma omp simd
+      for (int j = 0; j < block_size; ++j) {
+        out[j] = buf[j];
+      }
+    } else if (std::is_same<OutType, float16>::value && is_bf16_out) {
+#pragma omp simd
+      for (int j = 0; j < block_size; ++j) {
+        out[j] = cpu_float2bfloat16(buf[j]);
+      }
+    } else {
+#pragma omp simd
+      for (int j = 0; j < block_size; ++j) {
+        out[j] = cpu_float2half_rz(buf[j]);
+      }
+    }
+    out += output_stride;
+  }
+  return current == index_size;
+}
+
+#define INSTANTIATE_SPMDM_BASE(INDEX_TYPE, OFFSET_TYPE, OUT_TYPE) \
+  template FBGEMM_API bool EmbeddingSpMDMNBit_autovec(            \
+      const int bit_rate,                                         \
+      const int64_t block_size,                                   \
+      const int64_t output_size,                                  \
+      const int64_t index_size,                                   \
+      const int64_t data_size,                                    \
+      const uint8_t* input,                                       \
+      const INDEX_TYPE* indices,                                  \
+      const OFFSET_TYPE* offsets_or_lengths,                      \
+      const float* weights,                                       \
+      const bool normalize_by_lengths,                            \
+      OUT_TYPE* out,                                              \
+      const bool is_weight_positional,                            \
+      const bool use_offsets,                                     \
+      int64_t output_stride,                                      \
+      int64_t input_stride,                                       \
+      const bool scale_bias_last,                                 \
+      const bool is_bf16_out);
+
+#define INSTANTIATE_SPMDM_OUT_T(INDEX_TYPE, OFFSET_TYPE)  \
+  INSTANTIATE_SPMDM_BASE(INDEX_TYPE, OFFSET_TYPE, float)  \
+  INSTANTIATE_SPMDM_BASE(INDEX_TYPE, OFFSET_TYPE, float16)
+
+#define INSTANTIATE_SPMDM_OFFSET_T(INDEX_TYPE) \
+  INSTANTIATE_SPMDM_OUT_T(INDEX_TYPE, int32_t) \
+  INSTANTIATE_SPMDM_OUT_T(INDEX_TYPE, int64_t)
+
+INSTANTIATE_SPMDM_OFFSET_T(int32_t)
+INSTANTIATE_SPMDM_OFFSET_T(int64_t)
+
+#undef INSTANTIATE_SPMDM_OFFSET_T
+#undef INSTANTIATE_SPMDM_OUT_T
+#undef INSTANTIATE_SPMDM_BASE
+
+} // namespace fbgemm
diff --git a/src/EmbeddingSpMDMAutovec.h b/src/EmbeddingSpMDMAutovec.h
new file mode 100644
index 0000000000..cad66f4e80
--- /dev/null
+++ b/src/EmbeddingSpMDMAutovec.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+#include "fbgemm/ConvUtils.h"
+#include "fbgemm/FbgemmI8Spmdm.h"
+#include "fbgemm/Types.h"
+
+#ifdef _WIN32
+#define do_prefetch(...)
+#else
+#define do_prefetch(...) __builtin_prefetch(__VA_ARGS__)
+#endif
+
+namespace fbgemm {
+
+template <
+    typename IndexType = std::int64_t,
+    typename OffsetType = std::int32_t,
+    typename OutType = float>
+FBGEMM_API bool EmbeddingSpMDMNBit_autovec(
+    const int bit_rate,
+    const std::int64_t block_size,
+    const std::int64_t output_size,
+    const std::int64_t index_size,
+    const std::int64_t data_size,
+    const std::uint8_t* input,
+    const IndexType* indices,
+    const OffsetType* offsets_or_lengths,
+    const float* weights, // optional, can be null for non-weighted sum
+    const bool normalize_by_lengths,
+    OutType* out,
+    const bool is_weight_positional = false,
+    const bool use_offsets = true,
+    std::int64_t output_stride = -1,
+    std::int64_t input_stride = -1,
+    const bool scale_bias_last = true,
+    const bool is_bf16_out = false);
+
+} // namespace fbgemm
diff --git a/src/EmbeddingSpMDMNBit.cc b/src/EmbeddingSpMDMNBit.cc
index 174f02aec8..9d9cadda97 100644
--- a/src/EmbeddingSpMDMNBit.cc
+++ b/src/EmbeddingSpMDMNBit.cc
@@ -20,6 +20,7 @@
 #include
 #include
 #include "./CodeCache.h"
+#include "./EmbeddingSpMDMAutovec.h"
 #include "./MaskAvx2.h"
 #include "./RefImplementations.h"
 #include "fbgemm/SimdUtils.h"
@@ -1125,6 +1126,34 @@ typename EmbeddingSpMDMKernelSignature<uint8_t, indxType, offsetType, outType>::
         out,
         internal::avx2_ps_or_epi32_combined_mask);
   };
+  } else if (fbgemmHasArmSve2Support()) {
+    return [=](int64_t output_size,
+               int64_t index_size,
+               int64_t data_size,
+               const uint8_t* input,
+               const indxType* indices,
+               const offsetType* offsets_or_lengths,
+               const float* weights,
+               outType* out) {
+      return EmbeddingSpMDMNBit_autovec(
+          bit_rate,
+          block_size,
+          output_size,
+          index_size,
+          data_size,
+          input,
+          indices,
+          offsets_or_lengths,
+          weights,
+          normalize_by_lengths,
+          out,
+          is_weight_positional,
+          use_offsets,
+          output_stride,
+          input_stride,
+          scale_bias_last,
+          is_bf16_out);
+    };
   } else {
 #ifdef VLOG
     VLOG(0) << "AVX2 or AVX512 not found, taking the slow path";
diff --git a/src/Utils.cc b/src/Utils.cc
index c61b1a4dd1..9f66b23c09 100644
--- a/src/Utils.cc
+++ b/src/Utils.cc
@@ -386,6 +386,10 @@ bool fbgemmHasArmNeonSupport() {
   return (cpuinfo_has_arm_neon());
 }
 
+bool fbgemmHasArmSve2Support() {
+  return (cpuinfo_has_arm_sve2());
+}
+
 void fbgemmPartition1D(
     int thread_id,
     int num_threads,
diff --git a/third_party/cpuinfo b/third_party/cpuinfo
index ed8b86a253..76d5e8f5b5 160000
--- a/third_party/cpuinfo
+++ b/third_party/cpuinfo
@@ -1 +1 @@
-Subproject commit ed8b86a253800bafdb7b25c5c399f91bff9cb1f3
+Subproject commit 76d5e8f5b563daa65340a60fce0e9aec73a936df

From ad2aca20d852056b9ff8c7c4e58ad8fbb64ffff1 Mon Sep 17 00:00:00 2001
From: Supadchaya Puangpontip
Date: Sat, 18 Nov 2023 01:10:55 -0800
Subject: [PATCH 27/73] Fix illegal memory access error on fp8 quantize kernel
 (#2131)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2131

This diff addresses illegal memory access issues with the FP8 quantization kernel on large batch training on APS, S373293. Specifically, it fails in `_FP8rowwise_to_float_gpu_t` when dequantizing a large tensor. The issue is due to index overflow when the tensor size is large. We fix this by using `PackedTensorAccessor64` and `int64_t`.

This also fixes the test result mismatch (e.g., P869606114).
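
The root cause is plain 32-bit index arithmetic: with `int` indices, the flattened offset `row * ncols` wraps around once it exceeds 2^31 - 1, so the kernel reads and writes out of bounds. A minimal standalone sketch of the failure mode (the sizes below are made up for illustration, not taken from the failing job):

    #include <cstdint>
    #include <iostream>

    int main() {
      // Hypothetical large tensor: row 40,000 of a 65,536-byte-wide matrix.
      const int row = 40000;
      const int ncols = 65536;
      // A 32-bit offset wraps around (roughly what the old kernel computed)...
      const int32_t bad = static_cast<int32_t>(int64_t{row} * ncols);
      // ...while the 64-bit offset used after this fix stays correct.
      const int64_t good = int64_t{row} * ncols;
      std::cout << bad << " vs " << good << "\n"; // -1673527296 vs 2621440000
      return 0;
    }

`PackedTensorAccessor64` enforces the same discipline on the accessor side by carrying 64-bit sizes and strides.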
Reviewed By: sryap

Differential Revision: D51187587

fbshipit-source-id: 612c9ef5d8096547e8e2362824a7ce72d3d44ca6
---
 .../src/quantize_ops/quantize_fp8_rowwise.cu  | 130 ++++++++++--------
 1 file changed, 75 insertions(+), 55 deletions(-)

diff --git a/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu b/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu
index cde36c3d55..f8ae2fa3fd 100644
--- a/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu
+++ b/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu
@@ -20,11 +20,14 @@ namespace {
 
 // FP32/FP16 -> FP8 rowwise kernel
 template <typename input_t>
 __global__ inline void _float_to_FP8rowwise_cuda_kernel(
-    const input_t* __restrict__ input,
+    const at::PackedTensorAccessor64<input_t, 1, at::RestrictPtrTraits> input,
     const int64_t nrows,
     const int64_t ncols,
-    std::uint8_t* __restrict__ output,
+    at::PackedTensorAccessor64<uint8_t, 1, at::RestrictPtrTraits> output,
     const bool forward) {
+  // Assert if index is out of bound
+  CUDA_KERNEL_ASSERT(nrows * ncols >= 0);
+
   constexpr float kEpsilon = 1e-20f;
   const int ebit = forward ? 4 : 5;
   const int bias = forward ? 15 : 31;
@@ -36,14 +39,13 @@ __global__ inline void _float_to_FP8rowwise_cuda_kernel(
   const int64_t row = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (row < nrows) {
-    const input_t* input_row = input + row * ncols;
-    std::uint8_t* output_row = output + row * output_columns;
+    const input_t* input_row = &input[row * ncols];
+    std::uint8_t* output_row = &output[row * output_columns];
     float* output_row_scale_bias =
         reinterpret_cast<float*>(output_row + ncols_aligned);
     const float minimum_element =
         fbgemm_gpu::min(input_row, input_row + ncols);
     const float maximum_element =
         fbgemm_gpu::max(input_row, input_row + ncols);
-
     const auto scale =
         max_pos / (kEpsilon + fmaxf(maximum_element, -minimum_element));
     output_row_scale_bias[0] = scale;
@@ -56,16 +58,18 @@ __global__ inline void _float_to_FP8rowwise_cuda_kernel(
 
 template <typename input_t>
 __global__ inline void _get_FP8_qparam_cuda_kernel(
-    const input_t* __restrict__ input,
+    const at::PackedTensorAccessor64<input_t, 1, at::RestrictPtrTraits> input,
     const int64_t nrows,
     const int64_t ncols,
-    uint8_t* __restrict__ output,
-    float* __restrict__ range_list,
+    at::PackedTensorAccessor64<uint8_t, 1, at::RestrictPtrTraits> output,
     const bool forward) {
+  // Assert if index is out of bound
+  CUDA_KERNEL_ASSERT(nrows * ncols >= 0);
   const int64_t row = blockIdx.x * blockDim.y + threadIdx.y;
 
   const int64_t ncols_aligned = (ncols + 4 - 1) / 4 * 4;
   const int64_t output_columns = ncols_aligned + 2 * sizeof(float);
+
   float max_pos;
   if (forward) {
     max_pos = 0.9375;
@@ -82,8 +86,7 @@ __global__ inline void _get_FP8_qparam_cuda_kernel(
   // March warp-wise through the row, doing thread local min and max reductions.
   // This loop will only execute once when ncol <= 32
   if (row < nrows) {
-    const input_t* const input_row = input + row * ncols;
-
+    const input_t* input_row = &input[row * ncols];
     for (int64_t col = threadIdx.x; col < ncols; col += lane_width) {
       // Get thread-local minmax. These are the smallest min and max ever seen
      // by this thread.
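
For orientation, the kernels in this file share one rowwise layout: each output row holds `ncols` quantized bytes, padded up to a 4-byte boundary, followed by two floats of quantization parameters. A host-side sketch of that layout and of the scale computation, using a hypothetical helper that is not part of fbgemm_gpu (illustrative only; the real per-element conversion is the device function `float_to_hfp8`, and `max_pos` depends on the FP8 format selected by `forward`):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Hypothetical helper, not part of fbgemm_gpu: builds one FP8 rowwise row.
    std::vector<uint8_t> fp8_rowwise_layout(const std::vector<float>& in,
                                            float max_pos) {
      const int64_t ncols = static_cast<int64_t>(in.size());
      const int64_t ncols_aligned = (ncols + 4 - 1) / 4 * 4;
      // Quantized bytes first, then the two float qparams (scale, bias).
      std::vector<uint8_t> row(ncols_aligned + 2 * sizeof(float));
      float amax = 0.f;
      for (const float v : in) {
        amax = std::max(amax, std::fabs(v));
      }
      const float scale = max_pos / (1e-20f + amax); // kEpsilon guards amax == 0
      std::memcpy(row.data() + ncols_aligned, &scale, sizeof(float));
      // Each in[i] would be quantized as float_to_hfp8(in[i] * scale, ...)
      // into row[i]; that step is device-only and omitted here.
      return row;
    }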
@@ -106,19 +109,21 @@ __global__ inline void _get_FP8_qparam_cuda_kernel(
     return;
   }
   float* const output_row_qparams =
-      reinterpret_cast<float*>(output + row * output_columns + ncols_aligned);
+      reinterpret_cast<float*>(&output[row * output_columns + ncols_aligned]);
 
   output_row_qparams[0] = max_pos / (kEpsilon + maximum_element);
 }
 
 template <typename input_t>
 __global__ inline void _compute_FP8_quantize_cuda_kernel(
-    const input_t* const __restrict__ input,
-    const float* const __restrict__ range_list,
+    const at::PackedTensorAccessor64<input_t, 1, at::RestrictPtrTraits> input,
     const int64_t nrows,
     const int64_t ncols,
-    std::uint8_t* const __restrict__ output,
+    at::PackedTensorAccessor64<uint8_t, 1, at::RestrictPtrTraits> output,
     const bool forward) {
+  // Assert if index is out of bound
+  CUDA_KERNEL_ASSERT(nrows * ncols >= 0);
+
   int ebit;
   int bias;
   float max_pos;
@@ -139,42 +144,41 @@ __global__ inline void _compute_FP8_quantize_cuda_kernel(
   const int64_t col = blockIdx.x * blockDim.x + threadIdx.x;
   const int64_t row_incre = blockDim.y * gridDim.y;
   for (/*row*/; row < nrows; row += row_incre) {
+    std::uint8_t* output_row = &output[row * output_columns];
     if (col < ncols) {
-      float* row_qparams = reinterpret_cast<float*>(
-          output + row * output_columns + ncols_aligned);
+      float* row_qparams = reinterpret_cast<float*>(output_row + ncols_aligned);
       const float scale = row_qparams[0];
-      const auto input_idx = row * ncols + col;
-      uint8_t* output_addr = output + row * output_columns + col;
-      // TODO: lift range_list into shared memory. However, when nrows is large,
-      // it might exceed the size of shared memory.
-      // output_addr[0] = lrintf((input[input_idx] - bias) * inverse_scale);
-      output_addr[0] = float_to_hfp8(
-          to_float(input[input_idx]) * scale, ebit, bias, max_pos);
+      output_row[col] = float_to_hfp8(
+          to_float(input[row * ncols + col]) * scale, ebit, bias, max_pos);
     }
   }
 }
 
 template <typename output_t>
 __global__ inline void _FP8rowwise_to_float_cuda_kernel(
-    const std::uint8_t* const __restrict__ input,
-    const int nrows,
-    const int ncols,
-    output_t* const __restrict__ output,
+    at::PackedTensorAccessor64<uint8_t, 1, at::RestrictPtrTraits> input,
+    const int64_t nrows,
+    const int64_t ncols,
+    at::PackedTensorAccessor64<output_t, 1, at::RestrictPtrTraits> output,
     const bool forward) {
-  const int output_columns = ncols - 2 * sizeof(float);
+  // Assert if index is out of bound
+  CUDA_KERNEL_ASSERT(nrows * ncols >= 0);
+
+  const int64_t output_columns = ncols - 2 * sizeof(float);
 
   const int ebit = forward ? 4 : 5;
   const int bias = forward ? 15 : 31;
 
-  int row = (int)blockIdx.y * blockDim.y + threadIdx.y;
-  const int col = (int)blockIdx.x * blockDim.x + threadIdx.x;
-  const int row_incre = blockDim.y * gridDim.y;
+  int64_t row = static_cast<int64_t>(blockIdx.y) * blockDim.y + threadIdx.y;
+  const int64_t col =
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const int64_t row_incre = blockDim.y * gridDim.y;
+
   for (/*row*/; row < nrows; row += row_incre) {
     if (col < output_columns) {
-      const std::uint8_t* input_row = input + row * ncols;
+      const std::uint8_t* input_row = &input[row * ncols];
+      output_t* output_row = &output[row * output_columns];
       const float* input_row_scale_bias =
          reinterpret_cast<const float*>(input_row + output_columns);
-      output_t* output_row = output + row * output_columns;
-
       const float output_ =
           hfp8_to_float(input_row[col], ebit, bias) / input_row_scale_bias[0];
       quantize_float_store(&output_row[col], output_);
@@ -189,16 +193,15 @@ template <typename input_t>
 Tensor _float_to_FP8rowwise_gpu_t(const Tensor& input, const bool forward) {
   TENSOR_ON_CUDA_GPU(input);
   TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
-
   at::cuda::OptionalCUDAGuard device_guard;
   device_guard.set_index(input.get_device());
 
   const auto input_sizes = input.sizes();
   const auto last_dim = input_sizes.size() - 1;
-  const int nrows = c10::size_to_dim_(last_dim, input_sizes);
-  const int ncols = input_sizes[last_dim];
-  const int ncols_aligned = (ncols + 4 - 1) / 4 * 4;
-  const int output_columns = ncols_aligned + 2 * sizeof(float);
+  const int64_t nrows = c10::size_to_dim_(last_dim, input_sizes);
+  const int64_t ncols = input_sizes[last_dim];
+  const int64_t ncols_aligned = (ncols + 4 - 1) / 4 * 4;
+  const int64_t output_columns = ncols_aligned + 2 * sizeof(float);
 
   // Global memory instructions support reading or writing words of size equal
   // to 1, 2, 4, 8, or 16 bytes. Any access (via a variable or a pointer) to
@@ -220,6 +223,9 @@ Tensor _float_to_FP8rowwise_gpu_t(const Tensor& input, const bool forward) {
   const auto num_blocks = cuda_calc_xblock_count(nrows, threads_per_block);
   // think unsigned as we use 0, 255
 
+  const auto input_1D = input.flatten();
+  const auto output_1D = output.flatten();
+
   if (nrows <= 20) {
     FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16(
         input.scalar_type(), "_float_to_FP8rowwise_cuda_kernel", [&] {
@@ -228,10 +234,12 @@ Tensor _float_to_FP8rowwise_gpu_t(const Tensor& input, const bool forward) {
               threads_per_block,
               0,
               at::cuda::getCurrentCUDAStream()>>>(
-              input.data_ptr<scalar_t>(),
+              input_1D
+                  .packed_accessor64<scalar_t, 1, at::RestrictPtrTraits>(),
               nrows,
               ncols,
-              output.data_ptr<std::uint8_t>(),
+              output_1D
+                  .packed_accessor64<uint8_t, 1, at::RestrictPtrTraits>(),
               forward);
           C10_CUDA_KERNEL_LAUNCH_CHECK();
         });
@@ -268,18 +276,22 @@ Tensor _float_to_FP8rowwise_gpu_t(const Tensor& input, const bool forward) {
             dim3(blockDim_x, rows_per_block),
             0,
             at::cuda::getCurrentCUDAStream()>>>(
-            input.data_ptr<scalar_t>(),
+            input_1D.packed_accessor64<
+                scalar_t,
+                1,
+                at::RestrictPtrTraits>(),
             nrows,
             ncols,
-            output.data_ptr<std::uint8_t>(),
-            range_tensor.data_ptr<float>(),
+            output_1D
+                .packed_accessor64<uint8_t, 1, at::RestrictPtrTraits>(),
            forward);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       });
   }
 
   {
-    const int blockDim_x = std::min(ncols, threads_per_block);
+    const int blockDim_x =
+        std::min(ncols, static_cast<int64_t>(threads_per_block));
     dim3 blockDim(blockDim_x, threads_per_block / blockDim_x);
     const auto gridDim_x = cuda_calc_xblock_count(ncols, blockDim.x);
     const auto gridDim_y = cuda_calc_block_count(nrows, blockDim.y);
@@ -289,11 +301,14 @@ Tensor _float_to_FP8rowwise_gpu_t(const Tensor& input, const bool forward) {
         input.scalar_type(), "_compute_FP8_quantize_cuda_kernel", [&] {
           _compute_FP8_quantize_cuda_kernel
              <<<gridDim, blockDim, 0, at::cuda::getCurrentCUDAStream()>>>(
-                  input.data_ptr<scalar_t>(),
-                  range_tensor.data_ptr<float>(),
+                  input_1D.packed_accessor64<
+                      scalar_t,
+                      1,
+                      at::RestrictPtrTraits>(),
                  nrows,
                  ncols,
-                  output.data_ptr<std::uint8_t>(),
+                  output_1D
+                      .packed_accessor64<uint8_t, 1, at::RestrictPtrTraits>(),
                  forward);
           C10_CUDA_KERNEL_LAUNCH_CHECK();
         });
@@ -328,10 +343,10 @@ Tensor _FP8rowwise_to_float_gpu_t(
 
   const auto input_sizes = input.sizes();
   const auto last_dim = input_sizes.size() - 1;
-  const int nrows = c10::size_to_dim_(last_dim, input_sizes);
-  const int ncols = input_sizes[last_dim];
-  const int ncols_aligned = (ncols + 4 - 1) / 4 * 4;
-  const int output_columns = ncols_aligned - 2 * sizeof(float);
+  const int64_t nrows = c10::size_to_dim_(last_dim, input_sizes);
+  const int64_t ncols = input_sizes[last_dim];
+  const int64_t ncols_aligned = (ncols + 4 - 1) / 4 * 4;
+  const int64_t output_columns = ncols_aligned - 2 * sizeof(float);
 
   // Global memory instructions support reading or writing words of size equal
   // to 1, 2, 4, 8, or 16 bytes. Any access (via a variable or a pointer) to
@@ -356,21 +371,26 @@ Tensor _FP8rowwise_to_float_gpu_t(
 
   constexpr int threads_per_block = 256;
-  const int blockDim_x = std::min(threads_per_block, output_columns);
+  const int blockDim_x =
+      std::min(static_cast<int64_t>(threads_per_block), output_columns);
   const dim3 blockDim(blockDim_x, threads_per_block / blockDim_x);
   const auto gridDim_x = cuda_calc_xblock_count(output_columns, blockDim.x);
   const auto gridDim_y = cuda_calc_block_count(nrows, blockDim.y);
   const dim3 gridDim(gridDim_x, gridDim_y);
 
+  const auto input_1D = input.flatten();
+  const auto output_1D = output.flatten();
+
   FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16(
       output.scalar_type(), "FP8rowwise_to_float_cuda_kernel", [&] {
         _FP8rowwise_to_float_cuda_kernel
            <<<gridDim, blockDim, 0, at::cuda::getCurrentCUDAStream()>>>(
-                input.data_ptr<std::uint8_t>(),
+                input_1D.packed_accessor64<uint8_t, 1, at::RestrictPtrTraits>(),
                nrows,
                ncols,
-                output.data_ptr<scalar_t>(),
+                output_1D
+                    .packed_accessor64<scalar_t, 1, at::RestrictPtrTraits>(),
                forward);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       });

From 92388c170c0b9b8318fbecfcad50b75a666b923f Mon Sep 17 00:00:00 2001
From: Qiang Zhang
Date: Sat, 18 Nov 2023 04:59:48 -0800
Subject: [PATCH 28/73] RW Dist change to support uneven sharding (#2142)

Summary:
X-link: https://github.com/pytorch/torchrec/pull/1525

Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2142

As titled

Reviewed By: dstaay-fb

Differential Revision: D51053207

fbshipit-source-id: 8aee7d967ceec9ea0739f5d2c56d6e541a0d3648
---
 fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
index 8291927f45..7acefdf18a 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
+++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
@@ -306,7 +306,8 @@ void _block_bucketize_sparse_features_cpu(
   const index_t* const block_sizes_data = block_sizes.data_ptr<index_t>();
   offset_t* batch_sizes_data = nullptr;
   const auto variable_batch_size = batch_size_per_feature.has_value();
-  const auto variable_bucket_sizes = block_bucketize_pos.has_value();
+  const auto variable_bucket_sizes = block_bucketize_pos.has_value() &&
+      block_bucketize_pos.value().size() != 0;
   using uindex_t = std::make_unsigned_t<index_t>;
   using uoffset_t = std::make_unsigned_t<offset_t>;
   std::vector<index_t> lower_bounds(indices.numel(), 0);

From 4c0fad5aad5e0325a8bd16d6585459fd1fd557e3 Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Sat, 18 Nov 2023 17:21:38 -0800
Subject: [PATCH 29/73] Re-organize ssd_split_embeddings_cache (#2141)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2141

- Migrate ssd_split_embeddings_cache code into its own directory

Reviewed By: sryap

Differential Revision: D51411817

fbshipit-source-id: 90935128331430b82f20aea673243c173ebda6fb
---
 fbgemm_gpu/CMakeLists.txt                     |   7 +-
 .../generate_vbe_metadata.cu                  | 156 +++++++++++
 .../get_infos_metadata.cu                     |  65 +++++
 .../radix_sort_pairs.cu                       |  83 ++++++
 .../split_embeddings_utils.cpp                |   0
 .../transpose_embedding_input.cu}             | 264 +-----------------
 .../ssd_split_embeddings_cache_cuda.cu        |   0
 .../ssd_split_table_batched_embeddings.cpp    |   0
 .../ssd_table_batched_embeddings.h            |   0
 .../test/split_embeddings_utils_test.py       |   5 +-
 10 files changed, 313 insertions(+), 267 deletions(-)
 create mode 100644 fbgemm_gpu/src/split_embeddings_utils/generate_vbe_metadata.cu
 create mode 100644 fbgemm_gpu/src/split_embeddings_utils/get_infos_metadata.cu
 create mode 100644 fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu
 rename fbgemm_gpu/src/{ => split_embeddings_utils}/split_embeddings_utils.cpp (100%)
 rename
fbgemm_gpu/src/{split_embeddings_utils.cu => split_embeddings_utils/transpose_embedding_input.cu} (60%) rename fbgemm_gpu/src/{ => ssd_split_embeddings_cache}/ssd_split_embeddings_cache_cuda.cu (100%) rename fbgemm_gpu/src/{ => ssd_split_embeddings_cache}/ssd_split_table_batched_embeddings.cpp (100%) rename fbgemm_gpu/src/{ => ssd_split_embeddings_cache}/ssd_table_batched_embeddings.h (100%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 0bbd8a40f8..73fa70dd49 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -595,7 +595,7 @@ if(NOT FBGEMM_CPU_ONLY) src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp src/quantize_ops/quantize_ops_gpu.cpp src/sparse_ops/sparse_ops_gpu.cpp - src/split_embeddings_utils.cpp + src/split_embeddings_utils/split_embeddings_utils.cpp src/split_embeddings_cache/split_embeddings_cache_ops.cu src/metric_ops/metric_ops_host.cpp src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp @@ -692,7 +692,10 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/lxu_cache.cu src/split_embeddings_cache/linearize_cache_indices.cu src/split_embeddings_cache/reset_weight_momentum.cu - src/split_embeddings_utils.cu) + src/split_embeddings_utils/generate_vbe_metadata.cu + src/split_embeddings_utils/get_infos_metadata.cu + src/split_embeddings_utils/radix_sort_pairs.cu + src/split_embeddings_utils/transpose_embedding_input.cu) set_source_files_properties(${fbgemm_gpu_sources_static_gpu} PROPERTIES COMPILE_OPTIONS diff --git a/fbgemm_gpu/src/split_embeddings_utils/generate_vbe_metadata.cu b/fbgemm_gpu/src/split_embeddings_utils/generate_vbe_metadata.cu new file mode 100644 index 0000000000..fe65a8d95e --- /dev/null +++ b/fbgemm_gpu/src/split_embeddings_utils/generate_vbe_metadata.cu @@ -0,0 +1,156 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh" +#include "fbgemm_gpu/ops_utils.h" +#include "fbgemm_gpu/sparse_ops_utils.h" +#include "fbgemm_gpu/split_embeddings_utils.cuh" + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +__global__ +__launch_bounds__(kMaxThreads) void generate_vbe_metadata_foreach_sample_kernel( + at::PackedTensorAccessor32 + row_output_offsets, + at::PackedTensorAccessor32 b_t_map, + const at::PackedTensorAccessor32 + B_offsets, + const at::PackedTensorAccessor32 + B_offsets_rank_per_feature, + const at::PackedTensorAccessor32 + output_offsets_feature_rank, + const at::PackedTensorAccessor32 + D_offsets, + const int32_t D, + const bool nobag, + FixedDivisor fd_max_B, + FixedDivisor fd_max_B_T, + const int32_t info_B_num_bits) { + const auto r_b_t = blockIdx.x * blockDim.x + threadIdx.x; + const auto T = B_offsets.size(0) - 1; // Num tables + const auto R = B_offsets_rank_per_feature.size(1) - 1; // Num ranks + + int32_t b_t; + int32_t r; // Rank ID + int32_t t; // Table ID + int32_t b; // Relative sample ID in the rank-table matrix + + fd_max_B_T.DivMod(r_b_t, &r, &b_t); + if (r >= R) { + return; + } + + fd_max_B.DivMod(b_t, &t, &b); + if (t >= T) { + return; + } + + const auto B_start_r_t = B_offsets_rank_per_feature[t][r]; + const auto B_r_t = B_offsets_rank_per_feature[t][r + 1] - B_start_r_t; + if (b >= B_r_t) { + return; + } + + const auto B_start_t = B_offsets[t]; + // Update b_t + b_t = B_start_t + B_start_r_t + b; + const auto D_ = nobag ? 
D : D_offsets[t + 1] - D_offsets[t]; + row_output_offsets[b_t] = output_offsets_feature_rank[r * T + t] + b * D_; + + // Relative sample ID in the table + const auto b_ = B_start_r_t + b; + // b_t is always positive. + *reinterpret_cast(&b_t_map[b_t]) = + (reinterpret_cast(&t)[0] << info_B_num_bits) | + reinterpret_cast(&b_)[0]; +} + +/// Generate VBE metadata namely output_offsets and b_t_map +/// +/// row_output_offsets A 1D tensor that contains the output offset of each b +/// (sample) and t (feature/table) pair. The output +/// serializes O_r_t where O_r_t is the local output of rank +/// r and feature/table t (t is the fastest moving index). +/// b_t_map A 1D tensor that contains the b and t information of the +/// linearized b and t (b is the fastest moving index). +/// +/// @param B_offsets Batch size offsets for all features. +/// @param B_offsets_rank_per_feature Batch size offsets for all ranks (GPUs) +/// for each feature. +/// @param output_offsets_feature_rank Output offsets for all features and ranks +/// and features. +/// @param D_offsets Embedding dimension offsets. Required if +/// nobag is false. +/// @param D The embedding dimension. Required if +/// nobag is true. +/// @param nobag A boolean to indicate if TBE is pooled +/// (false) or sequence (true). +/// @param info_B_num_bits The number of bits used to encode a +/// sample ID. (Used for populating b_t_map). +/// @param total_B The total number of samples (i.e., the +/// total number of b and t pairs). +DLL_PUBLIC std::tuple +generate_vbe_metadata( + const Tensor& B_offsets, + const Tensor& B_offsets_rank_per_feature, + const Tensor& output_offsets_feature_rank, + const Tensor& D_offsets, + const int64_t D, + const bool nobag, + const int64_t max_B_feature_rank, + const int64_t info_B_num_bits, + const int64_t total_B) { + TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( + B_offsets, B_offsets_rank_per_feature, output_offsets_feature_rank); + + TENSOR_NDIM_EQUALS(B_offsets, 1); + TENSOR_NDIM_EQUALS(B_offsets_rank_per_feature, 2); + TENSOR_NDIM_EQUALS(output_offsets_feature_rank, 1); + + const int32_t T = B_offsets.numel() - 1; + if (!nobag) { + TENSOR_ON_CUDA_GPU(D_offsets); + TENSORS_ON_SAME_DEVICE(B_offsets, D_offsets); + TORCH_CHECK(D_offsets.numel() == T + 1) + } + + const auto num_ranks = B_offsets_rank_per_feature.size(1) - 1; + TORCH_CHECK(B_offsets_rank_per_feature.size(0) == T); + TORCH_CHECK(output_offsets_feature_rank.numel() == num_ranks * T + 1); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(B_offsets.get_device()); + + Tensor row_output_offsets = + at::empty({total_B}, output_offsets_feature_rank.options()); + Tensor b_t_map = at::empty({total_B}, B_offsets.options()); + + // Over allocate total number of threads to avoid using binary search + generate_vbe_metadata_foreach_sample_kernel<<< + div_round_up(max_B_feature_rank * T * num_ranks, kMaxThreads), + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + row_output_offsets.packed_accessor32(), + b_t_map.packed_accessor32(), + B_offsets.packed_accessor32(), + B_offsets_rank_per_feature + .packed_accessor32(), + output_offsets_feature_rank + .packed_accessor32(), + D_offsets.packed_accessor32(), + D, + nobag, + FixedDivisor(max_B_feature_rank), + FixedDivisor(max_B_feature_rank * T), + info_B_num_bits); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return {row_output_offsets, b_t_map}; +} diff --git a/fbgemm_gpu/src/split_embeddings_utils/get_infos_metadata.cu b/fbgemm_gpu/src/split_embeddings_utils/get_infos_metadata.cu new file mode 
100644
index 0000000000..e8fdcc1411
--- /dev/null
+++ b/fbgemm_gpu/src/split_embeddings_utils/get_infos_metadata.cu
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
+#include "fbgemm_gpu/ops_utils.h"
+#include "fbgemm_gpu/split_embeddings_utils.cuh"
+
+using Tensor = at::Tensor;
+using namespace fbgemm_gpu;
+
+DLL_PUBLIC std::tuple<int32_t, uint32_t> adjust_info_B_num_bits(
+    int32_t B,
+    int32_t T) {
+  int32_t info_B_num_bits = DEFAULT_INFO_B_NUM_BITS;
+  uint32_t info_B_mask = DEFAULT_INFO_B_MASK;
+  uint32_t max_T = MAX_T;
+  uint32_t max_B = MAX_B;
+  bool invalid_T = T > max_T;
+  bool invalid_B = B > max_B;
+
+  TORCH_CHECK(
+      !(invalid_T && invalid_B),
+      "Not enough infos bits to accommodate T and B. Default num bits = ",
+      DEFAULT_INFO_NUM_BITS);
+
+  if (invalid_T) {
+    // Reduce info_B_num_bits
+    while (invalid_T && !invalid_B && info_B_num_bits > 0) {
+      info_B_num_bits--;
+      max_T = ((max_T + 1) << 1) - 1;
+      max_B = ((max_B + 1) >> 1) - 1;
+      invalid_T = T > max_T;
+      invalid_B = B > max_B;
+    }
+  } else if (invalid_B) {
+    // Increase info_B_num_bits
+    while (!invalid_T && invalid_B && info_B_num_bits < DEFAULT_INFO_NUM_BITS) {
+      info_B_num_bits++;
+      max_T = ((max_T + 1) >> 1) - 1;
+      max_B = ((max_B + 1) << 1) - 1;
+      invalid_T = T > max_T;
+      invalid_B = B > max_B;
+    }
+  }
+
+  TORCH_CHECK(
+      !invalid_T && !invalid_B,
+      "Not enough infos bits to accommodate T and B. Default num bits = ",
+      DEFAULT_INFO_NUM_BITS);
+
+  // Recompute info_B_mask using new info_B_num_bits
+  info_B_mask = (1u << info_B_num_bits) - 1;
+
+  return {info_B_num_bits, info_B_mask};
+}
+
+std::tuple<int64_t, int64_t> DLL_PUBLIC
+get_infos_metadata(Tensor unused, int64_t B, int64_t T) {
+  return adjust_info_B_num_bits(B, T);
+}
diff --git a/fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu b/fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu
new file mode 100644
index 0000000000..121f66625e
--- /dev/null
+++ b/fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#include "fbgemm_gpu/split_embeddings_utils.cuh" + +#include +#include +#include "fbgemm_gpu/embedding_backward_template_helpers.cuh" +#include "fbgemm_gpu/ops_utils.h" + +// clang-format off +#include "fbgemm_gpu/cub_namespace_prefix.cuh" +#include +#include +#include +#include "fbgemm_gpu/cub_namespace_postfix.cuh" +// clang-format on + +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +#if defined(CUDA_VERSION) && CUDA_VERSION >= 12000 +#define DEF_RADIX_SORT_PAIRS_FN(KeyT, ValueT) \ + DLL_PUBLIC cudaError_t radix_sort_pairs( \ + void* d_temp_storage, \ + size_t& temp_storage_bytes, \ + const KeyT* d_keys_in, \ + KeyT* d_keys_out, \ + const ValueT* d_values_in, \ + ValueT* d_values_out, \ + const int num_items, \ + const int begin_bit, \ + const int end_bit, \ + cudaStream_t stream) { \ + return FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( \ + d_temp_storage, \ + temp_storage_bytes, \ + d_keys_in, \ + d_keys_out, \ + d_values_in, \ + d_values_out, \ + num_items, \ + begin_bit, \ + end_bit, \ + stream); \ + } +#else +#define DEF_RADIX_SORT_PAIRS_FN(KeyT, ValueT) \ + DLL_PUBLIC cudaError_t radix_sort_pairs( \ + void* d_temp_storage, \ + size_t& temp_storage_bytes, \ + const KeyT* d_keys_in, \ + KeyT* d_keys_out, \ + const ValueT* d_values_in, \ + ValueT* d_values_out, \ + const int num_items, \ + const int begin_bit, \ + const int end_bit, \ + cudaStream_t stream) { \ + return FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( \ + d_temp_storage, \ + temp_storage_bytes, \ + d_keys_in, \ + d_keys_out, \ + d_values_in, \ + d_values_out, \ + num_items, \ + begin_bit, \ + end_bit, \ + stream, \ + false); \ + } +#endif + +DEF_RADIX_SORT_PAIRS_FN(int64_t, float); +DEF_RADIX_SORT_PAIRS_FN(int64_t, double); +DEF_RADIX_SORT_PAIRS_FN(int64_t, int64_t); +DEF_RADIX_SORT_PAIRS_FN(int64_t, int32_t); diff --git a/fbgemm_gpu/src/split_embeddings_utils.cpp b/fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils.cpp similarity index 100% rename from fbgemm_gpu/src/split_embeddings_utils.cpp rename to fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils.cpp diff --git a/fbgemm_gpu/src/split_embeddings_utils.cu b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu similarity index 60% rename from fbgemm_gpu/src/split_embeddings_utils.cu rename to fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu index dd5c0ec70a..d1eb5e00a1 100644 --- a/fbgemm_gpu/src/split_embeddings_utils.cu +++ b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu @@ -6,12 +6,9 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include "fbgemm_gpu/split_embeddings_utils.cuh" - -#include -#include #include "fbgemm_gpu/embedding_backward_template_helpers.cuh" #include "fbgemm_gpu/ops_utils.h" +#include "fbgemm_gpu/split_embeddings_utils.cuh" // clang-format off #include "fbgemm_gpu/cub_namespace_prefix.cuh" @@ -21,9 +18,8 @@ #include "fbgemm_gpu/cub_namespace_postfix.cuh" // clang-format on -#ifdef USE_ROCM -#include -#endif +using Tensor = at::Tensor; +using namespace fbgemm_gpu; inline at::Tensor asynchronous_complete_cumsum(at::Tensor t_in) { at::cuda::OptionalCUDAGuard device_guard; @@ -62,10 +58,6 @@ inline at::Tensor asynchronous_complete_cumsum(at::Tensor t_in) { return t_out; } -using Tensor = at::Tensor; - -using namespace fbgemm_gpu; - template __global__ __launch_bounds__(kMaxThreads) void linearize_index_kernel( const at::PackedTensorAccessor32 @@ -394,253 +386,3 @@ transpose_embedding_input( sorted_linear_indices_num_runs, sorted_linear_indices_cumulative_run_lengths}; } - -std::tuple -get_infos_metadata(Tensor unused, int64_t B, int64_t T) { - return adjust_info_B_num_bits(B, T); -} - -DLL_PUBLIC std::tuple adjust_info_B_num_bits( - int32_t B, - int32_t T) { - int32_t info_B_num_bits = DEFAULT_INFO_B_NUM_BITS; - uint32_t info_B_mask = DEFAULT_INFO_B_MASK; - uint32_t max_T = MAX_T; - uint32_t max_B = MAX_B; - bool invalid_T = T > max_T; - bool invalid_B = B > max_B; - - TORCH_CHECK( - !(invalid_T && invalid_B), - "Not enough infos bits to accommodate T and B. Default num bits = ", - DEFAULT_INFO_NUM_BITS); - - if (invalid_T) { - // Reduce info_B_num_bits - while (invalid_T && !invalid_B && info_B_num_bits > 0) { - info_B_num_bits--; - max_T = ((max_T + 1) << 1) - 1; - max_B = ((max_B + 1) >> 1) - 1; - invalid_T = T > max_T; - invalid_B = B > max_B; - } - } else if (invalid_B) { - // Increase info_B_num_bits - while (!invalid_T && invalid_B && info_B_num_bits < DEFAULT_INFO_NUM_BITS) { - info_B_num_bits++; - max_T = ((max_T + 1) >> 1) - 1; - max_B = ((max_B + 1) << 1) - 1; - invalid_T = T > max_T; - invalid_B = B > max_B; - } - } - - TORCH_CHECK( - !invalid_T && !invalid_B, - "Not enough infos bits to accommodate T and B. 
Default num bits = ", - DEFAULT_INFO_NUM_BITS); - - // Recompute info_B_mask using new info_B_num_bits - info_B_mask = (1u << info_B_num_bits) - 1; - - return {info_B_num_bits, info_B_mask}; -} - -#if defined(CUDA_VERSION) && CUDA_VERSION >= 12000 -#define DEF_RADIX_SORT_PAIRS_FN(KeyT, ValueT) \ - DLL_PUBLIC cudaError_t radix_sort_pairs( \ - void* d_temp_storage, \ - size_t& temp_storage_bytes, \ - const KeyT* d_keys_in, \ - KeyT* d_keys_out, \ - const ValueT* d_values_in, \ - ValueT* d_values_out, \ - const int num_items, \ - const int begin_bit, \ - const int end_bit, \ - cudaStream_t stream) { \ - return FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( \ - d_temp_storage, \ - temp_storage_bytes, \ - d_keys_in, \ - d_keys_out, \ - d_values_in, \ - d_values_out, \ - num_items, \ - begin_bit, \ - end_bit, \ - stream); \ - } -#else -#define DEF_RADIX_SORT_PAIRS_FN(KeyT, ValueT) \ - DLL_PUBLIC cudaError_t radix_sort_pairs( \ - void* d_temp_storage, \ - size_t& temp_storage_bytes, \ - const KeyT* d_keys_in, \ - KeyT* d_keys_out, \ - const ValueT* d_values_in, \ - ValueT* d_values_out, \ - const int num_items, \ - const int begin_bit, \ - const int end_bit, \ - cudaStream_t stream) { \ - return FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( \ - d_temp_storage, \ - temp_storage_bytes, \ - d_keys_in, \ - d_keys_out, \ - d_values_in, \ - d_values_out, \ - num_items, \ - begin_bit, \ - end_bit, \ - stream, \ - false); \ - } -#endif - -DEF_RADIX_SORT_PAIRS_FN(int64_t, float); -DEF_RADIX_SORT_PAIRS_FN(int64_t, double); -DEF_RADIX_SORT_PAIRS_FN(int64_t, int64_t); -DEF_RADIX_SORT_PAIRS_FN(int64_t, int32_t); - -__global__ -__launch_bounds__(kMaxThreads) void generate_vbe_metadata_foreach_sample_kernel( - at::PackedTensorAccessor32 - row_output_offsets, - at::PackedTensorAccessor32 b_t_map, - const at::PackedTensorAccessor32 - B_offsets, - const at::PackedTensorAccessor32 - B_offsets_rank_per_feature, - const at::PackedTensorAccessor32 - output_offsets_feature_rank, - const at::PackedTensorAccessor32 - D_offsets, - const int32_t D, - const bool nobag, - FixedDivisor fd_max_B, - FixedDivisor fd_max_B_T, - const int32_t info_B_num_bits) { - const auto r_b_t = blockIdx.x * blockDim.x + threadIdx.x; - const auto T = B_offsets.size(0) - 1; // Num tables - const auto R = B_offsets_rank_per_feature.size(1) - 1; // Num ranks - - int32_t b_t; - int32_t r; // Rank ID - int32_t t; // Table ID - int32_t b; // Relative sample ID in the rank-table matrix - - fd_max_B_T.DivMod(r_b_t, &r, &b_t); - if (r >= R) { - return; - } - - fd_max_B.DivMod(b_t, &t, &b); - if (t >= T) { - return; - } - - const auto B_start_r_t = B_offsets_rank_per_feature[t][r]; - const auto B_r_t = B_offsets_rank_per_feature[t][r + 1] - B_start_r_t; - if (b >= B_r_t) { - return; - } - - const auto B_start_t = B_offsets[t]; - // Update b_t - b_t = B_start_t + B_start_r_t + b; - const auto D_ = nobag ? D : D_offsets[t + 1] - D_offsets[t]; - row_output_offsets[b_t] = output_offsets_feature_rank[r * T + t] + b * D_; - - // Relative sample ID in the table - const auto b_ = B_start_r_t + b; - // b_t is always positive. - *reinterpret_cast(&b_t_map[b_t]) = - (reinterpret_cast(&t)[0] << info_B_num_bits) | - reinterpret_cast(&b_)[0]; -} - -/// Generate VBE metadata namely output_offsets and b_t_map -/// -/// row_output_offsets A 1D tensor that contains the output offset of each b -/// (sample) and t (feature/table) pair. 
The output -/// serializes O_r_t where O_r_t is the local output of rank -/// r and feature/table t (t is the fastest moving index). -/// b_t_map A 1D tensor that contains the b and t information of the -/// linearized b and t (b is the fastest moving index). -/// -/// @param B_offsets Batch size offsets for all features. -/// @param B_offsets_rank_per_feature Batch size offsets for all ranks (GPUs) -/// for each feature. -/// @param output_offsets_feature_rank Output offsets for all features and ranks -/// and features. -/// @param D_offsets Embedding dimension offsets. Required if -/// nobag is false. -/// @param D The embedding dimension. Required if -/// nobag is true. -/// @param nobag A boolean to indicate if TBE is pooled -/// (false) or sequence (true). -/// @param info_B_num_bits The number of bits used to encode a -/// sample ID. (Used for populating b_t_map). -/// @param total_B The total number of samples (i.e., the -/// total number of b and t pairs). -DLL_PUBLIC std::tuple -generate_vbe_metadata( - const Tensor& B_offsets, - const Tensor& B_offsets_rank_per_feature, - const Tensor& output_offsets_feature_rank, - const Tensor& D_offsets, - const int64_t D, - const bool nobag, - const int64_t max_B_feature_rank, - const int64_t info_B_num_bits, - const int64_t total_B) { - TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( - B_offsets, B_offsets_rank_per_feature, output_offsets_feature_rank); - - TENSOR_NDIM_EQUALS(B_offsets, 1); - TENSOR_NDIM_EQUALS(B_offsets_rank_per_feature, 2); - TENSOR_NDIM_EQUALS(output_offsets_feature_rank, 1); - - const int32_t T = B_offsets.numel() - 1; - if (!nobag) { - TENSOR_ON_CUDA_GPU(D_offsets); - TENSORS_ON_SAME_DEVICE(B_offsets, D_offsets); - TORCH_CHECK(D_offsets.numel() == T + 1) - } - - const auto num_ranks = B_offsets_rank_per_feature.size(1) - 1; - TORCH_CHECK(B_offsets_rank_per_feature.size(0) == T); - TORCH_CHECK(output_offsets_feature_rank.numel() == num_ranks * T + 1); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(B_offsets.get_device()); - - Tensor row_output_offsets = - at::empty({total_B}, output_offsets_feature_rank.options()); - Tensor b_t_map = at::empty({total_B}, B_offsets.options()); - - // Over allocate total number of threads to avoid using binary search - generate_vbe_metadata_foreach_sample_kernel<<< - div_round_up(max_B_feature_rank * T * num_ranks, kMaxThreads), - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - row_output_offsets.packed_accessor32(), - b_t_map.packed_accessor32(), - B_offsets.packed_accessor32(), - B_offsets_rank_per_feature - .packed_accessor32(), - output_offsets_feature_rank - .packed_accessor32(), - D_offsets.packed_accessor32(), - D, - nobag, - FixedDivisor(max_B_feature_rank), - FixedDivisor(max_B_feature_rank * T), - info_B_num_bits); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return {row_output_offsets, b_t_map}; -} diff --git a/fbgemm_gpu/src/ssd_split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_embeddings_cache_cuda.cu similarity index 100% rename from fbgemm_gpu/src/ssd_split_embeddings_cache_cuda.cu rename to fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_embeddings_cache_cuda.cu diff --git a/fbgemm_gpu/src/ssd_split_table_batched_embeddings.cpp b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp similarity index 100% rename from fbgemm_gpu/src/ssd_split_table_batched_embeddings.cpp rename to fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp diff --git 
a/fbgemm_gpu/src/ssd_table_batched_embeddings.h b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h similarity index 100% rename from fbgemm_gpu/src/ssd_table_batched_embeddings.h rename to fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h diff --git a/fbgemm_gpu/test/split_embeddings_utils_test.py b/fbgemm_gpu/test/split_embeddings_utils_test.py index 19bb14fff5..bbf0960e5d 100644 --- a/fbgemm_gpu/test/split_embeddings_utils_test.py +++ b/fbgemm_gpu/test/split_embeddings_utils_test.py @@ -13,15 +13,12 @@ import hypothesis.strategies as st import torch - +from fbgemm_gpu import sparse_ops # noqa: F401 from hypothesis import given, HealthCheck, settings try: - # pyre-ignore[21] - from fbgemm_gpu import open_source # noqa: F401 from test_utils import gpu_unavailable # pyre-ignore[21] except Exception: - torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") from fbgemm_gpu.test.test_utils import gpu_unavailable From f65d7e22ba9466096346f183947cf65cbe50754c Mon Sep 17 00:00:00 2001 From: "Zhijing Li (Accelerator Enablement)" Date: Mon, 20 Nov 2023 10:29:56 -0800 Subject: [PATCH 30/73] Benchmark block_bucketize_sparse_features uneven sharding (#2140) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2140 Benchmark block_bucketize_sparse_features Reviewed By: jiayisuse Differential Revision: D51288847 fbshipit-source-id: dbb8dc705f32bc90fbdb316bdba8923c89d4f606 --- fbgemm_gpu/bench/bench_utils.py | 6 +-- fbgemm_gpu/bench/sparse_ops_benchmark.py | 64 ++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/fbgemm_gpu/bench/bench_utils.py b/fbgemm_gpu/bench/bench_utils.py index 02ffda22f6..be5b06287d 100644 --- a/fbgemm_gpu/bench/bench_utils.py +++ b/fbgemm_gpu/bench/bench_utils.py @@ -41,13 +41,13 @@ def benchmark_torch_function( # noqa: C901 copy_f_for_multi_thread_test: bool = False, ) -> Tuple[float, torch.Tensor]: logging.info(f"Start to benchmark {name}...") - if device != "" and device != "cuda": + if device != "cpu" and device != "" and device != "cuda": torch.cuda.set_device(device) for _ in range(num_warmups): output = f(*args) assert num_threads > 0 - if torch.cuda.is_available() and (num_threads == 1): + if device != "cpu" and torch.cuda.is_available() and (num_threads == 1): cache = torch.empty( int(flush_gpu_cache_size_mb * 1024 * 1024 // 4), dtype=torch.float, @@ -69,7 +69,7 @@ def benchmark_torch_function( # noqa: C901 [s.elapsed_time(e) for s, e in zip(start_event, end_event)] ) elapsed_time = torch.mean(times).item() * 1.0e-3 - elif torch.cuda.is_available() and (num_threads > 1): + elif device != "cpu" and torch.cuda.is_available() and (num_threads > 1): cache = torch.empty( int(flush_gpu_cache_size_mb * 1024 * 1024 // 4), dtype=torch.float, diff --git a/fbgemm_gpu/bench/sparse_ops_benchmark.py b/fbgemm_gpu/bench/sparse_ops_benchmark.py index bf0ad96c1e..b06ace2f74 100644 --- a/fbgemm_gpu/bench/sparse_ops_benchmark.py +++ b/fbgemm_gpu/bench/sparse_ops_benchmark.py @@ -7,6 +7,7 @@ import contextlib import functools import logging +import math import random from typing import List @@ -868,5 +869,68 @@ def ben(fn, name, ad_indices, ad_lengths, batch_offsets, num_ads_in_batch): ben(pass_4, "pass_4", ad_indices, ad_lengths, batch_offsets, num_ads_in_batch) +@cli.command() +@click.option("--row-size", default=2560000) +@click.option("--batch-size", default=4096) +@click.option("--bucket-num", default=16) +@click.option("--input-precision", type=str, default="long") +def 
block_bucketize_sparse_features_bench( + row_size: int, batch_size: int, bucket_num: int, input_precision: str +) -> None: + + dtype = torch.int + if input_precision == "int": + dtype = torch.int + elif input_precision == "long": + dtype = torch.long + else: + raise RuntimeError(f"Does not support data type {input_precision}") + + indices = torch.randint(0, row_size, (batch_size,), dtype=dtype) + weights = torch.randint(0, row_size, (batch_size,), dtype=torch.float) + total = 0 + lengths = [] + for _ in range(batch_size): + length = random.randint(0, 10) + lengths.append(min(length, batch_size - total)) + total += length + if total > batch_size: + break + lengths = torch.tensor(lengths, dtype=dtype) + bucket_size = math.ceil(row_size / bucket_num) + block_sizes = torch.tensor([bucket_size] * lengths.numel(), dtype=dtype) + + bucket_pos = [j * bucket_size for j in range(bucket_num + 1)] + block_bucketize_pos = [torch.tensor(bucket_pos)] * lengths.numel() + test_param = {"uneven": block_bucketize_pos, "even": None} + for name, is_block_bucketize_pos in test_param.items(): + time, output = benchmark_torch_function( + torch.ops.fbgemm.block_bucketize_sparse_features, + ( + lengths, + indices, + False, + True, + block_sizes, + bucket_num, + weights, + None, + -1, # unused + is_block_bucketize_pos, + ), + iters=100, + device="cpu", + ) + + num_bytes = 0 + for tensor in [lengths, indices, weights, *block_bucketize_pos, *output]: + if isinstance(tensor, torch.Tensor): + num_bytes += (tensor.numel()) * tensor.element_size() + + logging.info( + f"{name}_block_bucketize_sparse_features forward: {dtype}, {num_bytes} bytes read/write, {time * 1e3} ms, {num_bytes / time / 1e9} GB/s" + ) + + if __name__ == "__main__": cli() From 54340d4531b2ffa9b1c210ed704adc042506397e Mon Sep 17 00:00:00 2001 From: Joshua Deng Date: Mon, 20 Nov 2023 14:57:15 -0800 Subject: [PATCH 31/73] Add variable batch per feature support to EBC (tw/cw) (#1986) Summary: X-link: https://github.com/pytorch/torchrec/pull/1358 Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1986 To enable provide stride_per_key_per_rank and inverse_indices to the KJT Reindexing is done internally with EBC and with user provided inverse indices Reviewed By: bigning Differential Revision: D48805440 fbshipit-source-id: e490baa41b9c91b02e6c246940d4a3a2c4551d65 --- .../permute_pooled_embedding_modules.py | 34 ++++++------------- .../test/permute_pooled_embedding_test.py | 2 +- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py b/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py index 4ce1cce9ad..cb66088b9f 100644 --- a/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py +++ b/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py @@ -10,7 +10,6 @@ from typing import List, Optional import torch -from torch import nn try: # pyre-ignore[21] @@ -24,48 +23,37 @@ ) -class PermutePooledEmbeddings(nn.Module): +class PermutePooledEmbeddings: def __init__( self, embs_dims: List[int], permute: List[int], device: Optional[torch.device] = None, ) -> None: - super(PermutePooledEmbeddings, self).__init__() logging.info("Using Permute Pooled Embeddings") - - self.register_buffer( - "_offset_dim_list", - torch.tensor( - [0] + list(accumulate(embs_dims)), device=device, dtype=torch.int64 - ), + self._offset_dim_list: torch.Tensor = torch.tensor( + [0] + list(accumulate(embs_dims)), device=device, dtype=torch.int64 ) - self.register_buffer( - "_permute", torch.tensor(permute, 
device=device, dtype=torch.int64) + + self._permute: torch.Tensor = torch.tensor( + permute, device=device, dtype=torch.int64 ) inv_permute: List[int] = [0] * len(permute) for i, p in enumerate(permute): inv_permute[p] = i - self.register_buffer( - "_inv_permute", torch.tensor(inv_permute, device=device, dtype=torch.int64) + self._inv_permute: torch.Tensor = torch.tensor( + inv_permute, device=device, dtype=torch.int64 ) - # `Union[BoundMethod[typing.Callable(torch.Tensor.tolist)[[Named(self, - # torch.Tensor)], List[typing.Any]], torch.Tensor], nn.Module, torch.Tensor]` - # is not a function. - inv_embs_dims = [embs_dims[i] for i in permute] - self.register_buffer( - "_inv_offset_dim_list", - torch.tensor( - [0] + list(accumulate(inv_embs_dims)), device=device, dtype=torch.int64 - ), + self._inv_offset_dim_list: torch.Tensor = torch.tensor( + [0] + list(accumulate(inv_embs_dims)), device=device, dtype=torch.int64 ) - def forward(self, pooled_embs: torch.Tensor) -> torch.Tensor: + def __call__(self, pooled_embs: torch.Tensor) -> torch.Tensor: result = torch.ops.fbgemm.permute_pooled_embs_auto_grad( pooled_embs, self._offset_dim_list.to(device=pooled_embs.device), diff --git a/fbgemm_gpu/test/permute_pooled_embedding_test.py b/fbgemm_gpu/test/permute_pooled_embedding_test.py index 3365457c30..3723ee76c4 100644 --- a/fbgemm_gpu/test/permute_pooled_embedding_test.py +++ b/fbgemm_gpu/test/permute_pooled_embedding_test.py @@ -46,7 +46,7 @@ FIXED_EXTERN_API = { "PermutePooledEmbeddings": { "__init__": ["self", "embs_dims", "permute", "device"], - "forward": ["self", "pooled_embs"], + "__call__": ["self", "pooled_embs"], }, } From a5edc61dd6eb55812af0fb7aed304665d48f81e1 Mon Sep 17 00:00:00 2001 From: Carlos Torres Date: Mon, 20 Nov 2023 16:05:59 -0800 Subject: [PATCH 32/73] Back out "Add an auto-vectorization implementation for int4 CPU TBE kernel" Summary: Original commit changeset: 11320ef18883 Original Phabricator Diff: D50289383 Reviewed By: bangshengtang Differential Revision: D51472051 fbshipit-source-id: 45ed7a737ed77e5e53fe1e10a2d4292e66dbd43e --- CMakeLists.txt | 23 +-- bench/EmbeddingSpMDMNBitBenchmark.cc | 179 +++++++---------------- defs.bzl | 5 - fbgemm_gpu/CMakeLists.txt | 1 - include/fbgemm/Types.h | 41 +----- include/fbgemm/Utils.h | 5 - src/EmbeddingSpMDMAutovec.cc | 204 --------------------------- src/EmbeddingSpMDMAutovec.h | 49 ------- src/EmbeddingSpMDMNBit.cc | 29 ---- src/Utils.cc | 4 - third_party/cpuinfo | 2 +- 11 files changed, 53 insertions(+), 489 deletions(-) delete mode 100644 src/EmbeddingSpMDMAutovec.cc delete mode 100644 src/EmbeddingSpMDMAutovec.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b510717a0..134523e7d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,21 +152,18 @@ get_filelist("get_fbgemm_inline_avx2_srcs(msvc=${MSVC_BOOL})" get_filelist("get_fbgemm_avx512_srcs(msvc=${MSVC_BOOL})" FBGEMM_AVX512_SRCS) get_filelist("get_fbgemm_inline_avx512_srcs(msvc=${MSVC_BOOL})" FBGEMM_AVX512_INLINE_SRCS) -get_filelist("get_fbgemm_autovec_srcs()" FBGEMM_AUTOVEC_SRCS) get_filelist("get_fbgemm_public_headers()" FBGEMM_PUBLIC_HEADERS) add_library(fbgemm_generic OBJECT ${FBGEMM_GENERIC_SRCS}) add_library(fbgemm_avx2 OBJECT ${FBGEMM_AVX2_SRCS} ${FBGEMM_AVX2_INLINE_SRCS}) add_library(fbgemm_avx512 OBJECT ${FBGEMM_AVX512_SRCS} ${FBGEMM_AVX512_INLINE_SRCS}) -add_library(fbgemm_autovec OBJECT ${FBGEMM_AUTOVEC_SRCS}) # Make libraries depend on defs.bzl add_custom_target(defs.bzl DEPENDS defs.bzl) add_dependencies(fbgemm_generic defs.bzl) 
 add_dependencies(fbgemm_avx2 defs.bzl)
 add_dependencies(fbgemm_avx512 defs.bzl)
-add_dependencies(fbgemm_autovec defs.bzl)
 
 # On Windows:
 #  1) Adding definition of ASMJIT_STATIC to avoid generating asmjit function
@@ -189,9 +186,6 @@ if(MSVC)
   endif()
   target_compile_options(fbgemm_avx2 PRIVATE "/arch:AVX2")
   target_compile_options(fbgemm_avx512 PRIVATE "/arch:AVX512")
-  if(OpenMP_FOUND)
-    target_compile_options(fbgemm_autovec PRIVATE "/openmp:experimental")
-  endif()
 else(MSVC)
   string(APPEND CMAKE_CXX_FLAGS " -Wall")
   string(APPEND CMAKE_CXX_FLAGS " -Wextra")
@@ -306,40 +300,29 @@ target_include_directories(fbgemm_avx512 BEFORE
   PRIVATE "${ASMJIT_SRC_DIR}/src"
   PRIVATE "${CPUINFO_SOURCE_DIR}/include")
 
-target_include_directories(fbgemm_autovec BEFORE
-  PUBLIC $<BUILD_INTERFACE:${FBGEMM_SOURCE_DIR}>
-  PUBLIC $<BUILD_INTERFACE:${FBGEMM_SOURCE_DIR}/include>
-  PRIVATE "${ASMJIT_SRC_DIR}/src"
-  PRIVATE "${CPUINFO_SOURCE_DIR}/include")
-
 if(FBGEMM_LIBRARY_TYPE STREQUAL "default")
   add_library(fbgemm
     $<TARGET_OBJECTS:fbgemm_generic>
     $<TARGET_OBJECTS:fbgemm_avx2>
-    $<TARGET_OBJECTS:fbgemm_avx512>
-    $<TARGET_OBJECTS:fbgemm_autovec>)
+    $<TARGET_OBJECTS:fbgemm_avx512>)
 elseif(FBGEMM_LIBRARY_TYPE STREQUAL "shared")
   add_library(fbgemm SHARED
     $<TARGET_OBJECTS:fbgemm_generic>
    $<TARGET_OBJECTS:fbgemm_avx2>
-    $<TARGET_OBJECTS:fbgemm_avx512>
-    $<TARGET_OBJECTS:fbgemm_autovec>)
+    $<TARGET_OBJECTS:fbgemm_avx512>)
   set_property(TARGET fbgemm_generic PROPERTY POSITION_INDEPENDENT_CODE ON)
   set_property(TARGET fbgemm_avx2 PROPERTY POSITION_INDEPENDENT_CODE ON)
   set_property(TARGET fbgemm_avx512 PROPERTY POSITION_INDEPENDENT_CODE ON)
-  set_property(TARGET fbgemm_autovec PROPERTY POSITION_INDEPENDENT_CODE ON)
 elseif(FBGEMM_LIBRARY_TYPE STREQUAL "static")
   add_library(fbgemm STATIC
     $<TARGET_OBJECTS:fbgemm_generic>
    $<TARGET_OBJECTS:fbgemm_avx2>
-    $<TARGET_OBJECTS:fbgemm_avx512>
-    $<TARGET_OBJECTS:fbgemm_autovec>)
+    $<TARGET_OBJECTS:fbgemm_avx512>)
   #MSVC need to define FBGEMM_STATIC for fbgemm_generic also to
   #avoid generating _dllimport functions.
   target_compile_definitions(fbgemm_generic PRIVATE FBGEMM_STATIC)
   target_compile_definitions(fbgemm_avx2 PRIVATE FBGEMM_STATIC)
   target_compile_definitions(fbgemm_avx512 PRIVATE FBGEMM_STATIC)
-  target_compile_definitions(fbgemm_autovec PRIVATE FBGEMM_STATIC)
   target_compile_definitions(fbgemm PRIVATE FBGEMM_STATIC)
 else()
   message(FATAL_ERROR "Unsupported library type ${FBGEMM_LIBRARY_TYPE}")
diff --git a/bench/EmbeddingSpMDMNBitBenchmark.cc b/bench/EmbeddingSpMDMNBitBenchmark.cc
index d3bd928c70..336fd6b565 100644
--- a/bench/EmbeddingSpMDMNBitBenchmark.cc
+++ b/bench/EmbeddingSpMDMNBitBenchmark.cc
@@ -26,7 +26,6 @@
 #include "./BenchUtils.h"
 #include "fbgemm/Fbgemm.h"
 #include "fbgemm/FbgemmConvert.h"
-#include "src/EmbeddingSpMDMAutovec.h"
 #include "src/RefImplementations.h"
 
 using namespace std;
@@ -134,13 +133,11 @@ int run_benchmark(
   }
 
   vector<float> output_sls_ref(batch_size * embedding_dim);
-  vector<float> output_sls_autovec(output_sls_ref.size()),
-      output_slws_autovec(output_sls_ref.size());
   vector<float> output_slws_ref(output_sls_ref.size()),
       output_sls(output_sls_ref.size()), output_slws(output_sls_ref.size());
 
-  constexpr int NUM_WARMUP = 10;
-  constexpr int NUM_ITER = 100;
+  constexpr int NUM_WARMUP = 4;
+  constexpr int NUM_ITER = 10;
   // Only counts the number of bytes for reading embedding table and ignore
   // others. Should be good enough as long as embdding_dim is big enough.
   double bytes = lengths_sum * fused_embedding_dim;
@@ -151,10 +148,36 @@ int run_benchmark(
   for (bool has_weight : {false, true}) {
     vector<float>& output_ref = has_weight ? output_slws_ref : output_sls_ref;
-    vector<float>& output_autovec =
-        has_weight ? output_slws_autovec : output_sls_autovec;
-    bool success = false, success_ref = false, success_autovec = false;
+    bool success = false, success_ref = false;
+
+    if (use_32_bit_indices) {
+      success_ref = EmbeddingSpMDMNBit_ref(
+          bit_rate,
+          embedding_dim,
+          batch_size,
+          lengths_sum,
+          num_rows,
+          fused_embedding_table.data(),
+          indices_32.data(),
+          offsets.data(),
+          has_weight ?
weights.data() : nullptr, + normalize_by_lengths, + output_ref.data()); + } else { + success = EmbeddingSpMDMNBit_ref( + bit_rate, + embedding_dim, + batch_size, + lengths_sum, + num_rows, + fused_embedding_table.data(), + indices.data(), + offsets.data(), + has_weight ? weights.data() : nullptr, + normalize_by_lengths, + output_ref.data()); + } auto kernel_32 = GenerateEmbeddingSpMDMNBit( bit_rate, @@ -171,95 +194,6 @@ int run_benchmark( vector& output = has_weight ? output_slws : output_sls; for (bool flush_cache : {false, true}) { - // Reference implementation - double t_ref = measureWithWarmup( - [&]() { - if (use_32_bit_indices) { - success_ref = EmbeddingSpMDMNBit_ref( - bit_rate, - embedding_dim, - batch_size, - lengths_sum, - num_rows, - fused_embedding_table.data(), - indices_32.data(), - offsets.data(), - has_weight ? weights.data() : nullptr, - normalize_by_lengths, - output_ref.data()); - } else { - success_ref = EmbeddingSpMDMNBit_ref( - bit_rate, - embedding_dim, - batch_size, - lengths_sum, - num_rows, - fused_embedding_table.data(), - indices.data(), - offsets.data(), - has_weight ? weights.data() : nullptr, - normalize_by_lengths, - output_ref.data()); - } - }, - NUM_WARMUP, - NUM_ITER, - [&]() { - if (flush_cache) { - cache_evict(fused_embedding_table); - cache_evict(indices); - cache_evict(indices_32); - cache_evict(offsets); - cache_evict(weights); - cache_evict(output); - } - }); - - // Auto-vectorization implementation - double t_autovec = measureWithWarmup( - [&]() { - if (use_32_bit_indices) { - success_autovec = EmbeddingSpMDMNBit_autovec( - bit_rate, - embedding_dim, - batch_size, - lengths_sum, - num_rows, - fused_embedding_table.data(), - indices_32.data(), - offsets.data(), - has_weight ? weights.data() : nullptr, - normalize_by_lengths, - output_autovec.data()); - } else { - success_autovec = EmbeddingSpMDMNBit_autovec( - bit_rate, - embedding_dim, - batch_size, - lengths_sum, - num_rows, - fused_embedding_table.data(), - indices.data(), - offsets.data(), - has_weight ? weights.data() : nullptr, - normalize_by_lengths, - output_autovec.data()); - } - }, - NUM_WARMUP, - NUM_ITER, - [&]() { - if (flush_cache) { - cache_evict(fused_embedding_table); - cache_evict(indices); - cache_evict(indices_32); - cache_evict(offsets); - cache_evict(weights); - cache_evict(output); - } - }); - - // Hand-written AVX2/AVX512 implementation double t = measureWithWarmup( [&]() { if (use_32_bit_indices) { @@ -317,55 +251,37 @@ int run_benchmark( // has_weight ? 
output_slws_ref : output_sls_ref; if (success != success_ref) { assert( - false && - "ERROR: reference impl and JIT imp did not both succeed"); - } else if (success != success_autovec) { - assert( - false && - "ERROR: reference impl and auto-vec impl did not both succeed"); + false && "ERROR: refernce impl and JIT imp did not both succeed"); } else if (success) { for (size_t i = 0; i < output.size(); ++i) { assert(fabs(output[i] - output_ref[i]) < 1e-3); - assert(fabs(output_autovec[i] - output_ref[i]) < 1e-3); if (fabs(output[i] - output_ref[i]) >= 1e-3) { - cout << "asmjit vs ref : " << i << " " << output[i] << " " - << output_ref[i] << endl; - } - if (fabs(output_autovec[i] - output_ref[i]) >= 1e-3) { - cout << "autovec vec ref: " << i << " " << output_autovec[i] - << " " << output_ref[i] << endl; + cout << i << " " << output[i] << " " << output_ref[i] << endl; } } } } if (has_weight) { - cout << "SLW(WEIGHTED), "; + cout << setw(16) << "SLW(WEIGHTED) "; } else { - cout << "SLS, "; + cout << setw(16) << "SLS "; } if (flush_cache) { - cout << "cache flushed, "; + cout << setw(20) << "cache flushed"; } else { - cout << "cache not flushed, "; + cout << setw(20) << "cache not flushed"; } if (prefetch) { - cout << "prefetch on, "; + cout << setw(16) << "prefetch on"; } else { - cout << "prefetch off, "; + cout << setw(16) << "prefetch off"; } - cout << "b/w, " << bytes / 1e9 / t << ", GB/s, " - << "effective b/w, " << bytes_padded / 1e9 / t << ", GB/s, " - << "time, " << t << ", autovec b/w, " << bytes / 1e9 / t_autovec - << ", GB/s, " - << "autovec eff. b/w, " << bytes_padded / 1e9 / t_autovec - << ", GB/s, " - << "autovec time, " << t_autovec << ", ref b/w, " - << bytes / 1e9 / t_ref << ", GB/s, " - << "ref eff. b/w, " << bytes_padded / 1e9 / t_ref << ", GB/s, " - << "ref time, " << t_ref << ", autovec speedup, " - << t_ref / t_autovec << ", asmjit speedup, " << t_ref / t << endl; + cout << setw(8) << "b/w" << setw(10) << bytes / 1e9 / t << " GB/s" + << setw(20) << "effective b/w: " << setw(16) + << bytes_padded / 1e9 / t << "GB/s" << setw(8) << " time " + << setw(16) << t << endl; } // flush_cache } // has_weight return 0; @@ -379,7 +295,7 @@ int main() { vector> inputs(GetInputs_()); - for (int bit_rate : {4, 2}) { + for (int bit_rate : {2, 4}) { for (auto& input : inputs) { assert(input.size() > 3); batch_size = input[0]; @@ -387,9 +303,10 @@ int main() { embedding_dim = input[2]; average_len = input[3]; - cout << "bit_rate, " << bit_rate << ", batch size, " << batch_size - << ", num rows, " << num_rows << ", emb dim, " << embedding_dim - << ", avg length, " << average_len << endl; + cout << "bit_rate" << setw(6) << bit_rate << "batch size" << setw(6) + << batch_size << setw(10) << "num rows" << setw(16) << num_rows + << setw(10) << "emb dim" << setw(6) << embedding_dim << setw(16) + << "avg length" << setw(6) << average_len << endl; // args: batch sz, num rows, emb dim, avg len, normalize, use 32b, // prefetch cout << "64 bit indices, "; diff --git a/defs.bzl b/defs.bzl index 6aadcc25b8..d7dcbd30ec 100644 --- a/defs.bzl +++ b/defs.bzl @@ -138,10 +138,5 @@ def get_fbgemm_inline_avx512_srcs(msvc = False, buck = False): }) return asm_srcs if not msvc else intrinsics_srcs -def get_fbgemm_autovec_srcs(): - return [ - "src/EmbeddingSpMDMAutovec.cc", - ] - def get_fbgemm_tests(skip_tests = []): return native.glob(["test/*Test.cc"], exclude = skip_tests) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 73fa70dd49..cac90f07cf 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ 
b/fbgemm_gpu/CMakeLists.txt @@ -487,7 +487,6 @@ endif() set(fbgemm_sources_normal "${FBGEMM}/src/EmbeddingSpMDM.cc" - "${FBGEMM}/src/EmbeddingSpMDMAutovec.cc" "${FBGEMM}/src/EmbeddingSpMDMNBit.cc" "${FBGEMM}/src/QuantUtils.cc" "${FBGEMM}/src/RefImplementations.cc" diff --git a/include/fbgemm/Types.h b/include/fbgemm/Types.h index c6ca8a9281..e32a3cdc0d 100644 --- a/include/fbgemm/Types.h +++ b/include/fbgemm/Types.h @@ -12,25 +12,6 @@ #include #include -#ifndef __is_identifier -#define __is_identifier(x) 1 -#endif - -#define __has_keyword(__x) !(__is_identifier(__x)) - -// TODO: we're disabling native fp16 on Windows to workaround test failures -// due to "undefined symbol __gnu_h2f_ieee" error. We should follup on this -// later. -#if __has_keyword(__fp16) && !defined(_WIN32) -#define HAS_NATIVE_FP16_TYPE -typedef __fp16 native_fp16_t; -#elif __has_keyword(_Float16) && !defined(_WIN32) -#define HAS_NATIVE_FP16_TYPE -typedef _Float16 native_fp16_t; -#else -typedef void native_fp16_t; -#endif - namespace fbgemm { using float16 = std::uint16_t; @@ -180,7 +161,7 @@ static inline float16 cpu_float2half_rz(float f) { // Converts a 16-bit unsigned integer representation of a IEEE754 half-precision // float into an IEEE754 32-bit single-precision float -inline float cpu_half2float_ref(const float16 h) { +static inline float cpu_half2float(const float16 h) { // Get sign and exponent alone by themselves uint32_t sign_bit = (h >> f16_num_non_sign_bits) & 1; uint32_t exponent = (h >> f16_num_mantissa_bits) & f16_exponent_mask; @@ -217,18 +198,6 @@ inline float cpu_half2float_ref(const float16 h) { return ret; } -// Same as the previous function, but use the built-in fp16 to fp32 -// conversion provided by the compiler -inline float cpu_half2float(const float16 h) { -#ifdef HAS_NATIVE_FP16_TYPE - __fp16 h_fp16; - std::memcpy(&h_fp16, &h, sizeof(__fp16)); - return h_fp16; -#else - return cpu_half2float_ref(h); -#endif -} - static inline float cpu_bf162float(bfloat16 src) { float ret; uint32_t val_fp32 = @@ -243,12 +212,4 @@ static inline bfloat16 cpu_float2bfloat16(float src) { return (temp + (1u << 15)) >> 16; } -inline int64_t round_up(int64_t val, int64_t unit) { - return (val + unit - 1) / unit * unit; -} - -inline int64_t div_up(int64_t val, int64_t unit) { - return (val + unit - 1) / unit; -} - } // namespace fbgemm diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h index 5dc7a5d791..a9a4f13021 100644 --- a/include/fbgemm/Utils.h +++ b/include/fbgemm/Utils.h @@ -146,11 +146,6 @@ FBGEMM_API bool fbgemmHasAvx512VnniSupport(); */ FBGEMM_API bool fbgemmHasArmNeonSupport(); -/** - * @brief Are we running on a ARM SVE2 supported cpu? - */ -FBGEMM_API bool fbgemmHasArmSve2Support(); - /** * @brief Retrieve current CPU instruction set */ diff --git a/src/EmbeddingSpMDMAutovec.cc b/src/EmbeddingSpMDMAutovec.cc deleted file mode 100644 index d060d65604..0000000000 --- a/src/EmbeddingSpMDMAutovec.cc +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#define FBGEMM_EXPORTS -#include "./EmbeddingSpMDMAutovec.h" - -#include "fbgemm/FbgemmBuild.h" - -#include -#include -#include -#include -#include -#include - -using std::vector; - -namespace fbgemm { - -template -bool EmbeddingSpMDMNBit_autovec( - const int bit_rate, - const int64_t block_size, - const int64_t output_size, - const int64_t index_size, - const int64_t data_size, - const uint8_t* input, - const IndexType* indices, - const OffsetType* offsets_or_lengths, - const float* weights, // optional, can be null for non-weighted sum - const bool normalize_by_lengths, - OutType* out, - const bool is_weight_positional /*=false*/, - const bool use_offsets /*=true*/, - int64_t output_stride /*=-1*/, - int64_t input_stride /*=-1*/, - const bool scale_bias_last /*=true*/, - const bool is_bf16_out /*=false*/) { - assert((bit_rate == 2 || bit_rate == 4) && "bit_rate must be 2 or 4"); - const int num_elem_per_byte = 8 / bit_rate; - - if (output_stride == -1) { - output_stride = block_size; - } - - // block_size is the number of elements and fused_block_size is the size of - // an entire row, including scale and bias. - const auto scale_bias_offset = 2 * sizeof(float16); - if (input_stride == -1) { - input_stride = div_up(block_size, num_elem_per_byte) + scale_bias_offset; - } - int64_t current = 0; - const int64_t rounded_bs = round_up(block_size, num_elem_per_byte); - vector buf(rounded_bs); - for (int m = 0; m < output_size; ++m) { - memset(buf.data(), 0, sizeof(float) * rounded_bs); - int len = use_offsets ? offsets_or_lengths[m + 1] - offsets_or_lengths[m] - : offsets_or_lengths[m]; - if (current + len > index_size) { - return false; - } - constexpr int tile_size = 4; -#if _OPENMP >= 202011 -#pragma omp tile sizes(tile_size) -#endif - for (int i = 0; i < len; ++i) { - int64_t idx = indices[current]; - int64_t prefetch_idx = - indices[std::min(current + tile_size, index_size - 1)]; - do_prefetch( - reinterpret_cast(input + input_stride * prefetch_idx), - 1); - if (idx < 0 || idx >= data_size) { - return false; - } - - const float16* scale_bias = reinterpret_cast( - input + input_stride * idx + - (scale_bias_last ? div_up(block_size, num_elem_per_byte) : 0)); - - float weight = 1.0f; - if (weights) { - weight = weights[is_weight_positional ? i : current]; - } - const float scale = weight * cpu_half2float(scale_bias[0]); - const float bias = weight * cpu_half2float(scale_bias[1]); - - const int64_t offset = - input_stride * idx + (scale_bias_last ? 
0 : scale_bias_offset); - if (bit_rate == 4) { - const size_t halfbufsz = (block_size + 1) / 2; - assert(halfbufsz > 0); - thread_local vector buf1; - thread_local vector buf2; - buf1.resize(halfbufsz); - buf2.resize(halfbufsz); -#pragma omp simd - for (size_t j = 0; j < halfbufsz; ++j) { - uint8_t quantized1 = input[offset + j] & 0xf; - uint8_t quantized2 = input[offset + j] & 0xf0; - quantized2 = quantized2 >> 4; - buf1[j] = quantized1; - buf2[j] = quantized2; - } -#pragma omp simd - for (size_t j = 0; j < halfbufsz; ++j) { - buf[j * 2] = std::fma(scale, buf1[j], buf[j * 2] + bias); - buf[j * 2 + 1] = std::fma(scale, buf2[j], buf[j * 2 + 1] + bias); - } - } else if (bit_rate == 2) { - size_t qbufsz = (block_size + 3) / 4; - vector> qbufs(4); - for (int k = 0; k < 4; ++k) { - qbufs[k].resize(qbufsz); - } - vector masks{0x3, 0xC, 0x30, 0xC0}; -#pragma omp simd collapse(2) - for (size_t j = 0; j < qbufsz; ++j) { - for (size_t k = 0; k < 4; ++k) { - uint8_t quantized = input[offset + j] & masks[k]; - quantized = quantized >> (k * 2); - qbufs[k][j] = quantized; - } - } -#pragma omp simd collapse(2) - for (size_t j = 0; j < qbufsz; ++j) { - for (size_t k = 0; k < 4; ++k) { - buf[j * 4 + k] = - std::fma(scale, qbufs[k][j], buf[j * 4 + k] + bias); - } - } - } - ++current; - } - - if (normalize_by_lengths && len) { - float scale = 1.f / len; -#pragma omp simd - for (int j = 0; j < block_size; ++j) { - buf[j] *= scale; - } - } - if (std::is_same::value) { -#pragma omp simd - for (int j = 0; j < block_size; ++j) { - out[j] = buf[j]; - } - } else if (std::is_same::value && is_bf16_out) { -#pragma omp simd - for (int j = 0; j < block_size; ++j) { - out[j] = cpu_bf162float(buf[j]); - } - } else { -#pragma omp simd - for (int j = 0; j < block_size; ++j) { - out[j] = cpu_half2float(buf[j]); - } - } - out += output_stride; - } - return current == index_size; -} - -#define INSTANTIATE_SPMDM_BASE(INDEX_TYPE, OFFSET_TYPE, OUT_TYPE) \ - template FBGEMM_API bool EmbeddingSpMDMNBit_autovec( \ - const int bit_rate, \ - const int64_t block_size, \ - const int64_t output_size, \ - const int64_t index_size, \ - const int64_t data_size, \ - const uint8_t* input, \ - const INDEX_TYPE* indices, \ - const OFFSET_TYPE* offsets_or_lengths, \ - const float* weights, \ - const bool normalize_by_lengths, \ - OUT_TYPE* out, \ - const bool is_weight_positional, \ - const bool use_offsets, \ - int64_t output_stride, \ - int64_t input_stride, \ - const bool scale_bias_last, \ - const bool is_bf16_out); - -#define INSTANTIATE_SPMDM_OUT_T(INDEX_TYPE, OFFSET_TYPE) \ - INSTANTIATE_SPMDM_BASE(INDEX_TYPE, OFFSET_TYPE, float) \ - INSTANTIATE_SPMDM_BASE(INDEX_TYPE, OFFSET_TYPE, float16) - -#define INSTANTIATE_SPMDM_OFFSET_T(INDEX_TYPE) \ - INSTANTIATE_SPMDM_OUT_T(INDEX_TYPE, int32_t) \ - INSTANTIATE_SPMDM_OUT_T(INDEX_TYPE, int64_t) - -INSTANTIATE_SPMDM_OFFSET_T(int32_t) -INSTANTIATE_SPMDM_OFFSET_T(int64_t) - -#undef INSTANTIATE_SPMDM_OFFSET_T -#undef INSTANTIATE_SPMDM_OUT_T -#undef INSTANTIATE_SPMDM_BASE - -} // namespace fbgemm diff --git a/src/EmbeddingSpMDMAutovec.h b/src/EmbeddingSpMDMAutovec.h deleted file mode 100644 index cad66f4e80..0000000000 --- a/src/EmbeddingSpMDMAutovec.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include - -#include "fbgemm/ConvUtils.h" -#include "fbgemm/FbgemmI8Spmdm.h" -#include "fbgemm/Types.h" - -#ifdef _WIN32 -#define do_prefetch(...) -#else -#define do_prefetch(...) __builtin_prefetch(__VA_ARGS__) -#endif - -namespace fbgemm { - -template < - typename IndexType = std::int64_t, - typename OffsetType = std::int32_t, - typename OutType = float> -FBGEMM_API bool EmbeddingSpMDMNBit_autovec( - const int bit_rate, - const std::int64_t block_size, - const std::int64_t output_size, - const std::int64_t index_size, - const std::int64_t data_size, - const std::uint8_t* input, - const IndexType* indices, - const OffsetType* offsets_or_lengths, - const float* weights, // optional, can be null for non-weighted sum - const bool normalize_by_lengths, - OutType* out, - const bool is_weight_positional = false, - const bool use_offsets = true, - std::int64_t output_stride = -1, - std::int64_t input_stride = -1, - const bool scale_bias_last = true, - const bool is_bf16_out = false); - -} // namespace fbgemm diff --git a/src/EmbeddingSpMDMNBit.cc b/src/EmbeddingSpMDMNBit.cc index 9d9cadda97..174f02aec8 100644 --- a/src/EmbeddingSpMDMNBit.cc +++ b/src/EmbeddingSpMDMNBit.cc @@ -20,7 +20,6 @@ #include #include #include "./CodeCache.h" -#include "./EmbeddingSpMDMAutovec.h" #include "./MaskAvx2.h" #include "./RefImplementations.h" #include "fbgemm/SimdUtils.h" @@ -1126,34 +1125,6 @@ typename EmbeddingSpMDMKernelSignature:: out, internal::avx2_ps_or_epi32_combined_mask); }; - } else if (fbgemmHasArmSve2Support()) { - return [=](int64_t output_size, - int64_t index_size, - int64_t data_size, - const uint8_t* input, - const indxType* indices, - const offsetType* offsets_or_lengths, - const float* weights, - outType* out) { - return EmbeddingSpMDMNBit_autovec( - bit_rate, - block_size, - output_size, - index_size, - data_size, - input, - indices, - offsets_or_lengths, - weights, - normalize_by_lengths, - out, - is_weight_positional, - use_offsets, - output_stride, - input_stride, - scale_bias_last, - is_bf16_out); - }; } else { #ifdef VLOG VLOG(0) << "AVX2 or AVX512 not found, taking the slow path"; diff --git a/src/Utils.cc b/src/Utils.cc index 9f66b23c09..c61b1a4dd1 100644 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -386,10 +386,6 @@ bool fbgemmHasArmNeonSupport() { return (cpuinfo_has_arm_neon()); } -bool fbgemmHasArmSve2Support() { - return (cpuinfo_has_arm_sve2()); -} - void fbgemmPartition1D( int thread_id, int num_threads, diff --git a/third_party/cpuinfo b/third_party/cpuinfo index 76d5e8f5b5..ed8b86a253 160000 --- a/third_party/cpuinfo +++ b/third_party/cpuinfo @@ -1 +1 @@ -Subproject commit 76d5e8f5b563daa65340a60fce0e9aec73a936df +Subproject commit ed8b86a253800bafdb7b25c5c399f91bff9cb1f3 From 8a015d3b9ed67aeda33238a038e2785611ff0342 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Mon, 20 Nov 2023 20:56:23 -0800 Subject: [PATCH 33/73] Update GitHub checkout actions (#2146) Summary: - Update GitHub checkout actions Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2146 Reviewed By: sryap Differential Revision: D51486729 Pulled By: q10 fbshipit-source-id: 3510f5273465e8bc594bffeb21f0006a18a37097 --- .github/workflows/fbgemm_ci.yml | 8 ++++---- .github/workflows/fbgemm_gpu_ci.yml | 6 +++--- .github/workflows/fbgemm_gpu_cpu_nightly.yml | 4 ++-- .github/workflows/fbgemm_gpu_cpu_release.yml | 4 ++-- .github/workflows/fbgemm_gpu_cuda_nightly.yml | 6 +++++- .github/workflows/fbgemm_gpu_cuda_release.yml | 2 +- .github/workflows/fbgemm_gpu_docs.yml | 2 +- 
.github/workflows/fbgemm_gpu_lint.yml | 2 +- .github/workflows/fbgemm_gpu_pip.yml | 6 +++--- fbgemm_gpu/bench/sparse_ops_benchmark.py | 1 - 10 files changed, 22 insertions(+), 19 deletions(-) diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index 00ebfa358d..19b515c031 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -48,7 +48,7 @@ jobs: git config --global --add safe.directory '*' - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true @@ -86,7 +86,7 @@ jobs: steps: - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true @@ -127,7 +127,7 @@ jobs: git config --global --add safe.directory '*' - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true @@ -159,7 +159,7 @@ jobs: steps: - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index ba89c57329..43006e5a3e 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -57,7 +57,7 @@ jobs: git config --global --add safe.directory '*' - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true @@ -126,7 +126,7 @@ jobs: git config --global --add safe.directory '*' - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true @@ -191,7 +191,7 @@ jobs: git config --global --add safe.directory '*' - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true diff --git a/.github/workflows/fbgemm_gpu_cpu_nightly.yml b/.github/workflows/fbgemm_gpu_cpu_nightly.yml index 5501ee89e3..8c5efd66fe 100644 --- a/.github/workflows/fbgemm_gpu_cpu_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cpu_nightly.yml @@ -71,7 +71,7 @@ jobs: run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true @@ -136,7 +136,7 @@ jobs: run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true diff --git a/.github/workflows/fbgemm_gpu_cpu_release.yml b/.github/workflows/fbgemm_gpu_cpu_release.yml index 1300b4781f..aba87df783 100644 --- a/.github/workflows/fbgemm_gpu_cpu_release.yml +++ b/.github/workflows/fbgemm_gpu_cpu_release.yml @@ -68,7 +68,7 @@ jobs: run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true @@ -133,7 +133,7 @@ jobs: run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml index be1fc32fc4..7b001fa180 100644 --- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -70,7 +70,7 @@ jobs: run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which - name: Checkout the Repository - uses: 
actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true @@ -140,6 +140,10 @@ jobs: needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which + + # Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old - name: Checkout the Repository uses: actions/checkout@v3 with: diff --git a/.github/workflows/fbgemm_gpu_cuda_release.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml index 838be62996..74b79a88dc 100644 --- a/.github/workflows/fbgemm_gpu_cuda_release.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -74,7 +74,7 @@ jobs: run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true diff --git a/.github/workflows/fbgemm_gpu_docs.yml b/.github/workflows/fbgemm_gpu_docs.yml index bcc6a93d2a..d3a69bca5b 100644 --- a/.github/workflows/fbgemm_gpu_docs.yml +++ b/.github/workflows/fbgemm_gpu_docs.yml @@ -44,7 +44,7 @@ jobs: run: yum update -y; yum install -y binutils findutils git pciutils rsync sudo tar wget which - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true diff --git a/.github/workflows/fbgemm_gpu_lint.yml b/.github/workflows/fbgemm_gpu_lint.yml index 41c8a08967..3dccceacca 100644 --- a/.github/workflows/fbgemm_gpu_lint.yml +++ b/.github/workflows/fbgemm_gpu_lint.yml @@ -39,7 +39,7 @@ jobs: steps: - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Miniconda run: . $PRELUDE; setup_miniconda $HOME/miniconda diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml index f0858138fb..4b9e720352 100644 --- a/.github/workflows/fbgemm_gpu_pip.yml +++ b/.github/workflows/fbgemm_gpu_pip.yml @@ -66,7 +66,7 @@ jobs: run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -117,7 +117,7 @@ jobs: steps: - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install NVIDIA Drivers and NVIDIA-Docker Runtime uses: pytorch/test-infra/.github/actions/setup-nvidia@main @@ -182,7 +182,7 @@ jobs: git config --global --add safe.directory '*' - name: Checkout the Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Display System Info run: . 
$PRELUDE; print_system_info diff --git a/fbgemm_gpu/bench/sparse_ops_benchmark.py b/fbgemm_gpu/bench/sparse_ops_benchmark.py index b06ace2f74..afca7d4678 100644 --- a/fbgemm_gpu/bench/sparse_ops_benchmark.py +++ b/fbgemm_gpu/bench/sparse_ops_benchmark.py @@ -877,7 +877,6 @@ def ben(fn, name, ad_indices, ad_lengths, batch_offsets, num_ads_in_batch): def block_bucketize_sparse_features_bench( row_size: int, batch_size: int, bucket_num: int, input_precision: str ) -> None: - dtype = torch.int if input_precision == "int": dtype = torch.int From 2b3f861528a9767902561c197d4bcb5bcdc9db38 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Mon, 20 Nov 2023 22:44:59 -0800 Subject: [PATCH 34/73] Back out "RW Dist change to support uneven sharding" Summary: X-link: https://github.com/pytorch/torchrec/pull/1532 Revert this diff due to errors https://fb.workplace.com/groups/1069285536500339/permalink/6853872844708217/ Reviewed By: dstaay-fb, joshuadeng Differential Revision: D51480773 fbshipit-source-id: ffb1418334cf81bc5a1541918f0548986a9e5678 --- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 7acefdf18a..8291927f45 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -306,8 +306,7 @@ void _block_bucketize_sparse_features_cpu( const index_t* const block_sizes_data = block_sizes.data_ptr(); offset_t* batch_sizes_data = nullptr; const auto variable_batch_size = batch_size_per_feature.has_value(); - const auto variable_bucket_sizes = block_bucketize_pos.has_value() && - block_bucketize_pos.value().size() != 0; + const auto variable_bucket_sizes = block_bucketize_pos.has_value(); using uindex_t = std::make_unsigned_t; using uoffset_t = std::make_unsigned_t; std::vector lower_bounds(indices.numel(), 0); From e7787930f058d7ca3a479ec1f99b60c5d6a2572f Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 21 Nov 2023 10:51:04 -0800 Subject: [PATCH 35/73] Re-organize int8_ops (#2145) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2145 - Migrate int8_ops code into its own directory Reviewed By: spcyppt Differential Revision: D51476486 fbshipit-source-id: 552f2e62b7ac0b01e5ae9ffa7c2a87526ea89d5d --- .../src/split_embeddings_utils/generate_vbe_metadata.cu | 4 ++++ fbgemm_gpu/src/split_embeddings_utils/get_infos_metadata.cu | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/split_embeddings_utils/generate_vbe_metadata.cu b/fbgemm_gpu/src/split_embeddings_utils/generate_vbe_metadata.cu index fe65a8d95e..a67cf1065f 100644 --- a/fbgemm_gpu/src/split_embeddings_utils/generate_vbe_metadata.cu +++ b/fbgemm_gpu/src/split_embeddings_utils/generate_vbe_metadata.cu @@ -14,6 +14,8 @@ using Tensor = at::Tensor; using namespace fbgemm_gpu; +namespace { + __global__ __launch_bounds__(kMaxThreads) void generate_vbe_metadata_foreach_sample_kernel( at::PackedTensorAccessor32 @@ -71,6 +73,8 @@ __launch_bounds__(kMaxThreads) void generate_vbe_metadata_foreach_sample_kernel( reinterpret_cast(&b_)[0]; } +} // namespace + /// Generate VBE metadata namely output_offsets and b_t_map /// /// row_output_offsets A 1D tensor that contains the output offset of each b diff --git a/fbgemm_gpu/src/split_embeddings_utils/get_infos_metadata.cu b/fbgemm_gpu/src/split_embeddings_utils/get_infos_metadata.cu index e8fdcc1411..d9f40e1f63 100644 --- 
a/fbgemm_gpu/src/split_embeddings_utils/get_infos_metadata.cu
+++ b/fbgemm_gpu/src/split_embeddings_utils/get_infos_metadata.cu
@@ -59,7 +59,7 @@ DLL_PUBLIC std::tuple adjust_info_B_num_bits(
   return {info_B_num_bits, info_B_mask};
 }

-std::tuple DLL_PUBLIC
+DLL_PUBLIC std::tuple
 get_infos_metadata(Tensor unused, int64_t B, int64_t T) {
   return adjust_info_B_num_bits(B, T);
 }

From 543632050ad87b6368927b8570be991f7a2e9959 Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Tue, 21 Nov 2023 12:22:14 -0800
Subject: [PATCH 36/73] Update GitHub checkout actions, pt2 (#2149)

Summary:
- Fix remaining issues with updating the GitHub checkout actions

Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2149

Reviewed By: sryap

Differential Revision: D51502939

Pulled By: q10

fbshipit-source-id: f05f7aa2e1abdc73b0cbf55887001ac9bf6e1d2b
---
 .github/workflows/fbgemm_gpu_cuda_nightly.yml | 3 ---
 .github/workflows/fbgemm_gpu_pip.yml          | 3 ++-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml
index 7b001fa180..f5ed26aec3 100644
--- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml
+++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml
@@ -140,9 +140,6 @@ jobs:
     needs: build_artifact

     steps:
-      - name: Setup Build Container
-        run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which
-
       # Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
       - name: Checkout the Repository
         uses: actions/checkout@v3
         with:
diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml
index 4b9e720352..d9f6dc2ff6 100644
--- a/.github/workflows/fbgemm_gpu_pip.yml
+++ b/.github/workflows/fbgemm_gpu_pip.yml
@@ -116,8 +116,9 @@ jobs:
         cuda-version-publish: [ "11.8.0" ]

     steps:
+      # Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
       - name: Checkout the Repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

       - name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
         uses: pytorch/test-infra/.github/actions/setup-nvidia@main

From 37111f54f0b5dc4d9feca41afb1e952b6b94fd07 Mon Sep 17 00:00:00 2001
From: Sarunya Pumma
Date: Tue, 21 Nov 2023 13:07:25 -0800
Subject: [PATCH 37/73] Recompute linear_cache_indices for pipeline
 prefetching (#2147)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2147

When pipeline prefetching is enabled (`prefetch_pipeline=True`) for `EmbeddingLocation.MANAGED_CACHING`, TBE has to update `lxu_cache_locations` to ensure cache consistency before the backward pass. The `lxu_cache_locations` update requires `linear_cache_indices` as an input.

Prior to this diff, TBE keeps `linear_cache_indices` alive after prefetching until the tensor is used for the `lxu_cache_locations` update. This puts significant pressure on memory capacity, limiting the enablement of pipeline prefetching for some models.

This diff addresses the memory limitation issue by recomputing `linear_cache_indices` when it is needed.
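In effect, the change trades a small amount of recompute for the saved memory. A minimal sketch of the idea, using the same fbgemm op that the updated module calls (the tensor names here are illustrative, not the module's exact attributes):

```python
# Rebuild the linearized cache indices on demand from the raw indices and
# offsets saved during forward(), instead of keeping the prefetch-time
# tensor alive across the whole forward/backward window.
linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices(
    cache_hash_size_cumsum,  # cumulative hash-size offsets per table
    indices,                 # raw indices saved in forward()
    offsets,                 # raw offsets saved in forward()
)
# The recomputed tensor then feeds the lxu_cache_locations update that runs
# before the backward pass.
```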
Reviewed By: jspark1105 Differential Revision: D50983176 fbshipit-source-id: 050c4bb59db4697a5d53d09b52e39e101ecd50ee --- ...t_table_batched_embeddings_ops_training.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py index cfea8e1f88..5751817173 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py @@ -280,7 +280,6 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module): record_cache_metrics: RecordCacheMetrics uvm_cache_stats: torch.Tensor local_uvm_cache_stats: torch.Tensor - linear_cache_indices_list: List[Tensor] def __init__( # noqa C901 self, @@ -941,6 +940,11 @@ def forward( # noqa: C901 B_offsets=vbe_metadata.B_offsets, max_B=vbe_metadata.max_B, ) + + # Storing indices and offsets for linear_cache_indices recomputation + self._indices = indices + self._offsets = offsets + self.step += 1 if len(self.timesteps_prefetched) == 0: self._prefetch(indices, offsets) @@ -1234,8 +1238,6 @@ def _prefetch(self, indices: Tensor, offsets: Tensor) -> None: ) self.lxu_cache_locations_list.append(lxu_cache_locations) - if self.prefetch_pipeline: - self.linear_cache_indices_list.append(linear_cache_indices) if self.gather_uvm_cache_stats: # Accumulate local_uvm_cache_stats (int32) into uvm_cache_stats (int64). @@ -1255,8 +1257,6 @@ def _prefetch_tensors_record_stream( for t in self.lxu_cache_locations_list: t.record_stream(forward_stream) - for t in self.linear_cache_indices_list: - t.record_stream(forward_stream) def _update_cache_miss_counter( self, @@ -1580,8 +1580,9 @@ def _apply_cache_state( 0, device=self.current_device, dtype=torch.int32 ).fill_(-1) self.lxu_cache_locations = self.lxu_cache_locations_empty + self._indices = self.lxu_cache_locations_empty + self._offsets = self.lxu_cache_locations_empty self.prefetch_stream: Optional[torch.cuda.Stream] = None - self.linear_cache_indices_list = [] self._init_uvm_cache_stats() @@ -1796,7 +1797,12 @@ def _update_cache_counter_and_locations( self.lxu_cache_locations, ) - linear_cache_indices = self.linear_cache_indices_list.pop(0) + # Recompute linear_cache_indices + linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices( + self.cache_hash_size_cumsum, + self._indices, + self._offsets, + ) lxu_cache_locations_new = torch.ops.fbgemm.lxu_cache_lookup( linear_cache_indices, self.lxu_cache_state, From f49dea6a16f153584b926203f8b3a1ee5801dea7 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Wed, 22 Nov 2023 10:34:42 -0800 Subject: [PATCH 38/73] Disable @optests.generate_opcheck_tests in TBE unit tests (#2152) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2152 `optests.generate_opcheck_tests` generates test cases that cause a lot of failures (see P887745303). We will investigate this and put the decorator back once all the failures are resolved. 
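For reference, the generated-tests decorator being disabled wraps the test class in a single line (taken from the diff below), so re-enabling it later is a one-line change:

```python
# Sketch of how the op-check test generator is applied to the TBE test class;
# `additional_decorators` is defined earlier in the test file.
@optests.generate_opcheck_tests(fast=True, additional_decorators=additional_decorators)
class SplitTableBatchedEmbeddingsTest(unittest.TestCase):
    ...
```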
Reviewed By: zou3519 Differential Revision: D51512895 fbshipit-source-id: 504e7da46e8b94f1031436f6ba44490648f19e79 --- fbgemm_gpu/test/split_table_batched_embeddings_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index 641b3c1e31..c7e00215d0 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -178,7 +178,6 @@ def format_ref_tensors_in_mixed_B_layout( } -@optests.generate_opcheck_tests(fast=True, additional_decorators=additional_decorators) class SplitTableBatchedEmbeddingsTest(unittest.TestCase): def execute_forward_( # noqa C901 self, From 84c7b278be31ea104ae8ae86ee197edbb0103e7a Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 22 Nov 2023 21:55:58 -0800 Subject: [PATCH 39/73] Refactor embedding_bounds_check (#2155) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2155 - Migrate embedding_bounds_check out of codegen and into split_embeddings_utils Reviewed By: r-barnes Differential Revision: D51536598 fbshipit-source-id: 76f5eac707ba19688620451b6a270a7082045dd1 --- fbgemm_gpu/CMakeLists.txt | 6 +++--- fbgemm_gpu/docs/Doxyfile.in | 4 ++-- .../split_embeddings_utils}/embedding_bounds_check.cu | 3 +-- .../embedding_bounds_check_cpu.cpp} | 0 .../embedding_bounds_check_gpu.cpp} | 2 -- fbgemm_gpu/test/split_table_batched_embeddings_test.py | 9 +-------- 6 files changed, 7 insertions(+), 17 deletions(-) rename fbgemm_gpu/{codegen => src/split_embeddings_utils}/embedding_bounds_check.cu (99%) rename fbgemm_gpu/{codegen/embedding_bounds_check_host_cpu.cpp => src/split_embeddings_utils/embedding_bounds_check_cpu.cpp} (100%) rename fbgemm_gpu/{codegen/embedding_bounds_check_host.cpp => src/split_embeddings_utils/embedding_bounds_check_gpu.cpp} (96%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index cac90f07cf..8af7862213 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -560,7 +560,6 @@ set(fbgemm_gpu_sources_static_cpu codegen/embedding_forward_split_cpu.cpp codegen/embedding_forward_quantized_host_cpu.cpp codegen/embedding_backward_dense_host_cpu.cpp - codegen/embedding_bounds_check_host_cpu.cpp src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp @@ -580,13 +579,13 @@ set(fbgemm_gpu_sources_static_cpu src/split_embeddings_cache/lru_cache_populate_byte.cpp src/split_embeddings_cache/lxu_cache.cpp src/split_embeddings_cache/split_embeddings_cache_ops.cpp + src/split_embeddings_utils/embedding_bounds_check_cpu.cpp codegen/batch_index_select_dim0_cpu_host.cpp) if(NOT FBGEMM_CPU_ONLY) list(APPEND fbgemm_gpu_sources_static_cpu codegen/embedding_forward_quantized_host.cpp codegen/embedding_backward_dense_host.cpp - codegen/embedding_bounds_check_host.cpp src/memory_utils/memory_utils.cpp src/memory_utils/memory_utils_ops.cpp src/layout_transform_ops/layout_transform_ops_gpu.cpp @@ -598,6 +597,7 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/split_embeddings_cache_ops.cu src/metric_ops/metric_ops_host.cpp src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp + src/split_embeddings_utils/embedding_bounds_check_gpu.cpp src/input_combine_ops/input_combine_gpu.cpp codegen/batch_index_select_dim0_host.cpp) @@ -627,7 +627,6 @@ endif() if(NOT FBGEMM_CPU_ONLY) 
set(fbgemm_gpu_sources_static_gpu - codegen/embedding_bounds_check.cu codegen/embedding_forward_quantized_split_lookup.cu src/memory_utils/memory_utils.cu src/memory_utils/memory_utils_ops.cu @@ -691,6 +690,7 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/lxu_cache.cu src/split_embeddings_cache/linearize_cache_indices.cu src/split_embeddings_cache/reset_weight_momentum.cu + src/split_embeddings_utils/embedding_bounds_check.cu src/split_embeddings_utils/generate_vbe_metadata.cu src/split_embeddings_utils/get_infos_metadata.cu src/split_embeddings_utils/radix_sort_pairs.cu diff --git a/fbgemm_gpu/docs/Doxyfile.in b/fbgemm_gpu/docs/Doxyfile.in index 447ac5f62d..3e5c8c20c8 100644 --- a/fbgemm_gpu/docs/Doxyfile.in +++ b/fbgemm_gpu/docs/Doxyfile.in @@ -932,10 +932,10 @@ INPUT = "../include/fbgemm_gpu" \ "../codegen/embedding_backward_split_host_cpu_template.cpp" \ "../codegen/embedding_forward_quantized_host.cpp" \ "../codegen/embedding_forward_quantized_host_cpu.cpp" \ - "../codegen/embedding_bounds_check_host.cpp" \ - "../codegen/embedding_bounds_check_host_cpu.cpp" \ "../codegen/embedding_backward_dense_host.cpp" \ "../codegen/embedding_forward_template_helpers.cuh " \ + "../src/split_embeddings_utils/embedding_bounds_check_cpu.cpp" \ + "../src/split_embeddings_utils/embedding_bounds_check_gpu.cpp" \ "../src/permute_pooled_embedding_ops_gpu.cpp" \ # This tag can be used to specify the character encoding of the source files diff --git a/fbgemm_gpu/codegen/embedding_bounds_check.cu b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu similarity index 99% rename from fbgemm_gpu/codegen/embedding_bounds_check.cu rename to fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu index a93be2ba6c..700d8974d8 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check.cu +++ b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu @@ -6,10 +6,9 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include "fbgemm_gpu/embedding_backward_template_helpers.cuh" - #include #include +#include "fbgemm_gpu/embedding_backward_template_helpers.cuh" using Tensor = at::Tensor; using namespace fbgemm_gpu; diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_cpu.cpp similarity index 100% rename from fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp rename to fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_cpu.cpp diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp similarity index 96% rename from fbgemm_gpu/codegen/embedding_bounds_check_host.cpp rename to fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp index afa52729b7..77f68eb1df 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp +++ b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp @@ -9,9 +9,7 @@ #include #include #include -#include #include - #include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index c7e00215d0..302f35f48b 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -73,19 +73,12 @@ if open_source: # pyre-ignore[21] - from test_utils import ( - gpu_available, - gpu_unavailable, - gradcheck, - optests, - TEST_WITH_ROCM, - ) + from test_utils import gpu_available, gpu_unavailable, gradcheck, TEST_WITH_ROCM else: from fbgemm_gpu.test.test_utils import ( gpu_available, gpu_unavailable, gradcheck, - optests, TEST_WITH_ROCM, ) From b40f4197174bc470bc42ede8c8c6af432058e97a Mon Sep 17 00:00:00 2001 From: Vladimir Matveev Date: Fri, 24 Nov 2023 11:02:12 -0800 Subject: [PATCH 40/73] Back out "Refactor embedding_bounds_check" (#2156) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2156 Original commit changeset: 76f5eac707ba Original Phabricator Diff: D51536598 Reviewed By: q10 Differential Revision: D51552986 fbshipit-source-id: 421b9bf9b07c77ca8abfafa0908e5d9aa7bc87b7 --- fbgemm_gpu/CMakeLists.txt | 6 +++--- .../embedding_bounds_check.cu | 3 ++- .../embedding_bounds_check_host.cpp} | 2 ++ .../embedding_bounds_check_host_cpu.cpp} | 0 fbgemm_gpu/docs/Doxyfile.in | 4 ++-- fbgemm_gpu/test/split_table_batched_embeddings_test.py | 9 ++++++++- 6 files changed, 17 insertions(+), 7 deletions(-) rename fbgemm_gpu/{src/split_embeddings_utils => codegen}/embedding_bounds_check.cu (99%) rename fbgemm_gpu/{src/split_embeddings_utils/embedding_bounds_check_gpu.cpp => codegen/embedding_bounds_check_host.cpp} (96%) rename fbgemm_gpu/{src/split_embeddings_utils/embedding_bounds_check_cpu.cpp => codegen/embedding_bounds_check_host_cpu.cpp} (100%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 8af7862213..cac90f07cf 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -560,6 +560,7 @@ set(fbgemm_gpu_sources_static_cpu codegen/embedding_forward_split_cpu.cpp codegen/embedding_forward_quantized_host_cpu.cpp codegen/embedding_backward_dense_host_cpu.cpp + codegen/embedding_bounds_check_host_cpu.cpp src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp @@ -579,13 +580,13 @@ 
set(fbgemm_gpu_sources_static_cpu src/split_embeddings_cache/lru_cache_populate_byte.cpp src/split_embeddings_cache/lxu_cache.cpp src/split_embeddings_cache/split_embeddings_cache_ops.cpp - src/split_embeddings_utils/embedding_bounds_check_cpu.cpp codegen/batch_index_select_dim0_cpu_host.cpp) if(NOT FBGEMM_CPU_ONLY) list(APPEND fbgemm_gpu_sources_static_cpu codegen/embedding_forward_quantized_host.cpp codegen/embedding_backward_dense_host.cpp + codegen/embedding_bounds_check_host.cpp src/memory_utils/memory_utils.cpp src/memory_utils/memory_utils_ops.cpp src/layout_transform_ops/layout_transform_ops_gpu.cpp @@ -597,7 +598,6 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/split_embeddings_cache_ops.cu src/metric_ops/metric_ops_host.cpp src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp - src/split_embeddings_utils/embedding_bounds_check_gpu.cpp src/input_combine_ops/input_combine_gpu.cpp codegen/batch_index_select_dim0_host.cpp) @@ -627,6 +627,7 @@ endif() if(NOT FBGEMM_CPU_ONLY) set(fbgemm_gpu_sources_static_gpu + codegen/embedding_bounds_check.cu codegen/embedding_forward_quantized_split_lookup.cu src/memory_utils/memory_utils.cu src/memory_utils/memory_utils_ops.cu @@ -690,7 +691,6 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/lxu_cache.cu src/split_embeddings_cache/linearize_cache_indices.cu src/split_embeddings_cache/reset_weight_momentum.cu - src/split_embeddings_utils/embedding_bounds_check.cu src/split_embeddings_utils/generate_vbe_metadata.cu src/split_embeddings_utils/get_infos_metadata.cu src/split_embeddings_utils/radix_sort_pairs.cu diff --git a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu b/fbgemm_gpu/codegen/embedding_bounds_check.cu similarity index 99% rename from fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu rename to fbgemm_gpu/codegen/embedding_bounds_check.cu index 700d8974d8..a93be2ba6c 100644 --- a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu +++ b/fbgemm_gpu/codegen/embedding_bounds_check.cu @@ -6,9 +6,10 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include "fbgemm_gpu/embedding_backward_template_helpers.cuh" + #include #include -#include "fbgemm_gpu/embedding_backward_template_helpers.cuh" using Tensor = at::Tensor; using namespace fbgemm_gpu; diff --git a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp similarity index 96% rename from fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp rename to fbgemm_gpu/codegen/embedding_bounds_check_host.cpp index 77f68eb1df..afa52729b7 100644 --- a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp @@ -9,7 +9,9 @@ #include #include #include +#include #include + #include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; diff --git a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_cpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp similarity index 100% rename from fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_cpu.cpp rename to fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp diff --git a/fbgemm_gpu/docs/Doxyfile.in b/fbgemm_gpu/docs/Doxyfile.in index 3e5c8c20c8..447ac5f62d 100644 --- a/fbgemm_gpu/docs/Doxyfile.in +++ b/fbgemm_gpu/docs/Doxyfile.in @@ -932,10 +932,10 @@ INPUT = "../include/fbgemm_gpu" \ "../codegen/embedding_backward_split_host_cpu_template.cpp" \ "../codegen/embedding_forward_quantized_host.cpp" \ "../codegen/embedding_forward_quantized_host_cpu.cpp" \ + "../codegen/embedding_bounds_check_host.cpp" \ + "../codegen/embedding_bounds_check_host_cpu.cpp" \ "../codegen/embedding_backward_dense_host.cpp" \ "../codegen/embedding_forward_template_helpers.cuh " \ - "../src/split_embeddings_utils/embedding_bounds_check_cpu.cpp" \ - "../src/split_embeddings_utils/embedding_bounds_check_gpu.cpp" \ "../src/permute_pooled_embedding_ops_gpu.cpp" \ # This tag can be used to specify the character encoding of the source files diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index 302f35f48b..c7e00215d0 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -73,12 +73,19 @@ if open_source: # pyre-ignore[21] - from test_utils import gpu_available, gpu_unavailable, gradcheck, TEST_WITH_ROCM + from test_utils import ( + gpu_available, + gpu_unavailable, + gradcheck, + optests, + TEST_WITH_ROCM, + ) else: from fbgemm_gpu.test.test_utils import ( gpu_available, gpu_unavailable, gradcheck, + optests, TEST_WITH_ROCM, ) From 934d881c1fbf8146717d2be9038dfc81993fbeab Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Mon, 27 Nov 2023 16:06:23 -0800 Subject: [PATCH 41/73] Remove unused pyre-ignore in TBE tests (#2162) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2162 As title Reviewed By: q10 Differential Revision: D51600941 fbshipit-source-id: 514d6b52cbf2d25c7ed1a87d12390cdd49e5374e --- .../split_table_batched_embeddings_test.py | 29 ++----------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index c7e00215d0..250948db12 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -4370,14 +4370,9 @@ def execute_nbit_forward_( # noqa C901 # Initialize and insert Array index remapping based data structure 
index_remappings_array = [] for t in range(T): - # pyre-fixme[6]: For 1st param expected `dtype` but got `Union[int, - # str]`. indice_t = (indices.view(T, B, L))[t].long().view(-1).to(current_device) dense_indice_t = ( - (dense_indices.view(T, B, L))[t].view(-1) - # pyre-fixme[6]: For 1st param expected `dtype` but got `Union[int, - # str]`. - .to(current_device) + (dense_indices.view(T, B, L))[t].view(-1).to(current_device) ) index_remappings_array_t = torch.tensor( [-1] * original_E, @@ -5143,19 +5138,13 @@ def test_pruning( index_remappings_array_offsets = torch.empty( T + 1, dtype=torch.int64, - # pyre-fixme[6]: For 3rd param expected `Union[None, str, device]` but - # got `Union[int, str]`. device=current_device, ) index_remappings_array_offsets[0] = 0 for t in range(T): - # pyre-fixme[6]: For 1st param expected `dtype` but got `Union[int, str]`. indice_t = (indices.view(T, B, L))[t].long().view(-1).to(current_device) dense_indice_t = ( - (dense_indices.view(T, B, L))[t].view(-1) - # pyre-fixme[6]: For 1st param expected `dtype` but got `Union[int, - # str]`. - .to(current_device) + (dense_indices.view(T, B, L))[t].view(-1).to(current_device) ) selected_indices = torch.add(indice_t, t * original_E)[:E] index_remappings_array[selected_indices] = dense_indice_t @@ -5174,26 +5163,12 @@ def test_pruning( index_remappings_array, index_remappings_array_offsets, ) = ( - # pyre-fixme[6]: For 1st param expected `dtype` but got `Union[int, - # str]`. indices.to(current_device), - # pyre-fixme[6]: For 1st param expected `dtype` but got `Union[int, - # str]`. dense_indices.to(current_device), - # pyre-fixme[6]: For 1st param expected `dtype` but got `Union[int, - # str]`. offsets.to(current_device), - # pyre-fixme[6]: For 1st param expected `dtype` but got `Union[int, - # str]`. hash_table.to(current_device), - # pyre-fixme[6]: For 1st param expected `dtype` but got `Union[int, - # str]`. hash_table_offsets.to(current_device), - # pyre-fixme[6]: For 1st param expected `dtype` but got `Union[int, - # str]`. index_remappings_array.to(current_device), - # pyre-fixme[6]: For 1st param expected `dtype` but got `Union[int, - # str]`. 
index_remappings_array_offsets.to(current_device), ) From 753bc107576c6ff94e46a0da557ee318b28c6bba Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Mon, 27 Nov 2023 16:28:10 -0800 Subject: [PATCH 42/73] Fix run-lint issue in OSS (#2161) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2161 As title Reviewed By: q10 Differential Revision: D51601188 fbshipit-source-id: 1b337ae4890f8ad06c36117ad2ac82835c6f7165 --- fbgemm_gpu/test/split_table_batched_embeddings_test.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index 250948db12..f523773d25 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -73,19 +73,12 @@ if open_source: # pyre-ignore[21] - from test_utils import ( - gpu_available, - gpu_unavailable, - gradcheck, - optests, - TEST_WITH_ROCM, - ) + from test_utils import gpu_available, gpu_unavailable, gradcheck, TEST_WITH_ROCM else: from fbgemm_gpu.test.test_utils import ( gpu_available, gpu_unavailable, gradcheck, - optests, TEST_WITH_ROCM, ) From ba2b921c5e0f9d043aa2cd64e7eb9973db248982 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Mon, 27 Nov 2023 16:55:46 -0800 Subject: [PATCH 43/73] Annotate unused params (#2158) Summary: - Annotate unused params to unblock compilation in strict mode (see https://github.com/pytorch/FBGEMM/issues/2157) Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2158 Reviewed By: spcyppt Differential Revision: D51595444 Pulled By: q10 fbshipit-source-id: 86011c0398235e4240da40f991bc9d9623e2fa27 --- src/EmbeddingSpMDM.cc | 4 ++-- src/QuantUtils.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/EmbeddingSpMDM.cc b/src/EmbeddingSpMDM.cc index 144f8c0730..8aedf284dd 100644 --- a/src/EmbeddingSpMDM.cc +++ b/src/EmbeddingSpMDM.cc @@ -1018,9 +1018,9 @@ typename EmbeddingSpMDMKernelSignature:: Type GenerateEmbeddingSpMDMWithStrides( const int64_t block_size, - bool has_weight, + [[maybe_unused]] bool has_weight, bool normalize_by_lengths, - int prefetch, + [[maybe_unused]] int prefetch, bool is_weight_positional, bool use_offsets, int64_t output_stride /*=-1*/, diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc index da20d71ede..7e2fb37264 100644 --- a/src/QuantUtils.cc +++ b/src/QuantUtils.cc @@ -268,7 +268,7 @@ FBGEMM_SPECIALIZED_QUANTIZE(uint8_t, false) const TensorQuantizationParams& qparams, \ int thread_id, \ int num_threads, \ - float noise_ratio) { \ + [[maybe_unused]] float noise_ratio) { \ int64_t i_begin, i_end; \ fbgemmPartition1D(thread_id, num_threads, len, i_begin, i_end); \ for (int64_t i = i_begin; i < i_end; ++i) { \ From 18af2b2da8c4de0b662c2e4ee6820a5679c140da Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Mon, 27 Nov 2023 18:23:09 -0800 Subject: [PATCH 44/73] Add/modify LXU cache lookup ops for pipeline prefetching (#2154) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2154 This diff adds/updates LXU cache APIs for pipeline prefetching: - Update `lxu_cache_lookup` to allow for unique linear cache indices lookup and external output tensor to be passed to the op - Update `lxu_cache_locations_update` to support unique cache locations update - Add Python binding for `get_unique_indices` Reviewed By: levythu Differential Revision: D51532548 fbshipit-source-id: dcb55d06f2537d1a7a3d51d92da87f7898ba95de --- .../embedding_forward_quantized_host.cpp | 5 
+- .../split_embeddings_cache_cuda.cuh | 7 +- .../src/split_embeddings_cache/common.h | 4 +- .../src/split_embeddings_cache/lxu_cache.cpp | 14 +-- .../src/split_embeddings_cache/lxu_cache.cu | 93 +++++++++++++++---- .../reset_weight_momentum.cu | 5 +- .../split_embeddings_cache_ops.cpp | 6 +- .../split_embeddings_cache_ops.cu | 1 + 8 files changed, 103 insertions(+), 32 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp index 722201e4fb..11b514acc0 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp @@ -497,7 +497,10 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function( lxu_cache_state.value(), total_cache_hash_size.value(), gather_uvm_stats, - uvm_cache_stats); + uvm_cache_stats, + c10::optional(), // num_uniq_cache_indices + c10::optional() // lxu_cache_locations_output + ); #ifdef FBCODE_CAFFE2 if (FLAGS_tbe_uvm_cache_enforced_misses > 0) { diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh index 800f238f5e..57cd695d8f 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh @@ -174,7 +174,9 @@ at::Tensor lxu_cache_lookup_cuda( at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, - c10::optional uvm_cache_stats); + c10::optional uvm_cache_stats, + c10::optional num_uniq_cache_indices, + c10::optional lxu_cache_locations_output); at::Tensor emulate_cache_miss( at::Tensor lxu_cache_locations, @@ -240,4 +242,5 @@ void lxu_cache_locking_counter_decrement_cuda( /// and lxu_cache_locations_new[i] >= 0 void lxu_cache_locations_update_cuda( at::Tensor lxu_cache_locations, - at::Tensor lxu_cache_locations_new); + at::Tensor lxu_cache_locations_new, + c10::optional num_uniq_cache_indices); diff --git a/fbgemm_gpu/src/split_embeddings_cache/common.h b/fbgemm_gpu/src/split_embeddings_cache/common.h index 9c22f02e40..d43e4467c6 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/common.h +++ b/fbgemm_gpu/src/split_embeddings_cache/common.h @@ -89,7 +89,9 @@ Tensor lxu_cache_lookup_cpu( Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, - c10::optional uvm_cache_stats); + c10::optional uvm_cache_stats, + c10::optional num_uniq_cache_indices, + c10::optional lxu_cache_locations_output); Tensor direct_mapped_lxu_cache_lookup_cpu( Tensor linear_cache_indices, diff --git a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp index 490da1fb7d..296f966417 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp +++ b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp @@ -14,12 +14,14 @@ namespace fbgemm_gpu { DLL_PUBLIC Tensor lxu_cache_lookup_cpu( Tensor linear_cache_indices, - Tensor lxu_cache_state, - int64_t invalid_index, - bool gather_cache_stats, - c10::optional uvm_cache_stats) { - return empty_like( - linear_cache_indices, linear_cache_indices.options().dtype(at::kInt)); + Tensor /* lxu_cache_state */, + int64_t /* invalid_index */, + bool /* gather_cache_stats */, + c10::optional /* uvm_cache_stats */, + c10::optional /* num_uniq_cache_indices */, + c10::optional lxu_cache_locations_output) { + return lxu_cache_locations_output.value_or(empty_like( + linear_cache_indices, linear_cache_indices.options().dtype(at::kInt))); } DLL_PUBLIC Tensor 
direct_mapped_lxu_cache_lookup_cpu(
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu
index 6d43c259b5..3e39eb9e23 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu
@@ -254,9 +254,11 @@ __global__ __launch_bounds__(kMaxThreads) void lxu_cache_lookup_kernel(
         lxu_cache_locations,
     const bool gather_cache_stats,
     pta::PackedTensorAccessor32
-        uvm_cache_stats) {
+        uvm_cache_stats,
+    const int32_t* N_unique) {
   const int32_t C = lxu_cache_state.size(0);
-  const int32_t N = linear_cache_indices.size(0);
+  const int32_t N =
+      N_unique == nullptr ? linear_cache_indices.size(0) : *N_unique;
   const int32_t n0 =
       blockIdx.x * blockDim.y * blockDim.x + threadIdx.y * blockDim.x;
   if (n0 >= N) {
@@ -368,14 +370,56 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lxu_cache_lookup_kernel(
 }

 } // namespace

+/// Look up the cache location for each linear cache index in
+/// linear_cache_indices and return lxu_cache_locations
+///
+/// lxu_cache_locations A 1D tensor with the same length as
+///                     linear_cache_indices. It contains the cache locations
+///                     (the row indices in the cache) of the corresponding
+///                     indices in linear_cache_indices, i.e.,
+///                     lxu_cache_locations[i] is the cache location for
+///                     linear_cache_indices[i], where 0 <= i <
+///                     linear_cache_indices.numel().
+///
+/// @param linear_cache_indices       Linear cache indices tensor (1D)
+/// @param lxu_cache_state            LXU cache state tensor (2D tensor of
+///                                   shape (# of cache sets, # of cache
+///                                   slots per set)). It contains linear
+///                                   indices of rows that are in the
+///                                   corresponding cache slots. If the cache
+///                                   slot is empty, a sentinel value is
+///                                   stored.
+/// @param invalid_index              A sentinel value for linear cache
+///                                   indices. A cache index is skipped if it
+///                                   is a sentinel value.
+/// @param gather_cache_stats         A flag to enable/disable cache stats
+///                                   collection.
+/// @param uvm_cache_stats            A tensor for storing cache stats.
+/// @param num_uniq_cache_indices     An optional GPU tensor that contains the
+///                                   number of unique cache indices. If this
+///                                   tensor is passed, the kernel will only
+///                                   look up the first num_uniq_cache_indices
+///                                   indices instead of the entire
+///                                   linear_cache_indices.
+/// @param lxu_cache_locations_output An optional output tensor. If the
+///                                   tensor is passed, the operator will use
+///                                   it as the output tensor instead of
+///                                   allocating a new one.
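+///
+/// Example (illustrative sketch only; the tensor names are hypothetical):
+///   // Unique-indices path: look up only the first *num_uniq indices and
+///   // write the locations into a preallocated output tensor. Note that
+///   // gather_cache_stats must be false when num_uniq_cache_indices is set.
+///   auto locs = lxu_cache_lookup_cuda(
+///       uniq_indices, lxu_cache_state, /*invalid_index=*/-1,
+///       /*gather_cache_stats=*/false, /*uvm_cache_stats=*/c10::nullopt,
+///       num_uniq, preallocated_locs);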
 DLL_PUBLIC Tensor lxu_cache_lookup_cuda(
-    Tensor linear_cache_indices,
-    Tensor lxu_cache_state,
-    int64_t invalid_index,
-    bool gather_cache_stats,
-    c10::optional<Tensor> uvm_cache_stats) {
+    const Tensor linear_cache_indices,
+    const Tensor lxu_cache_state,
+    const int64_t invalid_index,
+    const bool gather_cache_stats,
+    const c10::optional<Tensor> uvm_cache_stats,
+    const c10::optional<Tensor> num_uniq_cache_indices,
+    const c10::optional<Tensor> lxu_cache_locations_output) {
+  const auto uniq_lookup = num_uniq_cache_indices.has_value();
+  // TODO: Support gather_cache_stats=true when uniq_lookup=true
+  TORCH_CHECK(
+      !uniq_lookup || !gather_cache_stats,
+      "Unique lxu_cache_locations generation does not support gather_cache_stats=true");
   TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
-      linear_cache_indices, lxu_cache_state);
+      linear_cache_indices, lxu_cache_state, num_uniq_cache_indices);
   Tensor uvm_cache_stats_ =
       at::empty({0}, linear_cache_indices.options().dtype(at::kInt));
   if (gather_cache_stats) {
@@ -386,9 +430,12 @@ DLL_PUBLIC Tensor lxu_cache_lookup_cuda(
   at::cuda::OptionalCUDAGuard device_guard;
   device_guard.set_index(linear_cache_indices.get_device());

+  const auto lxu_cache_locations =
+      lxu_cache_locations_output.value_or(empty_like(
+          linear_cache_indices,
+          linear_cache_indices.options().dtype(at::kInt)));
+
   const auto N = linear_cache_indices.numel();
-  auto lxu_cache_locations = empty_like(
-      linear_cache_indices, linear_cache_indices.options().dtype(at::kInt));
   if (linear_cache_indices.numel() == 0) {
     // nothing to do
     return lxu_cache_locations;
@@ -412,10 +459,12 @@ DLL_PUBLIC Tensor lxu_cache_lookup_cuda(
         invalid_index,
         MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32),
         gather_cache_stats,
-        MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats_, int32_t, 1, 32));
+        MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats_, int32_t, 1, 32),
+        num_uniq_cache_indices.has_value()
+            ? num_uniq_cache_indices.value().data_ptr<int32_t>()
+            : nullptr);
     C10_CUDA_KERNEL_LAUNCH_CHECK();
   });
-
   return lxu_cache_locations;
 }

@@ -479,11 +528,13 @@ __launch_bounds__(kMaxThreads) void lxu_cache_locations_update_kernel(
     pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
         lxu_cache_locations,
     const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
-        lxu_cache_locations_new) {
-  const int32_t N = lxu_cache_locations.size(0);
+        lxu_cache_locations_new,
+    const int32_t* N_unique) {
+  const auto N = N_unique == nullptr ?
lxu_cache_locations.size(0) : *N_unique;
   CUDA_KERNEL_LOOP(n, N) {
-    if (lxu_cache_locations[n] == kCacheLocationMissing &&
-        lxu_cache_locations_new[n] >= 0) {
+    if (N_unique != nullptr ||
+        (lxu_cache_locations[n] == kCacheLocationMissing &&
+         lxu_cache_locations_new[n] >= 0)) {
       lxu_cache_locations[n] = lxu_cache_locations_new[n];
     }
   }
@@ -493,9 +544,10 @@ __launch_bounds__(kMaxThreads) void lxu_cache_locations_update_kernel(

 DLL_PUBLIC void lxu_cache_locations_update_cuda(
     Tensor lxu_cache_locations,
-    Tensor lxu_cache_locations_new) {
+    Tensor lxu_cache_locations_new,
+    c10::optional<Tensor> num_uniq_cache_indices) {
   TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
-      lxu_cache_locations, lxu_cache_locations_new);
+      lxu_cache_locations, lxu_cache_locations_new, num_uniq_cache_indices);

   at::cuda::OptionalCUDAGuard device_guard;
   device_guard.set_index(lxu_cache_locations.get_device());
@@ -520,7 +572,10 @@ DLL_PUBLIC void lxu_cache_locations_update_cuda(
       0,
       at::cuda::getCurrentCUDAStream()>>>(
       MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32),
-      MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations_new, int32_t, 1, 32));
+      MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations_new, int32_t, 1, 32),
+      num_uniq_cache_indices.has_value()
+          ? num_uniq_cache_indices.value().data_ptr<int32_t>()
+          : nullptr);
   C10_CUDA_KERNEL_LAUNCH_CHECK();

   return;
diff --git a/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu b/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu
index 104bf140e1..321c225772 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu
@@ -276,7 +276,10 @@ DLL_PUBLIC void reset_weight_momentum_cuda(
       lxu_cache_state,
       total_cache_hash_size,
       false, // gather_cache_stats
-      uvm_cache_stats);
+      uvm_cache_stats,
+      c10::optional<Tensor>(), // num_uniq_cache_indices
+      c10::optional<Tensor>() // lxu_cache_locations_output
+  );
 }

 // Reset weight and momentum of pruned rows
diff --git a/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp b/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp
index 2565ed219c..816f0fdaf5 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp
+++ b/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp
@@ -26,7 +26,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def(
       "lfu_cache_populate_byte(Tensor weights, Tensor cache_hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) lxu_cache_weights, Tensor(c!) lfu_state, int row_alignment=16) -> ()");
   m.def(
-      "lxu_cache_lookup(Tensor linear_cache_indices, Tensor lxu_cache_state, int invalid_index = -1, bool gather_cache_stats=False, Tensor(a!)? uvm_cache_stats=None) -> Tensor");
+      "lxu_cache_lookup(Tensor linear_cache_indices, Tensor lxu_cache_state, int invalid_index = -1, bool gather_cache_stats=False, Tensor(a!)? uvm_cache_stats=None, Tensor? num_uniq_cache_indices=None, Tensor(b!)? lxu_cache_locations_output=None) -> Tensor");
   m.def(
       "direct_mapped_lxu_cache_lookup(Tensor linear_cache_indices, Tensor lxu_cache_state, int invalid_index = -1, bool gather_cache_stats=False, Tensor(a!)? uvm_cache_stats=None) -> Tensor");
   m.def(
@@ -37,7 +37,9 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def(
       "lxu_cache_locking_counter_decrement(Tensor(a!)
lxu_cache_locking_counter, Tensor lxu_cache_locations) -> ()"); m.def( - "lxu_cache_locations_update(Tensor(a!) lxu_cache_locations, Tensor lxu_cache_locations_new) -> ()"); + "lxu_cache_locations_update(Tensor(a!) lxu_cache_locations, Tensor lxu_cache_locations_new, Tensor? num_uniq_cache_indices=None) -> ()"); + m.def( + "get_unique_indices(Tensor linear_indices, int max_indices, bool compute_count) -> (Tensor, Tensor, Tensor?)"); } using namespace fbgemm_gpu; diff --git a/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cu b/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cu index 91b4024dc9..fdf5c2ccc7 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cu +++ b/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cu @@ -33,6 +33,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { lxu_cache_locking_counter_decrement_cuda); DISPATCH_TO_CUDA( "lxu_cache_locations_update", lxu_cache_locations_update_cuda); + DISPATCH_TO_CUDA("get_unique_indices", get_unique_indices_cuda); } } // namespace From ca1da75537d4a0954cb791d8df0c21abe2d014a1 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Mon, 27 Nov 2023 18:23:09 -0800 Subject: [PATCH 45/73] Add unit test for unique cache lookup (#2160) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2160 As title Differential Revision: D51600570 fbshipit-source-id: 3004dad5cfecd72c28067e6f9185bc131952bb01 --- .../split_table_batched_embeddings_test.py | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index f523773d25..7415e5c303 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -6543,6 +6543,164 @@ def check_weight_momentum(v: int) -> None: check_weight_momentum(0) + @unittest.skipIf(*gpu_unavailable) + @given( + T=st.integers(min_value=1, max_value=10), + D=st.integers(min_value=2, max_value=128), + B=st.integers(min_value=1, max_value=128), + log_E=st.integers(min_value=3, max_value=5), + L=st.integers(min_value=0, max_value=20), + ) + @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + def test_unique_lxu_cache_lookup( + self, + T: int, + D: int, + B: int, + log_E: int, + L: int, + ) -> None: + E = int(10**log_E) + + indices = to_device( + torch.randint(low=0, high=E, size=(T * L * B,)), + use_cpu=False, + ).long() + offsets = to_device( + torch.tensor([0] + list(accumulate([L] * (T * L)))), + use_cpu=False, + ).long() + + def unique_lookup( + indices: Tensor, + offsets: Tensor, + cache_hash_size_cumsum: Tensor, + total_cache_hash_size: int, + ) -> Tuple[Tensor, Tensor]: + linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices( + cache_hash_size_cumsum, + indices, + offsets, + ) + + uniq_indices, uniq_indices_length, _ = torch.ops.fbgemm.get_unique_indices( + linear_cache_indices, total_cache_hash_size, compute_count=False + ) + + uniq_lxu_cache_locations = torch.ops.fbgemm.lxu_cache_lookup( + uniq_indices, + lxu_cache_state, + total_cache_hash_size, + gather_cache_stats=False, + num_uniq_cache_indices=uniq_indices_length, + ) + + return uniq_lxu_cache_locations, uniq_indices_length + + def duplicate_lookup( + indices: Tensor, + offsets: Tensor, + cache_hash_size_cumsum: Tensor, + total_cache_hash_size: int, + ) -> Tensor: + linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices( + cache_hash_size_cumsum, + 
indices,
+                offsets,
+            )
+
+            lxu_cache_locations = torch.ops.fbgemm.lxu_cache_lookup(
+                linear_cache_indices,
+                lxu_cache_state,
+                total_cache_hash_size,
+                gather_cache_stats=False,
+            )
+            return lxu_cache_locations
+
+        cache_sets = int((E * T) * 0.2)
+        lxu_cache_state = torch.zeros(
+            cache_sets,
+            DEFAULT_ASSOC,
+            device="cuda",
+            dtype=torch.int64,
+        ).fill_(-1)
+
+        hash_sizes = torch.tensor([E] * T, dtype=torch.long, device="cuda")
+        cache_hash_size_cumsum = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            hash_sizes
+        )
+        total_cache_hash_size = cache_hash_size_cumsum[-1].item()
+
+        linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices(
+            cache_hash_size_cumsum,
+            indices,
+            offsets,
+        )
+
+        # Emulate cache population
+        uniq_indices_cpu = linear_cache_indices.unique().cpu()
+        index_cache_set_map = uniq_indices_cpu.clone()
+        index_cache_set_map.apply_(
+            lambda x: torch.ops.fbgemm.lxu_cache_slot(x, cache_sets)
+        )
+        index_cache_set_map = index_cache_set_map.tolist()
+        uniq_indices_cpu = uniq_indices_cpu.tolist()
+
+        slots = {}
+        for idx, c in zip(uniq_indices_cpu, index_cache_set_map):
+            if c not in slots:
+                slots[c] = 0
+            slot = slots[c]
+            if slot < DEFAULT_ASSOC:
+                lxu_cache_state[c][slot] = idx
+            slots[c] = slot + 1
+
+        # Run unique lookup
+        uniq_lookup_output, uniq_indices_length = unique_lookup(
+            indices, offsets, cache_hash_size_cumsum, total_cache_hash_size
+        )
+
+        # Run duplicate lookup
+        duplicate_lookup_output = duplicate_lookup(
+            indices, offsets, cache_hash_size_cumsum, total_cache_hash_size
+        )
+
+        # Start running validation
+
+        # Compute unique indices using PyTorch ops
+        sorted_linear_cache_indices, inverse_sorted_cache_indices = torch.sort(
+            linear_cache_indices
+        )
+        ref_uniq_cache_indices, cache_indices_counts = torch.unique_consecutive(
+            sorted_linear_cache_indices, return_inverse=False, return_counts=True
+        )
+
+        # Convert to lists
+        cache_indices_counts = cache_indices_counts.cpu().tolist()
+        uniq_lookup_output = uniq_lookup_output.cpu().tolist()
+
+        # Validate the number of unique cache indices
+        ref_num_uniq_indices = ref_uniq_cache_indices.numel()
+        assert ref_num_uniq_indices == uniq_indices_length.item()
+
+        # Expand
+        reshaped_uniq_lookup_output = uniq_lookup_output[:ref_num_uniq_indices]
+        sorted_lxu_cache_locations = to_device(
+            torch.tensor(
+                np.repeat(reshaped_uniq_lookup_output, cache_indices_counts),
+                dtype=duplicate_lookup_output.dtype,
+            ),
+            use_cpu=False,
+        )
+
+        _, cache_location_indices = torch.sort(inverse_sorted_cache_indices)
+
+        expanded_lxu_cache_locations = torch.index_select(
+            sorted_lxu_cache_locations, 0, cache_location_indices
+        )
+
+        assert torch.equal(expanded_lxu_cache_locations, duplicate_lookup_output)
+

 if __name__ == "__main__":
     unittest.main()

From 035ed1f66b30ce438baf088a3c9053c0824e108e Mon Sep 17 00:00:00 2001
From: Sarunya Pumma
Date: Mon, 27 Nov 2023 18:42:13 -0800
Subject: [PATCH 46/73] Use unique cache locations in backward for pipeline prefetching (#2151)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2151

When pipeline prefetching is enabled (`prefetch_pipeline=True`) for `EmbeddingLocation.MANAGED_CACHING`, TBE has to update `lxu_cache_locations` to ensure cache consistency. Prior to this diff, TBE performs the full cache lookup when updating `lxu_cache_locations` (i.e., looking up all indices although they are duplicated). The time complexity of such a lookup grows with the number of indices. Thus, the lookup can be expensive when the number of indices is large.
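To make the cost difference concrete, here is a minimal, self-contained sketch in plain PyTorch. The `lookup` function and the shapes are hypothetical stand-ins for illustration only; the real op is `lxu_cache_lookup` driven by `num_uniq_cache_indices`, as exercised by the unit test above.

import torch

def lookup(indices: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in for the cache lookup: pretend the cache
    # location of index i is simply i % 64. The only point is that the
    # cost of a lookup scales with indices.numel().
    return indices % 64

indices = torch.randint(0, 100, (1_000_000,))  # heavily duplicated

# Duplicate lookup: cost scales with the total number of indices.
locations_full = lookup(indices)

# Unique lookup: cost scales with the (much smaller) number of uniques.
uniq, inverse = torch.unique(indices, return_inverse=True)
uniq_locations = lookup(uniq)

# Expanding back reproduces the duplicate result. The diff below avoids
# even this expansion by letting the backward pass consume the unique
# locations directly.
assert torch.equal(uniq_locations[inverse], locations_full)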
This diff addresses this problem by looking up only the unique indices (whose count is normally much smaller than the total number of indices). The number of unique indices tends to stay more or less the same even as the total number of indices grows, so looking up only the unique indices effectively reduces the cost of the cache lookup. The unique `lxu_cache_locations` are fed directly to the TBE backward pass to consume; thus, there is no decompression cost.

Reviewed By: ehsanardestani, jianyuh

Differential Revision: D51339208

fbshipit-source-id: be117db4a4a516341a99210e4c4e32dd9526588c
---
 .../embedding_backward_split_grad_template.cu | 73 ++++++++++
 ...embedding_backward_split_host_template.cpp | 28 +++-
 ...ding_backward_split_kernel_cta_template.cu | 6 +-
 ...ing_backward_split_kernel_warp_template.cu | 6 +-
 .../embedding_backward_split_template.cu | 130 +++++++++++++++---
 ...optimizer_split_device_kernel_template.cuh | 4 +-
 ...bedding_optimizer_split_kernel_template.cu | 4 +-
 7 files changed, 222 insertions(+), 29 deletions(-)

diff --git a/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu
index 5367f292b7..356e8a0c79 100644
--- a/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu
+++ b/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu
@@ -57,6 +57,79 @@ split_embedding_backward_codegen_find_long_segments(
     }
 }

+template <typename info_pta_t, typename info_t, bool nobag>
+__global__ __launch_bounds__(kMaxThreads)
+void split_embedding_backward_count_unique_indices_kernel(
+    const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
+        sorted_linear_indices_num_runs,
+    const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
+        sorted_linear_indices_cumulative_run_lengths,
+    const pta::PackedTensorAccessor32<info_pta_t, 1, at::RestrictPtrTraits>
+        sorted_infos,
+    const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
+        weights_placements,
+    pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
+        dev_or_uvm_unique_indices,
+    const int info_B_num_bits
+) {
+  const int32_t num_runs = sorted_linear_indices_num_runs[0];
+  const auto T = weights_placements.size(0);
+  for (auto run_id = blockIdx.x * blockDim.x + threadIdx.x;
+       run_id < num_runs;
+       run_id += blockDim.x * gridDim.x) {
+    // Obtain the associated table id of the run id
+    const auto segment_start = sorted_linear_indices_cumulative_run_lengths[run_id];
+    const auto info = reinterpret_cast<const info_t*>(&sorted_infos[0])[segment_start];
+    const auto t = nobag ? (info % T) : (info >> info_B_num_bits);
+
+    int32_t t_next = -1;
+    const auto unique_count_offset = run_id + 1;
+    if (unique_count_offset < num_runs) {
+      const auto segment_start_next = sorted_linear_indices_cumulative_run_lengths[unique_count_offset];
+      const auto info_next = reinterpret_cast<const info_t*>(&sorted_infos[0])[segment_start_next];
+      t_next = nobag ?
(info_next % T) : (info_next >> info_B_num_bits); + } + + if (t != t_next) { + const auto placement = static_cast(weights_placements[t]); + if (placement != PlacementType::MANAGED_CACHING) { + // Record num unique indices for PlacementType::DEVICE from unique_count_offset + gpuAtomicAdd(&dev_or_uvm_unique_indices[t], unique_count_offset); + } + if (t_next != -1) { + const auto placement_next = static_cast(weights_placements[t_next]); + if (placement_next != PlacementType::MANAGED_CACHING) { + // Record num unique indices for PlacementType::DEVICE from unique_count_offset + gpuAtomicAdd(&dev_or_uvm_unique_indices[t_next], -unique_count_offset); + } + } + } + } +} + +{% for nobag in [True, False] %} +{% set info_pta_t = "int64_t" if nobag else "int32_t" %} +template __global__ __launch_bounds__(kMaxThreads) +void split_embedding_backward_count_unique_indices_kernel +< + {{ info_pta_t }}, + {{ "int64_t" if nobag else "uint32_t" }}, + {{ "true" if nobag else "false" }} +> ( + const pta::PackedTensorAccessor32 + sorted_linear_indices_num_runs, + const pta::PackedTensorAccessor32 + sorted_linear_indices_cumulative_run_lengths, + const pta::PackedTensorAccessor32<{{ info_pta_t }}, 1, at::RestrictPtrTraits> + sorted_infos, + const pta::PackedTensorAccessor32 + weights_placements, + pta::PackedTensorAccessor32 + dev_or_uvm_unique_indices, + const int info_B_num_bits +); +{% endfor %} + {% for vbe in [True, False] %} {% set vbe_desc = "_vbe" if vbe else "" %} template diff --git a/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp index 82b345365e..c5eb20a620 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp @@ -98,6 +98,8 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e const Tensor& vbe_row_output_offsets, const Tensor& vbe_b_t_map, {%- endif %} + const bool use_uniq_cache_locations, + const bool use_homogeneous_placements, {{ args.split_function_args | join(", ") }}); {%- endfor %} {#-/*for nobag*/#} @@ -177,6 +179,8 @@ class {{ autograd_func }} : const int64_t vbe_output_size, {%- endif %} const bool is_experimental, + const bool use_uniq_cache_locations_bwd, + const bool use_homogeneous_placements, {{ args.split_function_args | join(", ") }}) { const auto T = weights_offsets.sym_numel(); @@ -263,6 +267,8 @@ class {{ autograd_func }} : ctx->saved_data["info_B_num_bits"] = info_B_num_bits; const auto info_B_mask_int64 = static_cast(info_B_mask); ctx->saved_data["info_B_mask"] = info_B_mask_int64; + ctx->saved_data["use_uniq_cache_locations_bwd"] = use_uniq_cache_locations_bwd; + ctx->saved_data["use_homogeneous_placements"] = use_homogeneous_placements; {%- for (var, _) in args.saved_data %} ctx->saved_data["{{ var }}"] = {{ var }}; @@ -392,6 +398,10 @@ class {{ autograd_func }} : {%- endif %} {#-/* if optimizer != "none" */#} const int32_t info_B_num_bits = ctx->saved_data["info_B_num_bits"].toInt(); const int64_t info_B_mask_int64 = ctx->saved_data["info_B_mask"].toInt(); + const auto use_uniq_cache_locations_bwd = + ctx->saved_data["use_uniq_cache_locations_bwd"].toBool(); + const auto use_homogeneous_placements = + ctx->saved_data["use_homogeneous_placements"].toBool(); {%- for (var, ivalue_cast) in args.saved_data %} auto {{ var }} = ctx->saved_data["{{ var }}"].{{ ivalue_cast }}(); @@ -510,6 +520,8 @@ class {{ autograd_func }} : vbe_row_output_offsets, vbe_b_t_map, {%- endif %} + 
use_uniq_cache_locations_bwd, + use_homogeneous_placements, {{ args.split_function_arg_names | join(", ") }} ) {{ ":" if not weighted else ";" }} {%- endfor %} {#-/* for weighted in [False, True] */#} @@ -546,6 +558,8 @@ class {{ autograd_func }} : Variable(), // vbe_output_size {%- endif %} Variable(), // is_experimental + Variable(), // use_uniq_cache_locations_bwd + Variable(), // use_homogeneous_placements {{ args.split_variables | join(", ") }} }; {%- else %} @@ -585,6 +599,8 @@ class {{ autograd_func }} : vbe_row_output_offsets, vbe_b_t_map, {%- endif %} + use_uniq_cache_locations_bwd, + use_homogeneous_placements, {{ args.split_function_arg_names | join(", ") }} ); return { @@ -615,6 +631,8 @@ class {{ autograd_func }} : Variable(), // vbe_output_size {%- endif %} Variable(), // is_experimental + Variable(), // use_uniq_cache_locations_bwd + Variable(), // use_homogeneous_placements {{ args.split_variables | join(", ") }} }; {%- endif %} @@ -657,7 +675,9 @@ Tensor split_embedding_codegen_lookup_{{ optimizer }}_function( const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, - const bool is_experimental = false + const bool is_experimental = false, + const bool use_uniq_cache_locations_bwd = false, + const bool use_homogeneous_placements = false ) { {%- if has_gpu_support %} {%- for vbe in ([True, False] if has_vbe_support else [False]) %} @@ -721,6 +741,8 @@ Tensor split_embedding_codegen_lookup_{{ optimizer }}_function( vbe_output_size, {%- endif %} is_experimental, + use_uniq_cache_locations_bwd, + use_homogeneous_placements, {{ args.split_function_arg_names | join(", ") }})[0]; } {%- endfor %} {#-/* for nobag */#} @@ -767,7 +789,9 @@ TORCH_LIBRARY_FRAGMENT({{ lib_name }}, m) { " int max_B=-1, " " int max_B_feature_rank=-1, " " int vbe_output_size=-1, " - " bool is_experimental=False) -> Tensor"); + " bool is_experimental=False, " + " bool use_uniq_cache_locations_bwd=False, " + " bool use_homogeneous_placements=False) -> Tensor"); // We're playing a funny trick here: we're using the autograd // implementation of the operator at all the dispatch keys. This is OK // because autograd.Function works even in a context where there is diff --git a/fbgemm_gpu/codegen/embedding_backward_split_kernel_cta_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_kernel_cta_template.cu index ce0a5db17c..1cd572a33c 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_kernel_cta_template.cu +++ b/fbgemm_gpu/codegen/embedding_backward_split_kernel_cta_template.cu @@ -62,6 +62,8 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_ {%- endif %} {%- if not dense %} const pta::PackedTensorAccessor32 sorted_lxu_cache_locations, + const bool use_uniq_cache_locations, + const pta::PackedTensorAccessor32 table_unique_indices_offsets, {%- endif %} {%- if weighted %} const pta::PackedTensorAccessor32, 1, at::RestrictPtrTraits> sorted_indice_weights, @@ -381,10 +383,10 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_ stochastic_rounding, stochastic_rounding_philox_args, current_run_id, + use_uniq_cache_locations ? 
(current_run_id - table_unique_indices_offsets[t_0]) : segment_start, D, t_0, idx, - segment_start, shfl_sync_mask, 0, // shared_weight_offset {{ args.split_function_arg_names | join(", ") }}); @@ -462,6 +464,8 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_ {%- if not dense %} const pta::PackedTensorAccessor32 sorted_lxu_cache_locations, + const bool use_uniq_cache_locations, + const pta::PackedTensorAccessor32 table_unique_indices_offsets, {%- endif %} {%- if weighted %} const pta::PackedTensorAccessor32, 1, at::RestrictPtrTraits> sorted_indice_weights, diff --git a/fbgemm_gpu/codegen/embedding_backward_split_kernel_warp_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_kernel_warp_template.cu index ad81b31b1b..176f80bd9d 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_kernel_warp_template.cu +++ b/fbgemm_gpu/codegen/embedding_backward_split_kernel_warp_template.cu @@ -60,6 +60,8 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_ {%- endif %} {%- if not dense %} const pta::PackedTensorAccessor32 sorted_lxu_cache_locations, + const bool use_uniq_cache_locations, + const pta::PackedTensorAccessor32 table_unique_indices_offsets, {%- endif %} {%- if weighted %} const pta::PackedTensorAccessor32, 1, at::RestrictPtrTraits> sorted_indice_weights, @@ -222,10 +224,10 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_ stochastic_rounding, stochastic_rounding_philox_args, run_id, + use_uniq_cache_locations ? (run_id - table_unique_indices_offsets[t_0]) : segment_start, D, t_0, idx, - segment_start, shfl_sync_mask, threadIdx.y * kMaxVecsPerThread * kThreadGroupSize, // shared_weight_offset {{ args.split_function_arg_names | join(", ") }}); @@ -301,6 +303,8 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_ {%- endif %} {%- if not dense %} const pta::PackedTensorAccessor32 sorted_lxu_cache_locations, + const bool use_uniq_cache_locations, + const pta::PackedTensorAccessor32 table_unique_indices_offsets, {%- endif %} {%- if weighted %} const pta::PackedTensorAccessor32, 1, at::RestrictPtrTraits> sorted_indice_weights, diff --git a/fbgemm_gpu/codegen/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_template.cu index c621769bdc..1d928cd17a 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_backward_split_template.cu @@ -20,6 +20,7 @@ #include "fbgemm_gpu/embedding_backward_template_helpers.cuh" #include "fbgemm_gpu/fbgemm_tensor_accessor.h" #include "fbgemm_gpu/split_embeddings_utils.cuh" +#include "fbgemm_gpu/sparse_ops.h" using Tensor = at::Tensor; using namespace fbgemm_gpu; @@ -67,6 +68,8 @@ split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vdesc {%- endif %} {%- if not dense %} const pta::PackedTensorAccessor32 sorted_lxu_cache_locations, + const bool use_uniq_cache_locations, + const pta::PackedTensorAccessor32 table_unique_indices_offsets, {%- endif %} {%- if weighted %} const pta::PackedTensorAccessor32, 1, at::RestrictPtrTraits> sorted_indice_weights, @@ -139,6 +142,8 @@ split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vdesc {%- endif %} {%- if not dense %} const pta::PackedTensorAccessor32 sorted_lxu_cache_locations, + const bool use_uniq_cache_locations, + const pta::PackedTensorAccessor32 table_unique_indices_offsets, {%- endif %} {%- if weighted %} const pta::PackedTensorAccessor32, 1, at::RestrictPtrTraits> 
sorted_indice_weights, @@ -201,6 +206,21 @@ grad_mean{{ vdesc }}_kernel( {%- endif %} ); +template +__global__ __launch_bounds__(kMaxThreads) void +split_embedding_backward_count_unique_indices_kernel( + const pta::PackedTensorAccessor32 + sorted_linear_indices_num_runs, + const pta::PackedTensorAccessor32 + sorted_linear_indices_cumulative_run_lengths, + const pta::PackedTensorAccessor32 + sorted_infos, + const pta::PackedTensorAccessor32 + weights_placements, + pta::PackedTensorAccessor32 + dev_or_uvm_unique_indices, + const int info_B_num_bits +); //////////////////////////////////////////////////////////////////////////////// // Utility Macros @@ -332,6 +352,10 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e const Tensor& vbe_row_output_offsets, const Tensor& vbe_b_t_map, {%- endif %} + {%- if not is_index_select and not dense %} + const bool use_uniq_cache_locations, + const bool use_homogeneous_placements, + {%- endif %} {%- if is_index_select %} const Tensor& grad_offsets, const Tensor& total_L_offsets, @@ -511,34 +535,90 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e ); {%- if not dense %} - auto lxu_cache_locations_sorted = at::empty_like(lxu_cache_locations); + Tensor lxu_cache_locations_sorted = lxu_cache_locations; + Tensor table_unique_indices_offsets; if (lxu_cache_locations.size(0) > 0) { + if (use_uniq_cache_locations) { + if (!use_homogeneous_placements) { + // When use_uniq_cache_locations=true, lxu_cache_locations are unique + // and sorted in an ascending order based on the linear cache indices. + // Linear cache indices of tables that are not placed in cache are set + // to a sentinel value (i.e., the sum of hash sizes of all embedding + // tables). Since the sentinel value is larger than the max linear + // cache index value, the lxu_cache_locations can be sorted differently + // than the sorted_linear_indices. + // + // For this reason, the run ids of sorted and unique + // lxu_cache_locations can be different from those of the + // sorted_linear_indices. We need the following code to compute + // table_unique_indices_offsets which contains the differences between + // lxu_cache_locations run ids and sorted_linear_indices run ids. 
+ auto dev_or_uvm_unique_indices = at::zeros_like(weights_placements); + +#ifdef FBGEMM_GPU_MEMCHECK + const auto func_name = "split_embedding_backward_count_unique_indices_kernel"; +#endif + split_embedding_backward_count_unique_indices_kernel< + {{ "int64_t" if nobag else "int32_t" }}, + {{ "int64_t" if nobag else "uint32_t" }}, + {{ "true" if nobag else "false" }} + ><<< + div_round_up(total_unique_indices, kMaxThreads), + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream() + >>>( + MAKE_PTA_WITH_NAME( + func_name, sorted_linear_indices_num_runs, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, sorted_linear_indices_cumulative_run_lengths, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, infos_sorted, {{ "int64_t" if nobag else "int32_t" }}, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, weights_placements, int32_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, dev_or_uvm_unique_indices, int32_t, 1, 32), + info_B_num_bits + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + table_unique_indices_offsets = + fbgemm_gpu::asynchronous_complete_cumsum_gpu(dev_or_uvm_unique_indices).to(at::kInt); + } + } + else { + lxu_cache_locations_sorted = at::empty_like(lxu_cache_locations); size_t temp_storage_bytes = 0; AT_CUDA_CHECK(radix_sort_pairs( - nullptr, - temp_storage_bytes, - linear_indices.data_ptr(), - linear_indices_sorted.data_ptr(), - lxu_cache_locations.data_ptr(), - lxu_cache_locations_sorted.data_ptr(), - linear_indices.numel(), - 0, - total_hash_size_bits, - at::cuda::getCurrentCUDAStream())); + nullptr, + temp_storage_bytes, + linear_indices.data_ptr(), + linear_indices_sorted.data_ptr(), + lxu_cache_locations.data_ptr(), + lxu_cache_locations_sorted.data_ptr(), + linear_indices.numel(), + 0, + total_hash_size_bits, + at::cuda::getCurrentCUDAStream())); auto temp_storage = at::empty( {static_cast(temp_storage_bytes)}, indices.options().dtype(at::kByte)); AT_CUDA_CHECK(radix_sort_pairs( - temp_storage.data_ptr(), - temp_storage_bytes, - linear_indices.data_ptr(), - linear_indices_sorted.data_ptr(), - lxu_cache_locations.data_ptr(), - lxu_cache_locations_sorted.data_ptr(), - linear_indices.numel(), - 0, - total_hash_size_bits, - at::cuda::getCurrentCUDAStream())); + temp_storage.data_ptr(), + temp_storage_bytes, + linear_indices.data_ptr(), + linear_indices_sorted.data_ptr(), + lxu_cache_locations.data_ptr(), + lxu_cache_locations_sorted.data_ptr(), + linear_indices.numel(), + 0, + total_hash_size_bits, + at::cuda::getCurrentCUDAStream())); + } + } + + if (lxu_cache_locations.size(0) == 0 || !use_uniq_cache_locations || use_homogeneous_placements) { + table_unique_indices_offsets = at::zeros_like(weights_placements); } {%- endif %} @@ -788,6 +868,8 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e {%- endif %} {%- if not dense %} MAKE_PTA_WITH_NAME(func_name3, lxu_cache_locations_sorted, int32_t, 1, 32), + use_uniq_cache_locations, + MAKE_PTA_WITH_NAME(func_name3, table_unique_indices_offsets, int32_t, 1, 32), {%- endif %} {%- if weighted %} MAKE_PTA_ACC_WITH_NAME(func_name3, indice_weights_sorted, cache_t, 1, 32), @@ -896,6 +978,8 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e {%- endif %} {%- if not dense %} MAKE_PTA_WITH_NAME(func_name4, lxu_cache_locations_sorted, int32_t, 1, 32), + use_uniq_cache_locations, + MAKE_PTA_WITH_NAME(func_name4, table_unique_indices_offsets, int32_t, 1, 32), {%- endif %} {%- if weighted %} MAKE_PTA_ACC_WITH_NAME(func_name4, indice_weights_sorted, cache_t, 1, 32), @@ -1002,6 +1086,10 @@ 
TORCH_LIBRARY_FRAGMENT(fbgemm, m) { " Tensor vbe_row_output_offsets, " " Tensor vbe_b_t_map, " {%- endif %} + {%- if not is_index_select and not dense %} + " bool use_uniq_cache_locations, " + " bool use_homogeneous_placements, " + {%- endif %} " {{ args.split_function_schemas | join(", ") }}" ") -> Tensor"); DISPATCH_TO_CUDA( diff --git a/fbgemm_gpu/codegen/embedding_optimizer_split_device_kernel_template.cuh b/fbgemm_gpu/codegen/embedding_optimizer_split_device_kernel_template.cuh index f5d4b68c9d..5bdfa415ce 100644 --- a/fbgemm_gpu/codegen/embedding_optimizer_split_device_kernel_template.cuh +++ b/fbgemm_gpu/codegen/embedding_optimizer_split_device_kernel_template.cuh @@ -31,10 +31,10 @@ DEVICE_INLINE void split_{{ optimizer }}_table_update_kernel( const bool stochastic_rounding, const at::PhiloxCudaState& stochastic_rounding_philox_args, const uint32_t run_id, + const uint32_t cache_loc_run_id, const int32_t D, const int32_t t, const int64_t idx, - const int32_t segment_start, const uint32_t shfl_sync_mask, const int32_t shared_weight_offset, {{ args.split_ref_kernel_args | replace_pta_namespace() | join(",\n ") }} @@ -54,7 +54,7 @@ DEVICE_INLINE void split_{{ optimizer }}_table_update_kernel( weights = &uvm_weights[weights_offset + idx * D_emb]; } if (weights_placement == PlacementType::MANAGED_CACHING) { - const int32_t cache_idx = sorted_lxu_cache_locations[segment_start]; + const int32_t cache_idx = sorted_lxu_cache_locations[cache_loc_run_id]; if (cache_idx != kCacheLocationMissing) { cache_weights = &lxu_cache_weights[cache_idx][0]; } diff --git a/fbgemm_gpu/codegen/embedding_optimizer_split_kernel_template.cu b/fbgemm_gpu/codegen/embedding_optimizer_split_kernel_template.cu index 310a0c6100..73f3721cfb 100644 --- a/fbgemm_gpu/codegen/embedding_optimizer_split_kernel_template.cu +++ b/fbgemm_gpu/codegen/embedding_optimizer_split_kernel_template.cu @@ -71,11 +71,11 @@ void split_{{ optimizer }}_update_kernel( stochastic_rounding, stochastic_rounding_philox_args, run_id, + 0, // segment_start (not used right now because lxu_cache is not + // supported) D, 0, // t grad_dev_indices[run_id], // idx - 0, // segment_start (not used right now because lxu_cache is not - // supported) shfl_sync_mask, 0, // shared_weight_offset (not used because shared memory is not // needed as uint8_t is not supported) From e62a5e220b19b7b41e65d6a9b501adc32431a664 Mon Sep 17 00:00:00 2001 From: Shiyan Deng Date: Tue, 28 Nov 2023 11:42:51 -0800 Subject: [PATCH 47/73] use memcpy for cpu emb inplace update (#2166) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2166 Reviewed By: zyan0, sayitmemory Differential Revision: D51539017 fbshipit-source-id: 1aa17f4c2a486dc304852584b16e3e49e4f4f0fb --- .../embedding_inplace_ops/embedding_inplace_update_cpu.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp index 1738d61a68..80ed4db608 100644 --- a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp @@ -62,9 +62,7 @@ void embedding_inplace_update_cpu_kernel( const uint8_t* __restrict__ update_weight_row = &update_weights[update_weight_offset]; - for (const auto d : c10::irange(D_bytes)) { - weight_row[d] = update_weight_row[d]; - } + memcpy(weight_row, update_weight_row, D_bytes); } } From de731af65b4f04696e85c729e3282450b51b95fd Mon Sep 17 
00:00:00 2001 From: Scott Wolchok Date: Tue, 28 Nov 2023 12:10:51 -0800 Subject: [PATCH 48/73] Preparatory fixes & lint suppressions for c10::optional->std::optional (#2138) Summary: X-link: https://github.com/pytorch/executorch/pull/1226 Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2138 Landing non-PyTorch portions first; then the PyTorch portions of https://github.com/pytorch/pytorch/pull/101995 will land to Github. Reviewed By: malfet Differential Revision: D51355841 fbshipit-source-id: 4eed885733189f21e342613431b637de72979cb4 --- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 8291927f45..be32cc2c4d 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -128,7 +128,7 @@ Tensor pack_segments_autograd( Tensor native_empty_like(const Tensor& self) { return at::native::empty_like( self, - optTypeMetaToScalarType(self.options().dtype_opt()), + c10::optTypeMetaToScalarType(self.options().dtype_opt()), self.options().layout_opt(), self.options().device_opt(), self.options().pinned_memory_opt(), From 49e75361e4bb12471bcfdf9796f1a20a72d3c882 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 28 Nov 2023 13:59:12 -0800 Subject: [PATCH 49/73] Refactor embedding_bounds_check (#2165) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2165 - Migrate embedding_bounds_check out of codegen and into split_embeddings_utils Reviewed By: sryap Differential Revision: D51607319 fbshipit-source-id: 55051c08b8041d8d3bba65caa0b991e6be779e9f --- fbgemm_gpu/CMakeLists.txt | 6 +++--- fbgemm_gpu/docs/Doxyfile.in | 4 ++-- .../split_embeddings_utils}/embedding_bounds_check.cu | 3 +-- .../split_embeddings_utils/embedding_bounds_check_cpu.cpp} | 0 .../split_embeddings_utils/embedding_bounds_check_gpu.cpp} | 2 -- 5 files changed, 6 insertions(+), 9 deletions(-) rename fbgemm_gpu/{codegen => src/split_embeddings_utils}/embedding_bounds_check.cu (99%) rename fbgemm_gpu/{codegen/embedding_bounds_check_host_cpu.cpp => src/split_embeddings_utils/embedding_bounds_check_cpu.cpp} (100%) rename fbgemm_gpu/{codegen/embedding_bounds_check_host.cpp => src/split_embeddings_utils/embedding_bounds_check_gpu.cpp} (96%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index cac90f07cf..8af7862213 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -560,7 +560,6 @@ set(fbgemm_gpu_sources_static_cpu codegen/embedding_forward_split_cpu.cpp codegen/embedding_forward_quantized_host_cpu.cpp codegen/embedding_backward_dense_host_cpu.cpp - codegen/embedding_bounds_check_host_cpu.cpp src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp @@ -580,13 +579,13 @@ set(fbgemm_gpu_sources_static_cpu src/split_embeddings_cache/lru_cache_populate_byte.cpp src/split_embeddings_cache/lxu_cache.cpp src/split_embeddings_cache/split_embeddings_cache_ops.cpp + src/split_embeddings_utils/embedding_bounds_check_cpu.cpp codegen/batch_index_select_dim0_cpu_host.cpp) if(NOT FBGEMM_CPU_ONLY) list(APPEND fbgemm_gpu_sources_static_cpu codegen/embedding_forward_quantized_host.cpp codegen/embedding_backward_dense_host.cpp - codegen/embedding_bounds_check_host.cpp src/memory_utils/memory_utils.cpp 
src/memory_utils/memory_utils_ops.cpp src/layout_transform_ops/layout_transform_ops_gpu.cpp @@ -598,6 +597,7 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/split_embeddings_cache_ops.cu src/metric_ops/metric_ops_host.cpp src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp + src/split_embeddings_utils/embedding_bounds_check_gpu.cpp src/input_combine_ops/input_combine_gpu.cpp codegen/batch_index_select_dim0_host.cpp) @@ -627,7 +627,6 @@ endif() if(NOT FBGEMM_CPU_ONLY) set(fbgemm_gpu_sources_static_gpu - codegen/embedding_bounds_check.cu codegen/embedding_forward_quantized_split_lookup.cu src/memory_utils/memory_utils.cu src/memory_utils/memory_utils_ops.cu @@ -691,6 +690,7 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/lxu_cache.cu src/split_embeddings_cache/linearize_cache_indices.cu src/split_embeddings_cache/reset_weight_momentum.cu + src/split_embeddings_utils/embedding_bounds_check.cu src/split_embeddings_utils/generate_vbe_metadata.cu src/split_embeddings_utils/get_infos_metadata.cu src/split_embeddings_utils/radix_sort_pairs.cu diff --git a/fbgemm_gpu/docs/Doxyfile.in b/fbgemm_gpu/docs/Doxyfile.in index 447ac5f62d..3e5c8c20c8 100644 --- a/fbgemm_gpu/docs/Doxyfile.in +++ b/fbgemm_gpu/docs/Doxyfile.in @@ -932,10 +932,10 @@ INPUT = "../include/fbgemm_gpu" \ "../codegen/embedding_backward_split_host_cpu_template.cpp" \ "../codegen/embedding_forward_quantized_host.cpp" \ "../codegen/embedding_forward_quantized_host_cpu.cpp" \ - "../codegen/embedding_bounds_check_host.cpp" \ - "../codegen/embedding_bounds_check_host_cpu.cpp" \ "../codegen/embedding_backward_dense_host.cpp" \ "../codegen/embedding_forward_template_helpers.cuh " \ + "../src/split_embeddings_utils/embedding_bounds_check_cpu.cpp" \ + "../src/split_embeddings_utils/embedding_bounds_check_gpu.cpp" \ "../src/permute_pooled_embedding_ops_gpu.cpp" \ # This tag can be used to specify the character encoding of the source files diff --git a/fbgemm_gpu/codegen/embedding_bounds_check.cu b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu similarity index 99% rename from fbgemm_gpu/codegen/embedding_bounds_check.cu rename to fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu index a93be2ba6c..700d8974d8 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check.cu +++ b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu @@ -6,10 +6,9 @@ * LICENSE file in the root directory of this source tree. 
*/

-#include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
-
 #include
 #include
+#include "fbgemm_gpu/embedding_backward_template_helpers.cuh"

 using Tensor = at::Tensor;
 using namespace fbgemm_gpu;
diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_cpu.cpp
similarity index 100%
rename from fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp
rename to fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_cpu.cpp
diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp
similarity index 96%
rename from fbgemm_gpu/codegen/embedding_bounds_check_host.cpp
rename to fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp
index afa52729b7..77f68eb1df 100644
--- a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp
+++ b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp
@@ -9,9 +9,7 @@
 #include
 #include
 #include
-#include
 #include
-
 #include "fbgemm_gpu/sparse_ops_utils.h"

 using Tensor = at::Tensor;

From 91a600a266bee7ca253f9ad5500b941b39cdba66 Mon Sep 17 00:00:00 2001
From: Sarunya Pumma
Date: Tue, 28 Nov 2023 16:19:34 -0800
Subject: [PATCH 50/73] Add warmup_runs to TBE benchmarks and run at least 1 warmup iter (#2163)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2163

We observe an extremely long startup time before the first `cudaLaunchKernel` after switching to CUDA 12. This causes the performance of TBE to degrade if `warmup_runs=0`. This diff modifies `benchmark_requests`, which is used extensively in TBE benchmarks, to run at least one warmup iteration even when `warmup_runs=0` so that the first kernel time is excluded from the profiling result. Moreover, we add `--warmup-runs` to every TBE benchmark to allow users to increase the number of warmup iterations.
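For reference, the pattern boils down to the following sketch in plain PyTorch. The `benchmark` helper and its arguments are illustrative only, not the actual `benchmark_requests` signature.

import time
import torch

def benchmark(fn, num_warmups: int = 0, iters: int = 100) -> float:
    # Always run at least one warmup iteration, in the spirit of the
    # benchmark_requests change in this diff, so that one-time costs
    # (e.g., the slow first cudaLaunchKernel seen with CUDA 12) stay out
    # of the timing.
    for _ in range(max(num_warmups, 1)):
        fn()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        fn()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters

# Example: time a small matmul.
x = torch.rand(256, 256)
avg_s = benchmark(lambda: x @ x)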
Reviewed By: q10 Differential Revision: D51603915 fbshipit-source-id: 0f8ea9c089d51735a6c58417b1d8872732327f34 --- fbgemm_gpu/bench/bench_utils.py | 4 ++ ...plit_table_batched_embeddings_benchmark.py | 53 +++++++++++++++++-- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/fbgemm_gpu/bench/bench_utils.py b/fbgemm_gpu/bench/bench_utils.py index be5b06287d..540b6e01ca 100644 --- a/fbgemm_gpu/bench/bench_utils.py +++ b/fbgemm_gpu/bench/bench_utils.py @@ -156,6 +156,10 @@ def benchmark_requests( ) -> float: times = [] + # Run at least one warmup iteration to avoid the long cudaLaunchKernel time + # for the first kernel + num_warmups = num_warmups + 1 if num_warmups >= 0 else 1 + if num_warmups > 0: indices, offsets, weights = requests[0] for _ in range(num_warmups): diff --git a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py index df57b397a6..143b8a0e3d 100644 --- a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py @@ -297,6 +297,7 @@ def device( # noqa C901 flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, bwd_only=True, grad=grad_output, + num_warmups=warmup_runs, ) logging.info( f"Backward, B: {B}, E: {E}, T: {T}, D: {D}, L: {L}, " @@ -313,6 +314,7 @@ def device( # noqa C901 @click.option("--weights-precision", type=SparseType, default=SparseType.FP32) @click.option("--stoc", is_flag=True, default=False) @click.option("--iters", default=100) +@click.option("--warmup-runs", default=0) @click.option("--mixed", is_flag=True, default=False) @click.option("--num-embeddings", default=int(1e5)) @click.option("--num-tables", default=32) @@ -336,6 +338,7 @@ def uvm( weights_precision: SparseType, stoc: bool, iters: int, + warmup_runs: int, mixed: bool, num_embeddings: int, num_tables: int, @@ -486,6 +489,7 @@ def uvm( per_sample_weights, ), flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, + num_warmups=warmup_runs, ) logging.info( f"UVM Forward, B: {B}, " @@ -521,6 +525,7 @@ def uvm( per_sample_weights, ), flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, + num_warmups=warmup_runs, ) read_write_bytes_hbm = ( output_size_multiplier * B * sum(Ds[T_uvm:]) @@ -541,6 +546,7 @@ def uvm( per_sample_weights, ), flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, + num_warmups=warmup_runs, ) read_write_bytes_total = read_write_bytes_uvm + read_write_bytes_hbm logging.info( @@ -562,6 +568,7 @@ def uvm( @click.option("--stoc", is_flag=True, default=False) @click.option("--long-index", is_flag=True, default=False) @click.option("--iters", default=100) +@click.option("--warmup-runs", default=0) @click.option("--mixed", is_flag=True, default=False) @click.option("--num-embeddings", default=int(1e5)) @click.option("--num-tables", default=32) @@ -580,6 +587,7 @@ def cache( # noqa C901 weights_precision: SparseType, stoc: bool, iters: int, + warmup_runs: int, long_index: bool, mixed: bool, num_embeddings: int, @@ -678,6 +686,7 @@ def cache( # noqa C901 indices.long(), offsets.long(), per_sample_weights ).backward(grad_output), flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, + num_warmups=warmup_runs, ) logging.info( f"ForwardBackward (UVM), B: {B}, E: {E}, T: {T}, D: {D}, L: {L}, " @@ -713,6 +722,7 @@ def cache( # noqa C901 emb.reset_cache_states() for indices, offsets, _ in warmup_requests: emb.forward(indices, offsets) + # TODO: Add warmup_runs prefetch_time, forward_backward_time = benchmark_pipelined_requests( requests, lambda indices, offsets, 
indices_weights: emb.prefetch(indices, offsets), @@ -738,9 +748,14 @@ def cache( # noqa C901 def benchmark_cpu_requests( requests: List[Tuple[torch.IntTensor, torch.IntTensor, Optional[torch.Tensor]]], func: Callable[[Tensor, Tensor, Optional[Tensor]], Tensor], + num_warmups: int = 0, ) -> float: import time + if num_warmups > 0: + for _ in range(num_warmups): + func(*requests[0]) + start_time = time.perf_counter() for indices, offsets, weights in requests: func(indices, offsets, weights) @@ -756,6 +771,7 @@ def benchmark_cpu_requests( @click.option("--weights-precision", type=SparseType, default=SparseType.INT4) @click.option("--stoc", is_flag=True, default=False) @click.option("--iters", default=100) +@click.option("--warmup-runs", default=0) @click.option("--managed", default="device") @click.option("--mixed", is_flag=True, default=False) @click.option("--num-embeddings", default=int(1e5)) @@ -777,6 +793,7 @@ def nbit_cpu( # noqa C901 weights_precision: SparseType, stoc: bool, iters: int, + warmup_runs: int, managed: str, mixed: bool, num_embeddings: int, @@ -860,6 +877,7 @@ def nbit_cpu( # noqa C901 offsets, per_sample_weights, ), + num_warmups=warmup_runs, ) logging.info( @@ -1425,6 +1443,7 @@ def nbit_device_with_spec( # noqa C901 @click.option("--embedding-dim", default=128) @click.option("--weights-precision", type=SparseType, default=SparseType.INT4) @click.option("--iters", default=100) +@click.option("--warmup-runs", default=0) @click.option("--mixed", is_flag=True, default=False) @click.option("--num-embeddings", default=int(1e5)) @click.option("--num-tables", default=32) @@ -1448,6 +1467,7 @@ def nbit_uvm( embedding_dim: int, weights_precision: SparseType, iters: int, + warmup_runs: int, mixed: bool, num_embeddings: int, num_tables: int, @@ -1613,6 +1633,7 @@ def nbit_uvm( per_sample_weights, ), flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, + num_warmups=warmup_runs, ) logging.info( f"UVM NBit Forward, {weights_precision}, B: {B}, " @@ -1670,6 +1691,7 @@ def nbit_uvm( per_sample_weights, ), flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, + num_warmups=warmup_runs, ) read_write_bytes_total = read_write_bytes_uvm + read_write_bytes_hbm logging.info( @@ -1683,6 +1705,7 @@ def nbit_uvm( emb_mixed.reset_cache_states() for indices, offsets, _ in requests: emb_mixed.forward(indices, offsets) + # TODO: Add warmup runs prefetch_time, forward_time = benchmark_pipelined_requests( requests, lambda indices, offsets, indices_weights: emb_mixed.prefetch( @@ -1717,7 +1740,7 @@ def nbit_uvm( @click.option("--embedding-dim", default=128) @click.option("--weights-precision", type=SparseType, default=SparseType.INT4) @click.option("--iters", default=100) -@click.option("--warmup", default=10) +@click.option("--warmup_runs", default=10) @click.option("--mixed", is_flag=True, default=False) @click.option("--num-embeddings", default=int(1e5)) @click.option("--num-tables", default=32) @@ -1744,7 +1767,7 @@ def nbit_uvm_compare_direct_mapped( embedding_dim: int, weights_precision: SparseType, iters: int, - warmup: int, + warmup_runs: int, mixed: bool, num_embeddings: int, num_tables: int, @@ -1866,7 +1889,7 @@ def bench_uvm_cls( per_sample_weights, ), flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, - num_warmups=warmup, + num_warmups=warmup_runs, nvtx_range=nvtx_range, callback_after_warmup=callback_after_warmup, ) @@ -1949,6 +1972,7 @@ def bench_uvm_cls( @click.option("--embedding-dim", default=128) @click.option("--weights-precision", type=SparseType, default=SparseType.INT4) 
@click.option("--iters", default=100) +@click.option("--warmup-runs", default=0) @click.option("--mixed", is_flag=True, default=False) @click.option("--num-embeddings", default=int(1e5)) @click.option("--num-tables", default=32) @@ -1972,6 +1996,7 @@ def nbit_cache( # noqa C901 embedding_dim: int, weights_precision: SparseType, iters: int, + warmup_runs: int, mixed: bool, num_embeddings: int, num_tables: int, @@ -2078,6 +2103,7 @@ def nbit_cache( # noqa C901 indices.int(), offsets.int(), per_sample_weights ), flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, + num_warmups=warmup_runs, ) logging.info( f"Forward (UVM) {weights_precision}, B: {B}, E: {E}, T: {T}, D: {D}, L: {L}, " @@ -2156,6 +2182,7 @@ def nbit_cache( # noqa C901 torch.cuda.cudart().cudaProfilerStart() torch.cuda.nvtx.range_push("pipeline") + # TODO: Add warmup_runs prefetch_time, forward_time = benchmark_pipelined_requests( requests, lambda indices, offsets, indices_weights: emb.prefetch( @@ -2189,6 +2216,7 @@ def nbit_cache( # noqa C901 @click.option("--bag-size", default=20) @click.option("--batch-size", default=2048) @click.option("--iters", default=10) +@click.option("--warmup-runs", default=0) @click.option("--num-embeddings", default=int(1e5)) @click.option("--num-tables", default=100) @click.option("--pruning-hash-load-factor", default=0.75) @@ -2200,6 +2228,7 @@ def hashtable( # noqa C901 bag_size: int, batch_size: int, iters: int, + warmup_runs: int, num_embeddings: int, num_tables: int, pruning_hash_load_factor: float, @@ -2278,6 +2307,7 @@ def hashtable( # noqa C901 lambda indices, offsets, _: torch.ops.fbgemm.pruned_hashmap_lookup( indices, offsets, hash_table, hash_table_offsets ), + num_warmups=warmup_runs, ) logging.info( @@ -2292,6 +2322,7 @@ def hashtable( # noqa C901 time_per_iter = benchmark_requests( requests, lambda indices, offsets, _: ht.lookup(indices, offsets), + num_warmups=warmup_runs, ) logging.info( @@ -2304,6 +2335,7 @@ def hashtable( # noqa C901 @click.option("--bag-size", default=20) @click.option("--batch-size", default=2048) @click.option("--iters", default=100) +@click.option("--warmup-runs", default=0) @click.option("--num-embeddings", default=int(1e5)) @click.option("--num-tables", default=100) @click.option("--pruning-ratio", default=0.9) @@ -2314,6 +2346,7 @@ def pruned_array( # noqa C901 bag_size: int, batch_size: int, iters: int, + warmup_runs: int, num_embeddings: int, num_tables: int, pruning_ratio: float, @@ -2362,6 +2395,7 @@ def pruned_array( # noqa C901 index_remappings, index_remappings_offsets, ), + num_warmups=warmup_runs, ) logging.info( @@ -2374,6 +2408,7 @@ def pruned_array( # noqa C901 @click.option("--bag-size", default=20) @click.option("--batch-size", default=512) @click.option("--iters", default=100) +@click.option("--warmup-runs", default=0) @click.option("--num-embeddings", default=int(1e5)) @click.option("--num-tables", default=32) @click.option("--bounds-check-mode", type=int, default=BoundsCheckMode.WARNING.value) @@ -2383,6 +2418,7 @@ def bounds_check_indices( # noqa C901 bag_size: int, batch_size: int, iters: int, + warmup_runs: int, num_embeddings: int, num_tables: int, bounds_check_mode: int, @@ -2414,11 +2450,12 @@ def bounds_check_indices( # noqa C901 requests, lambda indices, offsets, _: torch.ops.fbgemm.bounds_check_indices( rows_per_table, - indices, - offsets, + indices.long(), + offsets.long(), BoundsCheckMode(bounds_check_mode), warning, ), + num_warmups=warmup_runs, ) logging.info( @@ -2437,6 +2474,7 @@ def bounds_check_indices( # noqa C901 
@click.option("--weights-precision", type=SparseType, default=SparseType.INT4) @click.option("--output-dtype", type=SparseType, default=SparseType.FP16) @click.option("--iters", type=int, default=100) +@click.option("--warmup-runs", default=0) @click.option("--fp8-exponent-bits", type=int, default=None) @click.option("--fp8-exponent-bias", type=int, default=None) def emb_inplace_update( # noqa C901 @@ -2447,6 +2485,7 @@ def emb_inplace_update( # noqa C901 weights_precision: SparseType, output_dtype: SparseType, iters: int, + warmup_runs: int, fp8_exponent_bits: Optional[int], fp8_exponent_bias: Optional[int], ) -> None: @@ -2548,6 +2587,7 @@ def emb_inplace_update( # noqa C901 op.embedding_inplace_update_internal, (update_table_idx, update_row_idx, update_weights), iters=iters, + num_warmups=warmup_runs, ) logging.info( @@ -2601,6 +2641,7 @@ def emb_inplace_update( # noqa C901 16, # row_alignment ), iters=iters, + num_warmups=warmup_runs, ) logging.info( @@ -2865,6 +2906,7 @@ def device_with_spec( # noqa C901 flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, bwd_only=True, grad=grad_output, + num_warmups=warmup_runs, ) logging.info( f"Backward, B: {B}, Es: {Es}, T: {T}, Ds: {Ds}, Ls: {Ls_str}, " @@ -2896,6 +2938,7 @@ def vbe( compressed_tables: int, iters: int, ) -> None: + # TODO: Add warmup_runs torch.manual_seed(42) B = batch_size cB = compressed_batch_size From cb7357af26f5b2e897fc39ec0396167f2149fa25 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 28 Nov 2023 18:14:32 -0800 Subject: [PATCH 51/73] Stop using excess memory in generate_opcheck_tests, re-enable fbgemm TBE tests Summary: X-link: https://github.com/pytorch/pytorch/pull/114641 1. We stop using excess memory in generate_opcheck_tests. This is safe because all the individual test utils already ensure that they do not modify the inputs. 2. We re-enable the fbgemm TBE tests (see internal diff, but all of this is open source). They were previously removed because they OOM'ed when run serially; (1) and (3) cut down the memory usage to ~20gb peak. 3. I needed to skip some newly failing generated tests and also some that had an impact on the memory usage. 
bypass-github-export-checks Reviewed By: sryap, williamwen42 Differential Revision: D51601964 fbshipit-source-id: e3fa9be771ea87b5fcd59b9ea3935506642ad75a --- fbgemm_gpu/test/failures_dict_fast.json | 165 ++++++++++++++---- .../split_table_batched_embeddings_test.py | 121 +++++++------ 2 files changed, 200 insertions(+), 86 deletions(-) diff --git a/fbgemm_gpu/test/failures_dict_fast.json b/fbgemm_gpu/test/failures_dict_fast.json index a691ff1ad3..cec2a1adc7 100644 --- a/fbgemm_gpu/test/failures_dict_fast.json +++ b/fbgemm_gpu/test/failures_dict_fast.json @@ -238,6 +238,12 @@ "status": "xfail" } }, + "fbgemm::get_unique_indices": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_unique_lxu_cache_lookup": { + "comment": "", + "status": "xfail" + } + }, "fbgemm::int_nbit_split_embedding_codegen_lookup_function": { "SplitTableBatchedEmbeddingsTest.test_faketensor__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": { "comment": "", @@ -277,15 +283,19 @@ "fbgemm::lfu_cache_populate": { "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { "comment": "", - "status": "xfail" + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": { + "comment": "", + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_schema__test_cache_pipeline": { "comment": "", - "status": "xfail" + "status": "skip" } }, "fbgemm::lfu_cache_populate_byte": { @@ -297,19 +307,27 @@ "fbgemm::linearize_cache_indices": { "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { "comment": "", - "status": "xfail" + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { + "comment": "", + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { "comment": "", - "status": "xfail" + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { + "comment": "", + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { "comment": "", @@ -366,6 +384,10 @@ "SplitTableBatchedEmbeddingsTest.test_faketensor__test_stb_uvm_cache_stats": { "comment": "", "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_unique_lxu_cache_lookup": { + "comment": "", + "status": "xfail" } }, "fbgemm::linearize_cache_indices_from_row_idx": { @@ -377,67 +399,75 @@ "fbgemm::lru_cache_populate": { "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { "comment": "", - "status": "xfail" + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { + "comment": "", + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { "comment": "", - "status": "xfail" + "status": "skip" + }, + 
"SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { + "comment": "", + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_miss_counter": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_stb_uvm_cache_stats": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_schema__test_cache_pipeline": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_schema__test_cache_prefetch_pipeline": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_schema__test_cache_prefetch_pipeline_stream_1": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_schema__test_cache_prefetch_pipeline_stream_2": { "comment": "", - "status": "xfail" + "status": "skip" } }, "fbgemm::lru_cache_populate_byte": { @@ -467,10 +497,18 @@ "comment": "", "status": "xfail" }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { + "comment": "", + "status": "xfail" + }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { "comment": "", "status": "xfail" }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { + "comment": "", + "status": "xfail" + }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { "comment": "", "status": "xfail" @@ -515,10 +553,18 @@ "comment": "", "status": "xfail" }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { + "comment": "", + "status": "xfail" + }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { "comment": "", "status": "xfail" }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { + "comment": "", + "status": "xfail" + }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { "comment": "", "status": "xfail" @@ -574,6 +620,10 @@ "SplitTableBatchedEmbeddingsTest.test_faketensor__test_stb_uvm_cache_stats": { "comment": "", "status": "xfail" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_unique_lxu_cache_lookup": { + "comment": "", + "status": "xfail" } }, "fbgemm::new_managed_tensor": {}, @@ -648,11 +698,11 @@ "fbgemm::reset_weight_momentum": { 
"SplitTableBatchedEmbeddingsTest.test_faketensor__test_reset_embedding_weight_momentum": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_schema__test_reset_embedding_weight_momentum": { "comment": "", - "status": "xfail" + "status": "skip" } }, "fbgemm::split_embedding_codegen_lookup_adagrad_function": {}, @@ -674,45 +724,90 @@ "fbgemm::split_embedding_codegen_lookup_partial_rowwise_adam_function": {}, "fbgemm::split_embedding_codegen_lookup_partial_rowwise_lamb_function": {}, "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { + "comment": "", + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { + "comment": "", + "status": "skip" + }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { "comment": "", - "status": "xfail" + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { + "comment": "", + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { + "comment": "", + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": { "comment": "", - "status": "xfail" + "status": "skip" + } + }, + "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function_cpu": { + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { + "comment": "", + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { + "comment": "", + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { + "comment": "", + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { + "comment": "", + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { + "comment": "", + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { + "comment": "", + "status": "skip" + }, + "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": { + "comment": "", + "status": "skip" } }, - "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function_cpu": {}, "fbgemm::split_embedding_codegen_lookup_rowwise_weighted_adagrad_function": {}, "fbgemm::split_embedding_codegen_lookup_sgd_function": { "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": { "comment": "", - "status": "xfail" + "status": "skip" }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": 
{ "comment": "", - "status": "xfail" + "status": "skip" } }, "fbgemm::split_embedding_codegen_lookup_sgd_function_cpu": { diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index 7415e5c303..ce8e41e630 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -12,6 +12,7 @@ import pickle import random import unittest + from itertools import accumulate from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -73,12 +74,19 @@ if open_source: # pyre-ignore[21] - from test_utils import gpu_available, gpu_unavailable, gradcheck, TEST_WITH_ROCM + from test_utils import ( + gpu_available, + gpu_unavailable, + gradcheck, + optests, + TEST_WITH_ROCM, + ) else: from fbgemm_gpu.test.test_utils import ( gpu_available, gpu_unavailable, gradcheck, + optests, TEST_WITH_ROCM, ) @@ -89,6 +97,9 @@ MAX_EXAMPLES_LONG_RUNNING = 15 +VERBOSITY: Verbosity = Verbosity.verbose + + settings.register_profile("derandomize", derandomize=True) settings.load_profile("derandomize") @@ -160,18 +171,22 @@ def format_ref_tensors_in_mixed_B_layout( "test_autograd_registration__test_backward_none_with_rowwise_adagrad": [ unittest.skip("Cannot access data pointer of Tensor that doesn't have storage") ], - "test_faketensor__test_cache_prefetch_pipeline_stream_2": [unittest.skip("OOM")], - "test_faketensor__test_cache_prefetch_pipeline": [unittest.skip("OOM")], - "test_faketensor__test_cache_prefetch_pipeline_stream_1": [ - unittest.skip("IMA on exit") + "test_faketensor__test_nbit_forward_uvm_cache": [ + unittest.skip("CUDA Assert"), + ], + "test_faketensor__test_nbit_uvm_cache_stats": [ + unittest.skip("very slow"), ], - "test_faketensor__test_cache_pipeline": [ - unittest.skip("OOM when run serially"), + "test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": [ + unittest.skip("very slow"), ], } +@optests.generate_opcheck_tests(fast=True, additional_decorators=additional_decorators) class SplitTableBatchedEmbeddingsTest(unittest.TestCase): + _do_cuda_memory_leak_check = True + def execute_forward_( # noqa C901 self, T: int, @@ -549,7 +564,7 @@ def test_forward_gpu_no_cache_int8( use_experimental_tbe=st.booleans() if not TEST_WITH_ROCM else st.just(False), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -612,7 +627,7 @@ def test_forward_gpu_no_cache_fp16( use_experimental_tbe=st.booleans() if not TEST_WITH_ROCM else st.just(False), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -675,7 +690,7 @@ def test_forward_gpu_no_cache_fp32( cache_algorithm=st.sampled_from(CacheAlgorithm), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -740,7 +755,7 @@ def test_forward_gpu_uvm_cache_int8( use_experimental_tbe=st.booleans() if not TEST_WITH_ROCM else st.just(False), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -806,7 +821,7 @@ def 
test_forward_gpu_uvm_cache_fp16( use_experimental_tbe=st.booleans() if not TEST_WITH_ROCM else st.just(False), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -876,7 +891,7 @@ def test_forward_gpu_uvm_cache_fp32( output_dtype=st.sampled_from([SparseType.FP16, SparseType.INT8]), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much], @@ -1011,7 +1026,7 @@ def test_forward_fused_pooled_emb_quant( ), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much], @@ -1184,7 +1199,7 @@ def test_nbit_forward_fused_pooled_emb_quant( ), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much], @@ -1297,7 +1312,7 @@ def test_nbit_split_embedding_weights_with_scale_and_bias( output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=10, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -1547,7 +1562,7 @@ def test_backward_dense( # noqa C901 output_dtype=st.sampled_from([SparseType.FP16, SparseType.FP32]), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -1574,7 +1589,7 @@ def test_backward_none(self, **kwargs: Any) -> None: output_dtype=st.sampled_from([SparseType.FP16, SparseType.FP32]), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -2142,7 +2157,7 @@ def execute_backward_sgd_( # noqa C901 else st.just(True), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -2196,7 +2211,7 @@ def test_backward_sgd( # noqa C901 cache_algorithm=st.sampled_from(CacheAlgorithm), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -2640,7 +2655,7 @@ def execute_backward_adagrad_( # noqa C901 output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -2710,7 +2725,7 @@ def test_backward_adagrad_fp16_pmSUM( # noqa C901 output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -2779,7 +2794,7 @@ def test_backward_adagrad_fp16_pmMEAN( # noqa C901 output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, 
max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -2845,7 +2860,7 @@ def test_backward_adagrad_fp16_pmNONE( # noqa C901 output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -2915,7 +2930,7 @@ def test_backward_adagrad_fp32_pmSUM( # noqa C901 output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -2984,7 +2999,7 @@ def test_backward_adagrad_fp32_pmMEAN( # noqa C901 output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -3108,6 +3123,7 @@ def _generate_cache_tbes( return (cc, cc_ref, min(Es), sum(Ds)) + @optests.dontGenerateOpCheckTests("Serial OOM") @unittest.skipIf(*gpu_unavailable) @given( T=st.integers(min_value=1, max_value=5), @@ -3118,7 +3134,7 @@ def _generate_cache_tbes( mixed=st.booleans(), cache_algorithm=st.sampled_from(CacheAlgorithm), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_cache_pipeline( self, T: int, @@ -3234,6 +3250,7 @@ def _prefetch( torch.testing.assert_close(output, output_ref) self.assertTrue(torch.all(cc.lxu_cache_locking_counter == 0)) + @optests.dontGenerateOpCheckTests("Serial OOM") @unittest.skipIf(*gpu_unavailable) @given( T=st.integers(min_value=1, max_value=5), @@ -3244,7 +3261,7 @@ def _prefetch( mixed=st.booleans(), prefetch_location=st.sampled_from(["before_fwd", "between_fwd_bwd"]), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_cache_prefetch_pipeline( self, T: int, @@ -3266,6 +3283,7 @@ def test_cache_prefetch_pipeline( prefetch_stream=None, ) + @optests.dontGenerateOpCheckTests("Serial OOM") @unittest.skipIf(*gpu_unavailable) @given( T=st.integers(min_value=1, max_value=5), @@ -3275,7 +3293,7 @@ def test_cache_prefetch_pipeline( L=st.integers(min_value=1, max_value=20), mixed=st.booleans(), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_cache_prefetch_pipeline_stream_1( self, T: int, @@ -3296,6 +3314,7 @@ def test_cache_prefetch_pipeline_stream_1( prefetch_stream=torch.cuda.Stream(), ) + @optests.dontGenerateOpCheckTests("Serial OOM") @unittest.skipIf(*gpu_unavailable) @given( T=st.integers(min_value=1, max_value=5), @@ -3305,7 +3324,7 @@ def test_cache_prefetch_pipeline_stream_1( L=st.integers(min_value=1, max_value=20), mixed=st.booleans(), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_cache_prefetch_pipeline_stream_2( self, T: int, @@ -3977,7 +3996,7 @@ def get_wts_from_counter_adagrad( uvm_non_rowwise_momentum=st.booleans(), ) 
@settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -4057,7 +4076,7 @@ def test_backward_optimizers_adam( # noqa C901 ), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -4129,7 +4148,7 @@ def test_backward_optimizers_adagrad( # noqa C901 else st.just(True), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -4188,7 +4207,7 @@ def test_backward_optimizers_lamb( # noqa C901 else st.just(True), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], @@ -4509,7 +4528,7 @@ def execute_nbit_forward_( # noqa C901 do_pruning=st.booleans(), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, ) @@ -4596,7 +4615,7 @@ def test_nbit_forward_cpu( do_pruning=st.booleans(), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, ) @@ -4686,7 +4705,7 @@ def test_nbit_forward_gpu_no_cache_fp8_2048(self) -> None: do_pruning=st.booleans(), ) @settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES_LONG_RUNNING, deadline=None, ) @@ -4765,7 +4784,7 @@ def test_nbit_forward_gpu_no_cache( ), emulate_pruning=st.booleans(), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function( self, weights_ty: SparseType, @@ -4950,7 +4969,7 @@ def test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function( do_pruning=st.booleans(), use_array_for_index_remapping=st.booleans(), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_nbit_forward_uvm_cache( self, weights_ty: SparseType, @@ -5067,7 +5086,7 @@ def test_nbit_forward_uvm_cache( use_cpu_hashtable=st.booleans(), use_array_for_index_remapping=st.booleans(), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_pruning( self, T: int, @@ -5211,7 +5230,7 @@ def test_pruning( H=st.integers(min_value=512, max_value=1024), S=st.integers(min_value=0, max_value=128), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_cache_update_function(self, L: int, H: int, S: int) -> None: # Generate synthetic data linear_cache_indices_cpu = torch.randint(L, H, (S,)) @@ -5262,7 +5281,7 @@ def test_cache_update_function(self, L: int, H: int, S: int) -> None: self.assertLessEqual(cache_miss_forward_count, unique_cache_miss_count) @given(N=st.integers(min_value=1, max_value=8)) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, 
max_examples=MAX_EXAMPLES, deadline=None) def test_cache_miss_counter(self, N: int) -> None: # Create an abstract split table D = 8 @@ -5318,7 +5337,7 @@ def test_cache_miss_counter(self, N: int) -> None: self.assertEqual(tablewise_cache_miss[i], t_tablewise_cache_miss[i]) @given(N=st.integers(min_value=1, max_value=2)) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_stb_uvm_cache_stats(self, N: int) -> None: # Create an abstract split table D = 8 @@ -5369,7 +5388,7 @@ def test_stb_uvm_cache_stats(self, N: int) -> None: H=st.integers(min_value=512, max_value=1024), S=st.integers(min_value=0, max_value=128), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_nbit_cache_update_function(self, L: int, H: int, S: int) -> None: # Generate synthetic data linear_cache_indices_cpu = torch.randint(L, H, (S,)) @@ -5426,7 +5445,7 @@ def test_nbit_cache_update_function(self, L: int, H: int, S: int) -> None: @unittest.skipIf(*gpu_unavailable) @given(N=st.integers(min_value=1, max_value=8)) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_nbit_cache_miss_counter(self, N: int) -> None: # Create an abstract split table D = 8 @@ -5481,7 +5500,7 @@ def test_nbit_cache_miss_counter(self, N: int) -> None: N=st.integers(min_value=1, max_value=8), dtype=st.sampled_from([SparseType.INT8, SparseType.INT4, SparseType.INT2]), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_nbit_uvm_cache_stats(self, N: int, dtype: SparseType) -> None: # Create an abstract split table D = 8 @@ -5596,7 +5615,7 @@ def test_nbit_uvm_cache_stats(self, N: int, dtype: SparseType) -> None: N=st.integers(min_value=1, max_value=8), dtype=st.sampled_from([SparseType.INT8, SparseType.INT4, SparseType.INT2]), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_nbit_direct_mapped_uvm_cache_stats( self, N: int, dtype: SparseType ) -> None: @@ -5739,7 +5758,7 @@ def test_nbit_direct_mapped_uvm_cache_stats( ), mixed_B=st.booleans(), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_bounds_check( # noqa C901 self, T: int, @@ -6183,7 +6202,7 @@ def test_lxu_cache_lookup(self, associativity: int) -> None: @given( cache_sets=st.integers(min_value=10, max_value=300), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_lxu_cache_locking_counter_decrement( self, cache_sets: int, @@ -6248,7 +6267,7 @@ def test_lxu_cache_locking_counter_decrement( else st.just(True), test_internal=st.booleans(), ) - @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None) + @settings(verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None) def test_embedding_inplace_update( self, T: int, # num of embedding tables @@ -6406,7 +6425,7 @@ def test_embedding_inplace_update( num_indices_per_table=st.integers(min_value=1, max_value=5), ) 
@settings( - verbosity=Verbosity.verbose, + verbosity=VERBOSITY, max_examples=MAX_EXAMPLES, deadline=None, ) From 886bf42e44e618031bddfd3b9567cacc428e83ce Mon Sep 17 00:00:00 2001 From: Liu Cao Date: Tue, 28 Nov 2023 22:02:51 -0800 Subject: [PATCH 52/73] Revert D51607319: Refactor embedding_bounds_check Differential Revision: D51607319 Original commit changeset: 55051c08b804 Original Phabricator Diff: D51607319 fbshipit-source-id: 1453b9ae4278c64b9055c74b4fd301cedd2c475b --- fbgemm_gpu/CMakeLists.txt | 6 +++--- .../embedding_bounds_check.cu | 3 ++- .../embedding_bounds_check_host.cpp} | 2 ++ .../embedding_bounds_check_host_cpu.cpp} | 0 fbgemm_gpu/docs/Doxyfile.in | 4 ++-- 5 files changed, 9 insertions(+), 6 deletions(-) rename fbgemm_gpu/{src/split_embeddings_utils => codegen}/embedding_bounds_check.cu (99%) rename fbgemm_gpu/{src/split_embeddings_utils/embedding_bounds_check_gpu.cpp => codegen/embedding_bounds_check_host.cpp} (96%) rename fbgemm_gpu/{src/split_embeddings_utils/embedding_bounds_check_cpu.cpp => codegen/embedding_bounds_check_host_cpu.cpp} (100%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 8af7862213..cac90f07cf 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -560,6 +560,7 @@ set(fbgemm_gpu_sources_static_cpu codegen/embedding_forward_split_cpu.cpp codegen/embedding_forward_quantized_host_cpu.cpp codegen/embedding_backward_dense_host_cpu.cpp + codegen/embedding_bounds_check_host_cpu.cpp src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp @@ -579,13 +580,13 @@ set(fbgemm_gpu_sources_static_cpu src/split_embeddings_cache/lru_cache_populate_byte.cpp src/split_embeddings_cache/lxu_cache.cpp src/split_embeddings_cache/split_embeddings_cache_ops.cpp - src/split_embeddings_utils/embedding_bounds_check_cpu.cpp codegen/batch_index_select_dim0_cpu_host.cpp) if(NOT FBGEMM_CPU_ONLY) list(APPEND fbgemm_gpu_sources_static_cpu codegen/embedding_forward_quantized_host.cpp codegen/embedding_backward_dense_host.cpp + codegen/embedding_bounds_check_host.cpp src/memory_utils/memory_utils.cpp src/memory_utils/memory_utils_ops.cpp src/layout_transform_ops/layout_transform_ops_gpu.cpp @@ -597,7 +598,6 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/split_embeddings_cache_ops.cu src/metric_ops/metric_ops_host.cpp src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp - src/split_embeddings_utils/embedding_bounds_check_gpu.cpp src/input_combine_ops/input_combine_gpu.cpp codegen/batch_index_select_dim0_host.cpp) @@ -627,6 +627,7 @@ endif() if(NOT FBGEMM_CPU_ONLY) set(fbgemm_gpu_sources_static_gpu + codegen/embedding_bounds_check.cu codegen/embedding_forward_quantized_split_lookup.cu src/memory_utils/memory_utils.cu src/memory_utils/memory_utils_ops.cu @@ -690,7 +691,6 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/lxu_cache.cu src/split_embeddings_cache/linearize_cache_indices.cu src/split_embeddings_cache/reset_weight_momentum.cu - src/split_embeddings_utils/embedding_bounds_check.cu src/split_embeddings_utils/generate_vbe_metadata.cu src/split_embeddings_utils/get_infos_metadata.cu src/split_embeddings_utils/radix_sort_pairs.cu diff --git a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu b/fbgemm_gpu/codegen/embedding_bounds_check.cu similarity index 99% rename from fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu rename to 
fbgemm_gpu/codegen/embedding_bounds_check.cu index 700d8974d8..a93be2ba6c 100644 --- a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu +++ b/fbgemm_gpu/codegen/embedding_bounds_check.cu @@ -6,9 +6,10 @@ * LICENSE file in the root directory of this source tree. */ +#include "fbgemm_gpu/embedding_backward_template_helpers.cuh" + #include #include -#include "fbgemm_gpu/embedding_backward_template_helpers.cuh" using Tensor = at::Tensor; using namespace fbgemm_gpu; diff --git a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp similarity index 96% rename from fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp rename to fbgemm_gpu/codegen/embedding_bounds_check_host.cpp index 77f68eb1df..afa52729b7 100644 --- a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp @@ -9,7 +9,9 @@ #include #include #include +#include #include + #include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; diff --git a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_cpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp similarity index 100% rename from fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_cpu.cpp rename to fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp diff --git a/fbgemm_gpu/docs/Doxyfile.in b/fbgemm_gpu/docs/Doxyfile.in index 3e5c8c20c8..447ac5f62d 100644 --- a/fbgemm_gpu/docs/Doxyfile.in +++ b/fbgemm_gpu/docs/Doxyfile.in @@ -932,10 +932,10 @@ INPUT = "../include/fbgemm_gpu" \ "../codegen/embedding_backward_split_host_cpu_template.cpp" \ "../codegen/embedding_forward_quantized_host.cpp" \ "../codegen/embedding_forward_quantized_host_cpu.cpp" \ + "../codegen/embedding_bounds_check_host.cpp" \ + "../codegen/embedding_bounds_check_host_cpu.cpp" \ "../codegen/embedding_backward_dense_host.cpp" \ "../codegen/embedding_forward_template_helpers.cuh " \ - "../src/split_embeddings_utils/embedding_bounds_check_cpu.cpp" \ - "../src/split_embeddings_utils/embedding_bounds_check_gpu.cpp" \ "../src/permute_pooled_embedding_ops_gpu.cpp" \ # This tag can be used to specify the character encoding of the source files From c6e3fa2133197fe777fb85ae966c715641114854 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Wed, 29 Nov 2023 07:28:51 -0800 Subject: [PATCH 53/73] RW Dist change to support uneven sharding [1] FBGEMM changes (#2168) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2168 We reverted a diff due to an incompatibility between the frontend and backend packages during the code freeze. Here we reland only the backend part, which will not be picked up by the production package. The frontend part will land after the code freeze.
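The backend change is a one-line guard; a minimal Python sketch of its logic (names illustrative, the actual change is the C++ hunk below):

def uses_variable_bucket_sizes(block_bucketize_pos):
    # Mirrors the C++ guard `has_value() && size() != 0`: an absent OR
    # empty bucketize-position list falls back to fixed-size buckets.
    return block_bucketize_pos is not None and len(block_bucketize_pos) > 0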
Reviewed By: IvanKobzarev Differential Revision: D51496816 fbshipit-source-id: beac54e7d629e8919d1b280161dba491ed8a3431 --- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index be32cc2c4d..1be2fc8930 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -306,7 +306,8 @@ void _block_bucketize_sparse_features_cpu( const index_t* const block_sizes_data = block_sizes.data_ptr(); offset_t* batch_sizes_data = nullptr; const auto variable_batch_size = batch_size_per_feature.has_value(); - const auto variable_bucket_sizes = block_bucketize_pos.has_value(); + const auto variable_bucket_sizes = block_bucketize_pos.has_value() && + block_bucketize_pos.value().size() != 0; using uindex_t = std::make_unsigned_t; using uoffset_t = std::make_unsigned_t; std::vector lower_bounds(indices.numel(), 0); From 98932d6b09ed45ff0aa2c31d35b2468a1a0d2544 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 29 Nov 2023 12:49:58 -0800 Subject: [PATCH 54/73] Make fbgemm::masked_select_jagged_1d pt2_compliant (#2174) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2174 It needed an abstract impl. Reviewed By: williamwen42 Differential Revision: D51647394 fbshipit-source-id: 127168a49026f1f1261ab4fe883c05d80450b5e3 --- fbgemm_gpu/fbgemm_gpu/sparse_ops.py | 15 +++++++++++++++ .../jagged_tensor_ops/jagged_tensor_ops_cpu.cpp | 8 +++++++- fbgemm_gpu/test/failures_dict.json | 11 +---------- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py index b394d4c2d1..d7f17ac135 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py @@ -96,6 +96,21 @@ def permute_1D_sparse_data_meta( return permuted_lengths, permuted_indices, permuted_weights +@impl_abstract("fbgemm::masked_select_jagged_1d") +def masked_select_jagged_1d( + values: Tensor, lengths: Tensor, mask: Tensor +) -> Tuple[Tensor, Tensor]: + torch._check(values.dim() == 1) + torch._check(lengths.dim() == 1) + torch._check(values.device == lengths.device) + torch._check(values.device == mask.device) + + s0 = torch.library.get_ctx().new_dynamic_size() + masked_values = values.new_empty([s0]) + masked_lengths = torch.empty_like(lengths) + return masked_values, masked_lengths + + @impl_abstract("fbgemm::expand_into_jagged_permute") def expand_into_jagged_permute_meta( permute: Tensor, diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index 0b8525773e..4a8859cfc6 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -1569,6 +1569,11 @@ Tensor jagged_slice_forward_cpu( } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { +#ifdef HAS_IMPL_ABSTRACT_PYSTUB + m.impl_abstract_pystub( + "fbgemm_gpu.sparse_ops", + "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py"); +#endif // (dense, offsets) -> jagged. Offsets output is same as input. // SymInt is a new PyTorch 2.0 feature to support dynamic shape. See more // details at https://pytorch.org/get-started/pytorch-2.0/#dynamic-shapes. 
If @@ -1645,7 +1650,8 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "jagged_1d_to_truncated_values(Tensor values, Tensor lengths, int max_truncated_length) -> Tensor"); m.def( - "masked_select_jagged_1d(Tensor values, Tensor lengths, Tensor mask) -> (Tensor, Tensor)"); + "masked_select_jagged_1d(Tensor values, Tensor lengths, Tensor mask) -> (Tensor, Tensor)", + {PT2_COMPLIANT_TAG}); m.def( "jagged_softmax(Tensor values, Tensor x_offsets, int max_L) -> (Tensor, Tensor)", {PT2_COMPLIANT_TAG}); diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index 964038bb68..ee2d384eb5 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -349,16 +349,7 @@ "status": "xfail" } }, - "fbgemm::masked_select_jagged_1d": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_masked_select_jagged_1d": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_masked_select_jagged_1d": { - "comment": "", - "status": "xfail" - } - }, + "fbgemm::masked_select_jagged_1d": {}, "fbgemm::offsets_range": { "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_1d_to_dense": { "comment": "", From 24576058259dcffef6f6be2968406c38b404e010 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 29 Nov 2023 12:49:58 -0800 Subject: [PATCH 55/73] Add generate_opcheck_tests to input_combine_test (#2173) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2173 See title. Reviewed By: williamwen42 Differential Revision: D51647393 fbshipit-source-id: 2c18a70faea73545f1aa9eb4e190f114ec2fde91 --- fbgemm_gpu/test/failures_dict.json | 104 ++++++++++++++++++++++++++ fbgemm_gpu/test/input_combine_test.py | 5 +- 2 files changed, 107 insertions(+), 2 deletions(-) diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index ee2d384eb5..404791b965 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -374,6 +374,58 @@ "status": "xsuccess" } }, + "fbgemm::padding_fused_tbe_input_combine": { + "InputCombineTest.test_aot_dispatch_dynamic__test_padding_fused_input_combine_int32": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_aot_dispatch_dynamic__test_padding_fused_input_combine_int64": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_aot_dispatch_dynamic__test_padding_fused_input_combined_mix": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_faketensor__test_padding_fused_input_combine_int32": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_faketensor__test_padding_fused_input_combine_int64": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_faketensor__test_padding_fused_input_combined_mix": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::padding_fused_tbe_input_combine_with_length": { + "InputCombineTest.test_aot_dispatch_dynamic__test_padding_fused_input_combine_int32_with_length": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_aot_dispatch_dynamic__test_padding_fused_input_combine_int64_with_length": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_aot_dispatch_dynamic__test_padding_fused_input_combined_mix_with_length": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_faketensor__test_padding_fused_input_combine_int32_with_length": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_faketensor__test_padding_fused_input_combine_int64_with_length": { + 
"comment": "", + "status": "xfail" + }, + "InputCombineTest.test_faketensor__test_padding_fused_input_combined_mix_with_length": { + "comment": "", + "status": "xfail" + } + }, "fbgemm::permute102_baddbmm_permute102": { "SparseOpsTest.test_aot_dispatch_dynamic__test_permute102_baddbmm_permute102": { "comment": "", @@ -499,6 +551,58 @@ "comment": "", "status": "xfail" } + }, + "fbgemm::tbe_input_combine": { + "InputCombineTest.test_aot_dispatch_dynamic__test_input_combine_int32": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_aot_dispatch_dynamic__test_input_combine_int64": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_aot_dispatch_dynamic__test_input_combined_mix": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_faketensor__test_input_combine_int32": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_faketensor__test_input_combine_int64": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_faketensor__test_input_combined_mix": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::tbe_input_combine_with_length": { + "InputCombineTest.test_aot_dispatch_dynamic__test_input_combine_int32_with_length": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_aot_dispatch_dynamic__test_input_combine_int64_with_length": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_aot_dispatch_dynamic__test_input_combine_mix_with_length": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_faketensor__test_input_combine_int32_with_length": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_faketensor__test_input_combine_int64_with_length": { + "comment": "", + "status": "xfail" + }, + "InputCombineTest.test_faketensor__test_input_combine_mix_with_length": { + "comment": "", + "status": "xfail" + } } } } diff --git a/fbgemm_gpu/test/input_combine_test.py b/fbgemm_gpu/test/input_combine_test.py index cbd054b607..a042a93948 100644 --- a/fbgemm_gpu/test/input_combine_test.py +++ b/fbgemm_gpu/test/input_combine_test.py @@ -18,11 +18,11 @@ from fbgemm_gpu import open_source # noqa: F401 # pyre-ignore[21] - from test_utils import cpu_and_maybe_gpu + from test_utils import cpu_and_maybe_gpu, optests except Exception: torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:input_combine") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:input_combine_cpu") - from fbgemm_gpu.test.test_utils import cpu_and_maybe_gpu + from fbgemm_gpu.test.test_utils import cpu_and_maybe_gpu, optests DEFAULT_DEVICE = torch.device("cpu") @@ -127,6 +127,7 @@ def forward( # noqa C901 return combined_indices, combined_offsets, per_sample_weights +@optests.generate_opcheck_tests() class InputCombineTest(unittest.TestCase): def _get_inputs(self, dtypes, device=DEFAULT_DEVICE): indices_list = [ From 71e496ab12779b87fd1f576889faa3726704b637 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 29 Nov 2023 12:49:58 -0800 Subject: [PATCH 56/73] Make fbgemm::tbe_input_combine pt2_compliant (#2172) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2172 - Added abstract impl in sparse_ops.py. I don't think it's worth splitting up the abstract impls into multiple .py files right now, unless someone comes to us with memory issues. 
Reviewed By: williamwen42 Differential Revision: D51647392 fbshipit-source-id: 33743bb8f4b7755b5b34a83324822628b4b30e56 --- fbgemm_gpu/fbgemm_gpu/sparse_ops.py | 40 +++++++++++++++++++ .../input_combine_ops/input_combine_cpu.cpp | 9 ++++- fbgemm_gpu/test/failures_dict.json | 27 +------------ fbgemm_gpu/test/input_combine_test.py | 24 ++++++++++- 4 files changed, 71 insertions(+), 29 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py index d7f17ac135..24f11956e8 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py @@ -23,6 +23,8 @@ ) torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops_cpu") + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:input_combine") + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:input_combine_cpu") from torch import Tensor @@ -111,6 +113,44 @@ def masked_select_jagged_1d( return masked_values, masked_lengths +@impl_abstract("fbgemm::tbe_input_combine") +def tbe_input_combine_abstract( + indices_list: List[Tensor], + offsets_list: List[Tensor], + per_sample_weights: List[Tensor], + include_last_offsets: Tensor, +) -> Tuple[Tensor, Tensor, Tensor]: + torch._check(len(indices_list) > 0) + torch._check(len(indices_list) == len(offsets_list)) + torch._check(len(indices_list) == len(per_sample_weights)) + torch._check(len(indices_list) == include_last_offsets.numel()) + total_indices = 0 + need_weight = False + for index, offset, weight in zip(indices_list, offsets_list, per_sample_weights): + torch._check(index.dtype == torch.int or index.dtype == torch.long) + torch._check(offset.dtype == torch.int or offset.dtype == torch.long) + torch._check(index.dim() == 1) + torch._check(offset.dim() == 1) + torch._check(index.is_contiguous()) + torch._check(offset.is_contiguous()) + total_indices = total_indices + index.numel() + if weight.numel() > 0: + torch._check(weight.dim() == 1) + torch._check(weight.numel() == index.numel()) + torch._check(weight.is_contiguous()) + need_weight = True + total_offsets = torch.library.get_ctx().new_dynamic_size() + combined_indices = indices_list[0].new_empty([total_indices], dtype=torch.int) + combined_offsets = offsets_list[0].new_empty([total_offsets], dtype=torch.int) + if need_weight: + combined_weights = per_sample_weights[0].new_empty( + [total_indices], dtype=torch.float + ) + else: + combined_weights = torch.empty(0) + return combined_indices, combined_offsets, combined_weights + + @impl_abstract("fbgemm::expand_into_jagged_permute") def expand_into_jagged_permute_meta( permute: Tensor, diff --git a/fbgemm_gpu/src/input_combine_ops/input_combine_cpu.cpp b/fbgemm_gpu/src/input_combine_ops/input_combine_cpu.cpp index c836b91d98..ac711655b2 100644 --- a/fbgemm_gpu/src/input_combine_ops/input_combine_cpu.cpp +++ b/fbgemm_gpu/src/input_combine_ops/input_combine_cpu.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/input_combine.h" #include "fbgemm_gpu/sparse_ops_utils.h" @@ -383,8 +384,14 @@ padding_fused_tbe_input_combine_with_length_cpu( } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { +#ifdef HAS_IMPL_ABSTRACT_PYSTUB + m.impl_abstract_pystub( + "fbgemm_gpu.sparse_ops", + "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py"); +#endif m.def( - "tbe_input_combine(Tensor[] indices_list, Tensor[] offsets_list, Tensor[] per_sample_weights, Tensor include_last_offsets) -> (Tensor, Tensor, Tensor)"); + "tbe_input_combine(Tensor[] indices_list, Tensor[] offsets_list, Tensor[] per_sample_weights, Tensor include_last_offsets) -> (Tensor, Tensor, Tensor)", + {PT2_COMPLIANT_TAG}); m.def( "tbe_input_combine_with_length(Tensor[] indices_list, Tensor[] lengths_list, Tensor[] per_sample_weights) -> (Tensor, Tensor, Tensor)"); m.def( diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index 404791b965..b82d5f3c86 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -552,32 +552,7 @@ "status": "xfail" } }, - "fbgemm::tbe_input_combine": { - "InputCombineTest.test_aot_dispatch_dynamic__test_input_combine_int32": { - "comment": "", - "status": "xfail" - }, - "InputCombineTest.test_aot_dispatch_dynamic__test_input_combine_int64": { - "comment": "", - "status": "xfail" - }, - "InputCombineTest.test_aot_dispatch_dynamic__test_input_combined_mix": { - "comment": "", - "status": "xfail" - }, - "InputCombineTest.test_faketensor__test_input_combine_int32": { - "comment": "", - "status": "xfail" - }, - "InputCombineTest.test_faketensor__test_input_combine_int64": { - "comment": "", - "status": "xfail" - }, - "InputCombineTest.test_faketensor__test_input_combined_mix": { - "comment": "", - "status": "xfail" - } - }, + "fbgemm::tbe_input_combine": {}, "fbgemm::tbe_input_combine_with_length": { "InputCombineTest.test_aot_dispatch_dynamic__test_input_combine_int32_with_length": { "comment": "", diff --git a/fbgemm_gpu/test/input_combine_test.py b/fbgemm_gpu/test/input_combine_test.py index a042a93948..68d327528b 100644 --- a/fbgemm_gpu/test/input_combine_test.py +++ b/fbgemm_gpu/test/input_combine_test.py @@ -8,9 +8,10 @@ # pyre-unsafe import unittest -from typing import List, Optional, Tuple +from typing import Callable, Dict, List, Optional, Tuple import torch +from fbgemm_gpu import sparse_ops # noqa: F401 from hypothesis import given, settings try: @@ -127,7 +128,26 @@ def forward( # noqa C901 return combined_indices, combined_offsets, per_sample_weights -@optests.generate_opcheck_tests() +# e.g. "test_faketensor__test_cumsum": [unittest.expectedFailure] +# Please avoid putting tests here, you should put operator-specific +# skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json +additional_decorators: Dict[str, List[Callable]] = { + "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [ + # This operator has been grandfathered in. We need to fix this test failure. + unittest.expectedFailure, + ], + "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [ + # This operator has been grandfathered in. We need to fix this test failure. + unittest.expectedFailure, + ], + "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add_jagged_output": [ + # This operator has been grandfathered in. We need to fix this test failure. 
+ unittest.expectedFailure, + ], +} + + +@optests.generate_opcheck_tests(additional_decorators=additional_decorators) class InputCombineTest(unittest.TestCase): def _get_inputs(self, dtypes, device=DEFAULT_DEVICE): indices_list = [ From 90bb32d8da9c2962d96417fc2b848a414e5cfa54 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 29 Nov 2023 12:49:58 -0800 Subject: [PATCH 57/73] Mark some more ops as pt2_compliant_tag (#2171) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2171 Also deleted two skips that were marked as flaky. Those don't appear to actually be flaky. Reviewed By: williamwen42 Differential Revision: D51647391 fbshipit-source-id: 4c70558e2e6d4cb7d2c8566e40f5da06050681d4 --- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 9 ++++++--- fbgemm_gpu/test/failures_dict.json | 14 ++------------ 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 1be2fc8930..f0f405e84c 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2730,12 +2730,15 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "permute_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def( - "permute_2D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); + "permute_2D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)", + {PT2_COMPLIANT_TAG}); m.def( - "permute_1D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); + "permute_1D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)", + {PT2_COMPLIANT_TAG}); m.def("invert_permute(Tensor permute) -> Tensor"); m.def( - "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, SymInt output_size) -> Tensor"); + "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, SymInt output_size) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1, Tensor[]? 
block_bucketize_pos=None) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)"); m.def( diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index b82d5f3c86..95dbd2a1e0 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -436,18 +436,8 @@ "status": "xfail" } }, - "fbgemm::permute_1D_sparse_data": { - "SparseOpsTest.test_schema__test_permute_indices": { - "comment": "flaky", - "status": "skip" - } - }, - "fbgemm::permute_2D_sparse_data": { - "SparseOpsTest.test_schema__test_permute_indices": { - "comment": "flaky", - "status": "skip" - } - }, + "fbgemm::permute_1D_sparse_data": {}, + "fbgemm::permute_2D_sparse_data": {}, "fbgemm::permute_sequence_embeddings": { "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_embeddings": { "comment": "", From 911aec4876a266adca7ade54f342a06f2b35ff2a Mon Sep 17 00:00:00 2001 From: Jia Chen Date: Wed, 29 Nov 2023 14:14:28 -0800 Subject: [PATCH 58/73] suppress errors in `deeplearning/fbgemm/fbgemm_gpu` (#2159) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2159 Reviewed By: kinto0 Differential Revision: D51595408 fbshipit-source-id: 49bf60a11b37d6daf522f05bcbda5dca01c6a873 --- .../split_table_batched_embeddings_ops_training.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py index 5751817173..378f3819f0 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py @@ -349,8 +349,6 @@ def __init__( # noqa C901 self.embedding_specs = embedding_specs (rows, dims, locations, compute_devices) = zip(*embedding_specs) T_ = len(self.embedding_specs) - # pyre-fixme[8]: Attribute has type `List[int]`; used as - # `Tuple[Union[ComputeDevice, EmbeddingLocation, int]]`. self.dims: List[int] = dims assert T_ > 0 # mixed D is not supported by no bag kernels @@ -700,10 +698,6 @@ def __init__( # noqa C901 persistent=False, ) - # pyre-fixme[6]: For 1st argument expected `List[int]` but got - # `Tuple[Union[ComputeDevice, EmbeddingLocation, int]]`. - # pyre-fixme[6]: For 2nd argument expected `List[EmbeddingLocation]` but got - # `Tuple[Union[ComputeDevice, EmbeddingLocation, int]]`. cache_state = construct_cache_state(rows, locations, self.feature_table_map) # Add table-wise cache miss counter From 6c29dbdf7f776c87f89938edb6d89964a9c987a2 Mon Sep 17 00:00:00 2001 From: Supadchaya Puangpontip Date: Wed, 29 Nov 2023 18:52:52 -0800 Subject: [PATCH 59/73] Initialize empty values for fp8 quantize op (#2176) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2176 The PT2 compliancy tests call the op additional times and compare outputs. To make `FloatToFP8RowwiseQuantized` PT2 compliant, this diff initializes empty values for fp8 quantize to make output results deterministic. 
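A toy repro of the failure mode, under a simplified and hypothetical output layout (quantized bytes followed by an 8-byte qparam slot of which only the 4-byte fp32 scale is meaningful), not the actual kernel:

import torch

def rowwise_quantize_toy(row: torch.Tensor) -> torch.Tensor:
    out = torch.empty(row.numel() + 8, dtype=torch.uint8)
    scale = 255.0 / max(float(row.abs().max()), 1e-8)
    out[: row.numel()] = (row.abs() * scale).round().clamp(0, 255).to(torch.uint8)
    out[row.numel() : row.numel() + 4] = torch.tensor(
        [scale], dtype=torch.float32
    ).view(torch.uint8)
    # The fix: zero the unwritten tail. Without this the trailing bytes
    # keep whatever the allocator returned, so two calls on identical
    # inputs need not be bitwise-equal and the compare-outputs check fails.
    out[row.numel() + 4 :] = 0
    return out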
Reviewed By: q10 Differential Revision: D51419189 fbshipit-source-id: bf36b51510810ff61912a4fd2d306065958b8cd8 --- .../src/quantize_ops/quantize_fp8_rowwise.cu | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu b/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu index f8ae2fa3fd..169f8d2937 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu +++ b/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu @@ -49,6 +49,10 @@ __global__ inline void _float_to_FP8rowwise_cuda_kernel( const auto scale = max_pos / (kEpsilon + fmaxf(maximum_element, -minimum_element)); output_row_scale_bias[0] = scale; + // 8 bytes are allocated for scale but only 4 bytes are used + // value of the unassigned 4 bytes are hence indeterministic + // Initialize it to make the output deterministic for PT2 compliance + output_row_scale_bias[1] = 0.0; for (int64_t col = 0; col < ncols; ++col) { output_row[col] = float_to_hfp8(to_float(input_row[col]) * scale, ebit, bias, max_pos); @@ -112,6 +116,8 @@ __global__ inline void _get_FP8_qparam_cuda_kernel( reinterpret_cast(&output[row * output_columns + ncols_aligned]); output_row_qparams[0] = max_pos / (kEpsilon + maximum_element); + // Initialize it to make the output deterministic for PT2 compliance + output_row_qparams[1] = 0.0; } template @@ -211,14 +217,17 @@ Tensor _float_to_FP8rowwise_gpu_t(const Tensor& input, const bool forward) { // that size). auto output_dims = input_sizes.vec(); output_dims[last_dim] = output_columns; - auto output = at::empty( - output_dims, // 4 = sizeof(float) - input.options().dtype(at::kByte)); if (nrows == 0 || ncols == 0) { - return output; + return at::zeros( + output_dims, // 4 = sizeof(float) + input.options().dtype(at::kByte)); } + auto output = at::empty( + output_dims, // 4 = sizeof(float) + input.options().dtype(at::kByte)); + constexpr int threads_per_block = 256; const auto num_blocks = cuda_calc_xblock_count(nrows, threads_per_block); // think unsigned as we use 0, 255 @@ -361,14 +370,16 @@ Tensor _FP8rowwise_to_float_gpu_t( output_sdtype == SparseType::FP32 || output_sdtype == SparseType::FP16 || output_sdtype == SparseType::BF16); + if (nrows == 0 || output_columns == 0) { + return at::zeros( + output_dims, // 4 = sizeof(float) + input.options().dtype(getScalarType(output_sdtype))); + } + Tensor output = at::empty( output_dims, // 4 = sizeof(float) input.options().dtype(getScalarType(output_sdtype))); - if (nrows == 0 || output_columns == 0) { - return output; - } - constexpr int threads_per_block = 256; const int blockDim_x = From 48120dae22d170cdf8062c009b38080574505d82 Mon Sep 17 00:00:00 2001 From: generatedunixname89002005307016 Date: Wed, 29 Nov 2023 20:53:52 -0800 Subject: [PATCH 60/73] set strict as default typing mode in `deeplearning/fbgemm/fbgemm_gpu` (#2175) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2175 Reviewed By: azad-meta Differential Revision: D51673332 fbshipit-source-id: ffad51a6bb2c8746b6595bd6828c5bb8d2bf9e87 --- .../batched_unary_embeddings_benchmark.py | 7 +++-- .../bench/merge_embeddings_benchmark.py | 28 ++++++++++++++++-- .../batched_unary_embeddings_ops.py | 5 +++- .../split_embedding_inference_converter.py | 5 +++- .../test/batched_unary_embeddings_test.py | 11 +++++-- fbgemm_gpu/test/input_combine_test.py | 24 +++++++++++++-- fbgemm_gpu/test/lint/check_meta_header.py | 1 - .../test/merge_pooled_embeddings_test.py | 29 ++++++++++++++++++- 
...plit_embedding_inference_converter_test.py | 16 +++++++++- 9 files changed, 113 insertions(+), 13 deletions(-) diff --git a/fbgemm_gpu/bench/batched_unary_embeddings_benchmark.py b/fbgemm_gpu/bench/batched_unary_embeddings_benchmark.py index d15e6b34ef..2ab90148b0 100644 --- a/fbgemm_gpu/bench/batched_unary_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/batched_unary_embeddings_benchmark.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# pyre-unsafe import functools from math import sqrt from typing import List, Tuple @@ -29,7 +28,10 @@ def generate_unary_feature( - batch_size: int, num_embeddings: int + batch_size: int, + num_embeddings: int + # pyre-fixme[24]: Generic type `list` expects 1 type parameter, use + # `typing.List[]` to avoid runtime subscripting errors. ) -> Tuple[List, List, List]: lengths = [] offsets = [] @@ -90,6 +92,7 @@ def forward( @click.option("--num-tables", default=2) @click.option("--num-tasks", default=3) @click.option("--repeats", default=100) +# pyre-fixme[2]: Parameter must be annotated. def main(batch_size, num_tables, num_tasks, repeats) -> None: device = torch.device("cuda", 0) torch.cuda.set_device(device) diff --git a/fbgemm_gpu/bench/merge_embeddings_benchmark.py b/fbgemm_gpu/bench/merge_embeddings_benchmark.py index d6bd17d20f..d7f574d6f7 100644 --- a/fbgemm_gpu/bench/merge_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/merge_embeddings_benchmark.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# pyre-unsafe import logging import signal @@ -44,6 +43,7 @@ ) +# pyre-fixme[2]: Parameter must be annotated. def get_gpu_device(gpu_num) -> torch.device: return torch.device(f"cuda:{gpu_num}") @@ -53,6 +53,7 @@ def get_gpu_device(gpu_num) -> torch.device: # Reference: https://fburl.com/code/5ueyfv5j def get_table_batched_offsets_from_dense( merged_indices: torch.Tensor, + # pyre-fixme[2]: Parameter must be annotated. gpu_num, ) -> Tuple[torch.Tensor, torch.Tensor]: (T, B, L) = merged_indices.size() @@ -95,6 +96,7 @@ def generate_requests( return rs +# pyre-fixme[3]: Return type must be annotated. def _get_random_tensor( num_ads: int, embedding_dimension: int, @@ -140,7 +142,9 @@ def _get_random_tensor( return result_tensor +# pyre-fixme[3]: Return type must be annotated. def generate_tbe( + # pyre-fixme[2]: Parameter must be annotated. batch_indices, num_ads: int, embedding_dimension: int, @@ -204,7 +208,14 @@ def generate_tbe( def print_p2p_bandwidth( - num_gpus, iters, pooled_ad_embeddings, bytes_per_element + # pyre-fixme[2]: Parameter must be annotated. + num_gpus, + # pyre-fixme[2]: Parameter must be annotated. + iters, + # pyre-fixme[2]: Parameter must be annotated. + pooled_ad_embeddings, + # pyre-fixme[2]: Parameter must be annotated. + bytes_per_element, ) -> None: print("Pairwise GPU Copy Bandwidth (GB/s)") p2p_copy_bw = np.zeros((num_gpus, num_gpus)) @@ -289,12 +300,23 @@ def benchmark( # noqa C901 if p2p_bw: print_p2p_bandwidth(num_gpus, iters, pooled_ad_embeddings, bytes_per_element) + # pyre-fixme[53]: Captured variable `emb` is not annotated. + # pyre-fixme[53]: Captured variable `pooled_ad_embeddings` is not annotated. + # pyre-fixme[53]: Captured variable `requests` is not annotated. + # pyre-fixme[53]: Captured variable `tbe_offset` is not annotated. + # pyre-fixme[3]: Return type must be annotated. 
def pool_func_with_quantization( + # pyre-fixme[2]: Parameter must be annotated. batch_indices, + # pyre-fixme[2]: Parameter must be annotated. include_quantization, + # pyre-fixme[2]: Parameter must be annotated. include_tbe, + # pyre-fixme[2]: Parameter must be annotated. fused_tbe, + # pyre-fixme[2]: Parameter must be annotated. skip_dequantization, + # pyre-fixme[2]: Parameter must be annotated. data_type, ): if include_tbe: @@ -478,6 +500,8 @@ def main( ) if sweep: + # pyre-fixme[3]: Return type must be annotated. + # pyre-fixme[2]: Parameter must be annotated. def handler(signum, frame): logging.error("timeout") raise TimeoutError() diff --git a/fbgemm_gpu/fbgemm_gpu/batched_unary_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/batched_unary_embeddings_ops.py index 52b4587088..db9b4c48da 100644 --- a/fbgemm_gpu/fbgemm_gpu/batched_unary_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/batched_unary_embeddings_ops.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# pyre-unsafe from math import sqrt from typing import List @@ -28,6 +27,7 @@ def wrap_weight_to_parameter(weights: List[torch.Tensor]) -> List[torch.Tensor]: class BatchedUnaryEmbeddingBag(torch.nn.Module): + # pyre-fixme[3]: Return type must be annotated. def __init__(self, num_tasks: int, hash_sizes: List[int], long_index: bool = False): super().__init__() self.num_tasks = num_tasks @@ -49,6 +49,7 @@ def __init__(self, num_tasks: int, hash_sizes: List[int], long_index: bool = Fal self.register_buffer("table_offsets_tensor", table_offsets_tensor) self.init_parameters() + # pyre-fixme[3]: Return type must be annotated. def forward(self, offsets: torch.Tensor, input: torch.Tensor): # output is [N][B][T] return torch.ops.fbgemm.batched_unary_embeddings( @@ -59,6 +60,7 @@ def forward(self, offsets: torch.Tensor, input: torch.Tensor): ) @torch.jit.export + # pyre-fixme[3]: Return type must be annotated. def split_embedding_weights(self): embedding_weights = [] for n in range(self.num_tasks): @@ -73,6 +75,7 @@ def split_embedding_weights(self): return embedding_weights @torch.jit.export + # pyre-fixme[3]: Return type must be annotated. def init_parameters(self): for num_emb, param in zip( self.hash_sizes * self.num_tasks, diff --git a/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py b/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py index 86e53afbea..a2dd34ade5 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py +++ b/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# pyre-unsafe import logging import math @@ -29,6 +28,7 @@ # TODO: add per-feature based converter option (based on embedding_specs during inference) # TODO: optimize embedding pruning and quantization latency. class SplitEmbInferenceConverter: + # pyre-fixme[3]: Return type must be annotated. def __init__( self, quantize_type: SparseType, @@ -46,6 +46,7 @@ def convert_model(self, model: torch.nn.Module) -> torch.nn.Module: self._process_split_embs(model) return model + # pyre-fixme[2]: Parameter must be annotated. def _prune_by_weights_l2_norm(self, new_num_rows, weights) -> Tuple[Tensor, float]: assert new_num_rows > 0 from numpy.linalg import norm @@ -83,6 +84,8 @@ def _prune_embs( weights, indicators, threshold, torch.int32 ) + # pyre-fixme[3]: Return type must be annotated. 
+ # pyre-fixme[2]: Parameter must be annotated. def _get_quantization_config(self, name): quantization_config = self.quantization_config if quantization_config is None: diff --git a/fbgemm_gpu/test/batched_unary_embeddings_test.py b/fbgemm_gpu/test/batched_unary_embeddings_test.py index bbd97f294f..3d63aff90f 100644 --- a/fbgemm_gpu/test/batched_unary_embeddings_test.py +++ b/fbgemm_gpu/test/batched_unary_embeddings_test.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# pyre-unsafe import random import unittest @@ -30,6 +29,7 @@ # Relative tolerances +# pyre-fixme[5]: Global expression must be annotated. TOLERANCE_REL = { torch.float32: 1e-4, torch.float16: 1e-2, @@ -37,6 +37,7 @@ } # Absolute tolerances +# pyre-fixme[5]: Global expression must be annotated. TOLERANCE_ABS = { torch.float32: 1e-4, torch.float16: 1e-2, @@ -81,7 +82,11 @@ def forward( return torch.cat(tt_list).view(self.num_tasks, -1, len(self.hash_sizes)) def _generate_unary_features( - self, batch_size: int, num_embeddings: int + self, + batch_size: int, + num_embeddings: int + # pyre-fixme[24]: Generic type `list` expects 1 type parameter, use + # `typing.List[]` to avoid runtime subscripting errors. ) -> Tuple[List, List, List]: lengths = [] offsets = [] @@ -212,6 +217,8 @@ def _test_main( output = output[1:] output.sum().backward() + # pyre-fixme[56]: Pyre was not able to infer the type of argument + # `test_utils.gpu_unavailable` to decorator factory `unittest.skipIf`. @unittest.skipIf(*gpu_unavailable) def test_gpu(self) -> None: self._test_main(gpu_infer=True) diff --git a/fbgemm_gpu/test/input_combine_test.py b/fbgemm_gpu/test/input_combine_test.py index 68d327528b..b8fb2a4a75 100644 --- a/fbgemm_gpu/test/input_combine_test.py +++ b/fbgemm_gpu/test/input_combine_test.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# pyre-unsafe import unittest from typing import Callable, Dict, List, Optional, Tuple @@ -149,6 +148,8 @@ def forward( # noqa C901 @optests.generate_opcheck_tests(additional_decorators=additional_decorators) class InputCombineTest(unittest.TestCase): + # pyre-fixme[3]: Return type must be annotated. + # pyre-fixme[2]: Parameter must be annotated. def _get_inputs(self, dtypes, device=DEFAULT_DEVICE): indices_list = [ torch.tensor([1, 2, 3], dtype=dtypes[0], device=device), @@ -175,6 +176,7 @@ def _get_inputs(self, dtypes, device=DEFAULT_DEVICE): include_last_offsets, ) + # pyre-fixme[2]: Parameter must be annotated. def _run_test(self, dtypes) -> None: ( indices_list, @@ -212,6 +214,7 @@ def _run_test(self, dtypes) -> None: self.assertTrue(outputs[1].dtype == torch.int32) self.assertTrue(outputs[-1].size(0) == 0) + # pyre-fixme[2]: Parameter must be annotated. def _run_padding_fused_test(self, dtypes, batch_size) -> None: ( indices_list, @@ -255,8 +258,17 @@ def _run_padding_fused_test(self, dtypes, batch_size) -> None: self.assertTrue(outputs[1].dtype == torch.int32) self.assertTrue(outputs[-1].size(0) == 0) + # pyre-fixme[3]: Return type must be annotated. def _offsets_to_lengths( - self, offsets, indices, include_last_offsets, device=DEFAULT_DEVICE + self, + # pyre-fixme[2]: Parameter must be annotated. + offsets, + # pyre-fixme[2]: Parameter must be annotated. + indices, + # pyre-fixme[2]: Parameter must be annotated. + include_last_offsets, + # pyre-fixme[2]: Parameter must be annotated. 
+ device=DEFAULT_DEVICE, ): if include_last_offsets: offsets_complete = offsets @@ -269,6 +281,7 @@ def _offsets_to_lengths( ) return offsets_complete[1:] - offsets_complete[:-1] + # pyre-fixme[2]: Parameter must be annotated. def _run_test_with_length(self, dtypes, device=DEFAULT_DEVICE) -> None: ( indices_list, @@ -300,6 +313,7 @@ def _run_test_with_length(self, dtypes, device=DEFAULT_DEVICE) -> None: ref_lengths = self._offsets_to_lengths(ref_outputs[1], ref_outputs[0], True) self.assertTrue(ref_lengths.allclose(outputs[1])) + # pyre-fixme[2]: Parameter must be annotated. def _run_padding_fused_test_with_length(self, dtypes, batch_size) -> None: ( indices_list, @@ -343,16 +357,22 @@ def test_input_combine_int32(self) -> None: def test_input_combined_mix(self) -> None: self._run_test((torch.int64, torch.int32)) + # pyre-fixme[56]: Pyre was not able to infer the type of argument + # `test_utils.cpu_and_maybe_gpu()` to decorator factory `hypothesis.given`. @given(device=cpu_and_maybe_gpu()) @settings(deadline=None) def test_input_combine_int64_with_length(self, device: torch.device) -> None: self._run_test_with_length((torch.int64, torch.int64), device=device) + # pyre-fixme[56]: Pyre was not able to infer the type of argument + # `test_utils.cpu_and_maybe_gpu()` to decorator factory `hypothesis.given`. @given(device=cpu_and_maybe_gpu()) @settings(deadline=None) def test_input_combine_int32_with_length(self, device: torch.device) -> None: self._run_test_with_length((torch.int32, torch.int32), device=device) + # pyre-fixme[56]: Pyre was not able to infer the type of argument + # `test_utils.cpu_and_maybe_gpu()` to decorator factory `hypothesis.given`. @given(device=cpu_and_maybe_gpu()) @settings(deadline=None) def test_input_combine_mix_with_length(self, device: torch.device) -> None: diff --git a/fbgemm_gpu/test/lint/check_meta_header.py b/fbgemm_gpu/test/lint/check_meta_header.py index fd5178d753..5fd5e41f6c 100644 --- a/fbgemm_gpu/test/lint/check_meta_header.py +++ b/fbgemm_gpu/test/lint/check_meta_header.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# pyre-unsafe """Check Python source code contains Meta copyright header """ diff --git a/fbgemm_gpu/test/merge_pooled_embeddings_test.py b/fbgemm_gpu/test/merge_pooled_embeddings_test.py index 78d62fb416..a686cdb3eb 100644 --- a/fbgemm_gpu/test/merge_pooled_embeddings_test.py +++ b/fbgemm_gpu/test/merge_pooled_embeddings_test.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# pyre-unsafe import unittest from typing import Tuple @@ -37,6 +36,9 @@ @unittest.skipIf(*gpu_unavailable) @unittest.skipIf(open_source, "Not supported in open source yet") class MergePooledEmbeddingsTest(unittest.TestCase): + # pyre-fixme[56]: Pyre was not able to infer the type of argument + # `hypothesis.strategies.integers($parameter$min_value = 1, $parameter$max_value = + # 10)` to decorator factory `hypothesis.given`. @given( num_ads=st.integers(min_value=1, max_value=10), embedding_dimension=st.integers(min_value=1, max_value=32), @@ -50,11 +52,17 @@ class MergePooledEmbeddingsTest(unittest.TestCase): @settings(verbosity=Verbosity.verbose, max_examples=40, deadline=None) def test_merge( self, + # pyre-fixme[2]: Parameter must be annotated. num_ads, + # pyre-fixme[2]: Parameter must be annotated. 
embedding_dimension, + # pyre-fixme[2]: Parameter must be annotated. ads_tables, + # pyre-fixme[2]: Parameter must be annotated. num_gpus, + # pyre-fixme[2]: Parameter must be annotated. non_default_stream, + # pyre-fixme[2]: Parameter must be annotated. r, dim: int, ) -> None: @@ -83,6 +91,8 @@ def test_merge( pooled_ad_embeddings, uncat_size, batch_indices.device, dim ) + # pyre-fixme[3]: Return type must be annotated. + # pyre-fixme[2]: Parameter must be annotated. def ref(pooled_ad_embeddings, batch_indices): return torch.cat([p.cpu() for p in pooled_ad_embeddings], dim=dim) @@ -97,6 +107,9 @@ def ref(pooled_ad_embeddings, batch_indices): torch.testing.assert_close(output_ref, output.cpu()) torch.testing.assert_close(output_ref, output_cpu) + # pyre-fixme[56]: Pyre was not able to infer the type of argument + # `hypothesis.strategies.integers($parameter$min_value = 1, $parameter$max_value = + # 10)` to decorator factory `hypothesis.given`. @given( num_inputs=st.integers(min_value=1, max_value=10), num_gpus=st.integers(min_value=1, max_value=torch.cuda.device_count()), @@ -106,8 +119,11 @@ def ref(pooled_ad_embeddings, batch_indices): @settings(verbosity=Verbosity.verbose, max_examples=40, deadline=None) def test_all_to_one_device( self, + # pyre-fixme[2]: Parameter must be annotated. num_inputs, + # pyre-fixme[2]: Parameter must be annotated. num_gpus, + # pyre-fixme[2]: Parameter must be annotated. r, ) -> None: dst_device = torch.device(f"cuda:{r.randint(0, num_gpus - 1)}") @@ -133,6 +149,9 @@ def test_merge_pooled_embeddings_cpu_with_different_target_device(self) -> None: self.assertFalse(output_meta.is_cpu) self.assertTrue(output_meta.is_meta) + # pyre-fixme[56]: Pyre was not able to infer the type of argument + # `hypothesis.strategies.integers($parameter$min_value = 1, $parameter$max_value = + # 10)` to decorator factory `hypothesis.given`. @given( num_inputs=st.integers(min_value=1, max_value=10), num_gpus=st.integers(min_value=1, max_value=torch.cuda.device_count()), @@ -142,8 +161,11 @@ def test_merge_pooled_embeddings_cpu_with_different_target_device(self) -> None: @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None) def test_sum_reduce_to_one( self, + # pyre-fixme[2]: Parameter must be annotated. num_inputs, + # pyre-fixme[2]: Parameter must be annotated. num_gpus, + # pyre-fixme[2]: Parameter must be annotated. r, ) -> None: dst_device = torch.device(f"cuda:{r.randint(0, num_gpus - 1)}") @@ -168,6 +190,11 @@ def test_merge_pooled_embeddings_meta(self) -> None: cat_dim = 1 pooled_embeddings = [torch.ones(uncat_size, 4), torch.ones(uncat_size, 8)] + # pyre-fixme[53]: Captured variable `cat_dim` is not annotated. + # pyre-fixme[53]: Captured variable `pooled_embeddings` is not annotated. + # pyre-fixme[53]: Captured variable `uncat_size` is not annotated. + # pyre-fixme[3]: Return type must be annotated. + # pyre-fixme[2]: Parameter must be annotated. def fbgemm_merge_pooled_embeddings(device): pooled_embeddings_device = [ pooled_embedding.to(device) for pooled_embedding in pooled_embeddings diff --git a/fbgemm_gpu/test/split_embedding_inference_converter_test.py b/fbgemm_gpu/test/split_embedding_inference_converter_test.py index 22ad6ad3ae..126552c2db 100644 --- a/fbgemm_gpu/test/split_embedding_inference_converter_test.py +++ b/fbgemm_gpu/test/split_embedding_inference_converter_test.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-# pyre-unsafe import logging import math @@ -84,9 +83,13 @@ class SparseArch(nn.Module): def __init__( self, + # pyre-fixme[2]: Parameter must be annotated. emb_dim, + # pyre-fixme[2]: Parameter must be annotated. num_tables, + # pyre-fixme[2]: Parameter must be annotated. num_rows, + # pyre-fixme[2]: Parameter must be annotated. use_cpu, ) -> None: super().__init__() @@ -117,11 +120,16 @@ def __init__( -EMB_WEIGHT_UNIFORM_INIT_BOUND, +EMB_WEIGHT_UNIFORM_INIT_BOUND ) + # pyre-fixme[3]: Return type must be annotated. + # pyre-fixme[2]: Parameter must be annotated. def forward(self, indices, offsets): return self.emb_module(indices, offsets) class QuantizedSplitEmbeddingsTest(unittest.TestCase): + # pyre-fixme[56]: Pyre was not able to infer the type of argument + # `hypothesis.strategies.integers($parameter$min_value = 1, $parameter$max_value = + # 10)` to decorator factory `hypothesis.given`. @given( T=st.integers(min_value=1, max_value=10), D=st.integers(min_value=2, max_value=128), @@ -234,6 +242,9 @@ def test_quantize_workflow( ) @unittest.skipIf(*on_arm_platform) + # pyre-fixme[56]: Pyre was not able to infer the type of argument + # `hypothesis.strategies.booleans() if test_utils.gpu_available else + # hypothesis.strategies.just(True)` to decorator factory `hypothesis.given`. @given( use_cpu=st.booleans() if gpu_available else st.just(True), use_array_for_index_remapping=st.booleans(), @@ -319,6 +330,9 @@ def test_l2_norm_pruning_workflow( rtol=1.0e-1, ) + # pyre-fixme[56]: Pyre was not able to infer the type of argument + # `hypothesis.strategies.integers($parameter$min_value = 1, $parameter$max_value = + # 10)` to decorator factory `hypothesis.given`. @given( T=st.integers(min_value=1, max_value=10), D=st.integers(min_value=2, max_value=128), From 453c80e4d6f84706fef2d374cdef01aafdc3e54d Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 29 Nov 2023 20:58:37 -0800 Subject: [PATCH 61/73] Update AVX2 and AVX512 flags (#2167) Summary: - Update AVX2 and AVX512 flags to account for nvcc as the front-end compiler Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2167 Reviewed By: spcyppt Differential Revision: D51681615 Pulled By: q10 fbshipit-source-id: 231aa051f121ff7a5f6aac56f335442bbd312a49 --- .github/scripts/fbgemm_gpu_build.bash | 1 + .gitignore | 4 ++++ fbgemm_gpu/CMakeLists.txt | 20 +++++++++++++---- .../embedding_backward_split_grad_template.cu | 2 +- ..._backward_split_indice_weights_template.cu | 2 +- ...edding_forward_split_kernel_v2_template.cu | 10 ++++----- fbgemm_gpu/setup.py | 22 ++++++++++++++++--- .../jagged_dense_elementwise_mul_backward.cu | 3 ++- 8 files changed, 49 insertions(+), 15 deletions(-) diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 381efc2fbc..58c5e705b2 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -347,6 +347,7 @@ build_fbgemm_gpu_package () { --package_name="${package_name}" \ --python-tag="${python_tag}" \ --plat-name="${plat_name}" \ + --verbose \ "${build_args[@]}" # Run checks on the built libraries diff --git a/.gitignore b/.gitignore index 746e08c73e..20163c8dc8 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,10 @@ # found in: # https://github.com/github/gitignore/ +# General +.DS_Store +*~ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index cac90f07cf..67c36f125f 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -432,10 
+432,22 @@ else() DEPENDS "${optimizer_codegen_dependencies}") endif() +set(AVX2_FLAGS "-mavx2;-mf16c;-mfma;-fopenmp") +if(NOT FBGEMM_CPU_ONLY AND WSL_MODE) + # NVCC in WSL complains about unknown -mavx options + # https://github.com/pytorch/FBGEMM/issues/2135 + set(AVX2_FLAGS "-Xcompiler;-mavx;-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-fopenmp") +endif() + +set(AVX512_FLAGS "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl;-fopenmp") +if(NOT FBGEMM_CPU_ONLY AND WSL_MODE) + set(AVX512_FLAGS "-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-Xcompiler;-mavx512f;-Xcompiler;-mavx512bw;-Xcompiler;-mavx512dq;-Xcompiler;-mavx512vl;-fopenmp") +endif() + if(CXX_AVX2_FOUND) set_source_files_properties(${gen_cpu_source_files} PROPERTIES COMPILE_OPTIONS - "-mavx2;-mf16c;-mfma;-fopenmp") + "${AVX2_FLAGS}") else() set_source_files_properties(${gen_cpu_source_files} PROPERTIES COMPILE_OPTIONS @@ -504,13 +516,13 @@ set(fbgemm_sources_avx512 if(CXX_AVX2_FOUND) set_source_files_properties(${fbgemm_sources_avx2} PROPERTIES COMPILE_OPTIONS - "-mavx2;-mf16c;-mfma") + "${AVX2_FLAGS}") endif() if(CXX_AVX512_FOUND) set_source_files_properties(${fbgemm_sources_avx512} PROPERTIES COMPILE_OPTIONS - "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl") + "${AVX512_FLAGS}") endif() set(fbgemm_sources ${fbgemm_sources_normal}) @@ -618,7 +630,7 @@ endif() if(CXX_AVX2_FOUND) set_source_files_properties(${fbgemm_gpu_sources_static_cpu} PROPERTIES COMPILE_OPTIONS - "-mavx;-mf16c;-mfma;-mavx2;-fopenmp") + "${AVX2_FLAGS}") else() set_source_files_properties(${fbgemm_gpu_sources_static_cpu} PROPERTIES COMPILE_OPTIONS diff --git a/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu index 356e8a0c79..1db787760c 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu +++ b/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu @@ -151,7 +151,7 @@ __global__ __launch_bounds__(kMaxThreads) void grad_mean{{ vbe_desc }}_kernel( ) { int32_t T = D_offsets.size(0) - 1; int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y; - int32_t b; + [[maybe_unused]] int32_t b; int32_t t; const auto total_B = offsets.size(0) - 1; diff --git a/fbgemm_gpu/codegen/embedding_backward_split_indice_weights_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_indice_weights_template.cu index bfb6f1fef3..d6e72ccaa4 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_indice_weights_template.cu +++ b/fbgemm_gpu/codegen/embedding_backward_split_indice_weights_template.cu @@ -66,7 +66,7 @@ __global__ __launch_bounds__(kForwardMaxThreads) void } int32_t t; - int32_t b; + [[maybe_unused]] int32_t b; {%- if vbe %} const auto info = reinterpret_cast(&b_t_map[b_t])[0]; diff --git a/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu b/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu index 61b941e4d1..923b52baf4 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu @@ -130,7 +130,7 @@ __inline__ __device__ void process_all_indices_no_pooling( const auto total_load_D = static_cast(smem[params_offset + SAVED_PARAMS::P_total_load_D]); // Each thread loads a separate weight ptr - const auto weight_ptrs = reinterpret_cast(&weights[indices[threadIdx.x] * load_D]); + const auto weight_ptrs = reinterpret_cast(&weights[indices[threadIdx.x] * load_D]); // Assuming kWarpSize is a multiple of STEP for (uint32_t 
l_start = 0; l_start < TOTAL_L; l_start += STEP) { @@ -332,8 +332,8 @@ __noinline__ __device__ void process_all_indices_small_Ls( const cache_t* lxu_cache_weights = reinterpret_cast(smem[params_offset + LXU_CACHE_PARAMS::P_lxu_cache_weights]); SMEM_GENERIC_PTR[threadIdx.x] = cache_idx != kCacheLocationMissing ? - reinterpret_cast(&lxu_cache_weights[cache_idx * max_D_cache]) : - reinterpret_cast(&weights[indices[l] * load_D]); + reinterpret_cast(&lxu_cache_weights[cache_idx * max_D_cache]) : + reinterpret_cast(&weights[indices[l] * load_D]); } if (!std::is_same::value) { cache_look_up_bits = ballot_sync(cache_idx != kCacheLocationMissing); @@ -558,8 +558,8 @@ __noinline__ __device__ void process_all_indices_large_Ls( const auto* lxu_cache_weights = reinterpret_cast(smem[params_offset + LXU_CACHE_PARAMS::P_lxu_cache_weights]); SMEM_GENERIC_PTR[threadIdx.x] = cache_idx != kCacheLocationMissing ? - reinterpret_cast(&lxu_cache_weights[cache_idx * max_D_cache]) : - reinterpret_cast(&weights[indices[l] * load_D]); + reinterpret_cast(&lxu_cache_weights[cache_idx * max_D_cache]) : + reinterpret_cast(&weights[indices[l] * load_D]); } if (!std::is_same::value) { cache_look_up_bits = ballot_sync(cache_idx != kCacheLocationMissing); diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index 0a16e6f625..cb7ab36475 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -26,6 +26,11 @@ def parse_args(argv: List[str]) -> argparse.Namespace: parser = argparse.ArgumentParser(description="fbgemm_gpu setup") + parser.add_argument( + "--verbose", + action="store_true", + help="Print verbose logs during the build.", + ) parser.add_argument( "--package_variant", type=str, @@ -133,8 +138,6 @@ def set_cuda_environment_variables() -> None: def cmake_environment_variables(args) -> None: def _get_cxx11_abi(): try: - import torch - value = int(torch._C._GLIBCXX_USE_CXX11_ABI) except ImportError: value = 0 @@ -143,11 +146,22 @@ def _get_cxx11_abi(): torch_root = os.path.dirname(torch.__file__) os.environ["CMAKE_BUILD_PARALLEL_LEVEL"] = str(os.cpu_count() // 2) - cmake_args = [f"-DCMAKE_PREFIX_PATH={torch_root}", _get_cxx11_abi()] + cmake_args = [ + f"-DCMAKE_PREFIX_PATH={torch_root}", + _get_cxx11_abi(), + ] + + if args.verbose: + print("[SETUP.PY] Building in VERBOSE mode ...") + cmake_args.append("-DCMAKE_VERBOSE_MAKEFILE=1") + if args.package_variant == "cpu": + print("[SETUP.PY] Building the CPU-ONLY variant of FBGEMM_GPU ...") cmake_args.append("-DFBGEMM_CPU_ONLY=ON") + if args.nvml_lib_path: cmake_args.append(f"-DNVML_LIB_PATH={args.nvml_lib_path}") + return cmake_args @@ -183,6 +197,7 @@ def extract_variant_version(cls, variant: str) -> str: if variant == "cpu": variant_version = "+cpu" + elif variant == "cuda": set_cuda_environment_variables() if torch.version.cuda is not None: @@ -192,6 +207,7 @@ def extract_variant_version(cls, variant: str) -> str: sys.exit( "[SETUP.PY] Installed PyTorch variant is not CUDA; cannot determine the CUDA version!" 
) + elif variant == "rocm": if torch.version.hip is not None: rocm_version = torch.version.hip.split(".") diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu b/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu index fc4a08b1f2..41fb56a899 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu @@ -104,7 +104,8 @@ void jagged_jagged_elementwise_dense_output_( x_offset_ptrs.vals[d] = \ x_offsets_contig[d].template data_ptr(); \ } \ - const auto func_name = "jagged_jagged_elementwise_dense_output_kernel_"; \ + [[maybe_unused]] const auto func_name = \ + "jagged_jagged_elementwise_dense_output_kernel_"; \ jagged_jagged_elementwise_dense_output_kernel_ \ <<>>( \ MAKE_PTA_WITH_NAME(func_name, x_values, scalar_t, 2, 32), \ From 63d1198552dae2a7e6e798c892d8f5a0ccb22e44 Mon Sep 17 00:00:00 2001 From: generatedunixname89002005232357 Date: Thu, 30 Nov 2023 07:29:30 -0800 Subject: [PATCH 62/73] Revert D51647391: Multisect successfully blamed "D51647391: Mark some more ops as pt2_compliant_tag" for otest failure (#2179) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2179 This diff is reverting D51647391 D51647391: Mark some more ops as pt2_compliant_tag by zou3519 has been identified to be causing the following test failure: Tests affected: - [deeplearning/fbgemm/fbgemm_gpu:sparse_ops_test - test_schema__test_permute_indices (deeplearning.fbgemm.fbgemm_gpu.test.sparse_ops_test.SparseOpsTest)](https://www.internalfb.com/intern/test/562950068029445/) Here's the Multisect link: https://www.internalfb.com/multisect/3631441 Here are the tasks that are relevant to this breakage: We're generating a revert to back out the changes in this diff, please note the backout may land if someone accepts it. If you believe this diff has been generated in error you may Commandeer and Abandon it. Reviewed By: zou3519 Differential Revision: D51687883 fbshipit-source-id: 22d0bd6edd4206e1a6acb9b5f173846783b4479d --- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 9 +++------ fbgemm_gpu/test/failures_dict.json | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index f0f405e84c..1be2fc8930 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2730,15 +2730,12 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "permute_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def( - "permute_2D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)", - {PT2_COMPLIANT_TAG}); + "permute_2D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def( - "permute_1D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)", - {PT2_COMPLIANT_TAG}); + "permute_1D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? 
permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def("invert_permute(Tensor permute) -> Tensor"); m.def( - "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, SymInt output_size) -> Tensor", - {PT2_COMPLIANT_TAG}); + "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, SymInt output_size) -> Tensor"); m.def( "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1, Tensor[]? block_bucketize_pos=None) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)"); m.def( diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index 95dbd2a1e0..b82d5f3c86 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -436,8 +436,18 @@ "status": "xfail" } }, - "fbgemm::permute_1D_sparse_data": {}, - "fbgemm::permute_2D_sparse_data": {}, + "fbgemm::permute_1D_sparse_data": { + "SparseOpsTest.test_schema__test_permute_indices": { + "comment": "flaky", + "status": "skip" + } + }, + "fbgemm::permute_2D_sparse_data": { + "SparseOpsTest.test_schema__test_permute_indices": { + "comment": "flaky", + "status": "skip" + } + }, "fbgemm::permute_sequence_embeddings": { "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_embeddings": { "comment": "", From 1c40928d16a41353d4d0c5829d8bf6cf6e1564f7 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Thu, 30 Nov 2023 07:30:12 -0800 Subject: [PATCH 63/73] Make fbgemm::jagged_index_select pt2_compliant (#2170) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2170 - The problem with the original op was that in the Autograd implementation, it needed to call Tensor.item(). This doesn't work with FakeTensors (maybe it can some day in the future). - We create two new ops, `jagged_index_select_2d_forward_v2` and `jagged_index_add_2d_forward_v2` (which is effectively the backward) that do the Tensor.item() calls, and change fbgemm::jagged_index_select's Autograd implementation to call those. - We add abstract impls for those two new ops. - Finally, we move the fbgemm::jagged_index_select implementation to CompositeImplicitAutograd (and delete the CPU/CUDA impls, because those are redundant). 
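To make the v1 -> v2 contract change concrete, here is an illustrative sketch (not part of this diff; the tensor contents are made up, and it assumes an fbgemm_gpu build with these ops registered). It uses only the schemas defined in this diff:

    import torch
    import fbgemm_gpu  # noqa: F401  # registers the fbgemm ops

    values = torch.randn(10, 4)
    lengths = torch.tensor([3, 2, 5])
    indices = torch.tensor([2, 0])
    input_offsets = lengths.cumsum(0)            # [3, 5, 10]
    output_offsets = lengths[indices].cumsum(0)  # [5, 8]

    # v1: the caller (previously the Autograd wrapper) had to materialize
    # the dense output size with .item(), which FakeTensors cannot do.
    num_dense_output_rows = output_offsets[-1].item()
    out_v1 = torch.ops.fbgemm.jagged_index_select_2d_forward(
        values, indices, input_offsets, output_offsets, num_dense_output_rows)

    # v2: the data access happens inside the op, and the new abstract impl
    # models the number of output rows as an unbacked dynamic size.
    out_v2 = torch.ops.fbgemm.jagged_index_select_2d_forward_v2(
        values, indices, input_offsets, output_offsets)
    assert torch.equal(out_v1, out_v2)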
Reviewed By: williamwen42, aakhundov Differential Revision: D51670069 fbshipit-source-id: b7ae86dcb02a993ec3bad94a839b707faf4f9098 --- fbgemm_gpu/fbgemm_gpu/sparse_ops.py | 32 ++++++++ .../jagged_tensor_ops/jagged_tensor_ops.cu | 5 -- .../jagged_tensor_ops_autograd.cpp | 40 +++------- .../jagged_tensor_ops_cpu.cpp | 76 ++++++++++++++++++- fbgemm_gpu/test/failures_dict.json | 12 +-- 5 files changed, 124 insertions(+), 41 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py index 24f11956e8..0979959a53 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py @@ -151,6 +151,38 @@ def tbe_input_combine_abstract( return combined_indices, combined_offsets, combined_weights +@impl_abstract("fbgemm::jagged_index_select_2d_forward_v2") +def jagged_index_select_2d_forward_v2_abstract( + values: Tensor, + indices: Tensor, + input_offsets: Tensor, + output_offsets: Tensor, +) -> Tensor: + torch._check(values.device == indices.device) + torch._check(values.device == input_offsets.device) + torch._check(values.device == output_offsets.device) + torch._check(values.dim() == 2) + num_dense_output_rows = torch.library.get_ctx().new_dynamic_size() + num_cols = values.size(1) + return values.new_empty([num_dense_output_rows, num_cols]) + + +@impl_abstract("fbgemm::jagged_index_add_2d_forward_v2") +def jagged_index_add_2d_forward_v2_abstract( + values: Tensor, + indices: Tensor, + input_offsets: Tensor, + output_offsets: Tensor, + num_output_rows: int, +) -> Tensor: + torch._check(values.device == indices.device) + torch._check(values.device == input_offsets.device) + torch._check(values.device == output_offsets.device) + torch._check(values.dim() == 2) + num_cols = values.size(1) + return values.new_empty([num_output_rows, num_cols]) + + @impl_abstract("fbgemm::expand_into_jagged_permute") def expand_into_jagged_permute_meta( permute: Tensor, diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops.cu index 186560caaf..dbccc6fdfd 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops.cu @@ -37,8 +37,3 @@ FBGEMM_OP_DISPATCH(CUDA, "jagged_2d_to_dense", fbgemm_gpu::jagged_2d_to_dense); FBGEMM_OP_DISPATCH(CUDA, "jagged_softmax", fbgemm_gpu::jagged_softmax); FBGEMM_OP_DISPATCH(CUDA, "jagged_jagged_bmm", fbgemm_gpu::jagged_jagged_bmm); FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_bmm", fbgemm_gpu::jagged_dense_bmm); - -FBGEMM_OP_DISPATCH( - CUDA, - "jagged_index_select", - fbgemm_gpu::jagged_index_select_2d); diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp index 30060728d7..89c2c89116 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp @@ -524,30 +524,20 @@ class JaggedIndexSelect2dOp Tensor output_offsets = output_lengths.cumsum(0); Tensor input_offsets = lengths.cumsum(0); - int64_t num_dense_output_rows = - output_offsets[output_offsets.numel() - 1].item(); - ctx->save_for_backward({indices, output_offsets, input_offsets}); - ctx->saved_data["num_dense_grad_rows"] = num_dense_output_rows; - ctx->saved_data["num_input_rows"] = values.size(0); + ctx->saved_data["num_input_rows"] = values.sym_size(0); static auto op = c10::Dispatcher::singleton() - .findSchemaOrThrow("fbgemm::jagged_index_select_2d_forward", "") + 
.findSchemaOrThrow("fbgemm::jagged_index_select_2d_forward_v2", "") .typed(); + const Tensor& output_offsets)>(); return { - op.call( - values, - indices, - input_offsets, - output_offsets, - num_dense_output_rows), + op.call(values, indices, input_offsets, output_offsets), output_lengths}; } @@ -565,29 +555,20 @@ class JaggedIndexSelect2dOp TENSORS_ON_SAME_DEVICE(grad, indices); - int64_t num_dense_grad_rows = - ctx->saved_data["num_dense_grad_rows"].toInt(); - int64_t num_output_rows = ctx->saved_data["num_input_rows"].toInt(); + auto num_output_rows = ctx->saved_data["num_input_rows"].toSymInt(); static auto op = c10::Dispatcher::singleton() - .findSchemaOrThrow("fbgemm::jagged_index_add_2d_forward", "") + .findSchemaOrThrow("fbgemm::jagged_index_add_2d_forward_v2", "") .typed(); + c10::SymInt num_output_rows)>(); return { - op.call( - grad, - indices, - grad_offsets, - output_offsets, - num_dense_grad_rows, - num_output_rows), + op.call(grad, indices, grad_offsets, output_offsets, num_output_rows), torch::autograd::Variable(), // lengths torch::autograd::Variable() // indices }; @@ -883,6 +864,9 @@ TORCH_LIBRARY_IMPL(fbgemm, Autograd, m) { m.impl("jagged_softmax", TORCH_FN(fbgemm_gpu::jagged_softmax)); m.impl("jagged_jagged_bmm", TORCH_FN(fbgemm_gpu::jagged_jagged_bmm)); m.impl("jagged_dense_bmm", TORCH_FN(fbgemm_gpu::jagged_dense_bmm)); - m.impl("jagged_index_select", TORCH_FN(fbgemm_gpu::jagged_index_select_2d)); m.impl("jagged_slice", TORCH_FN(fbgemm_gpu::jagged_slice)); } + +TORCH_LIBRARY_IMPL(fbgemm, CompositeImplicitAutograd, m) { + m.impl("jagged_index_select", TORCH_FN(fbgemm_gpu::jagged_index_select_2d)); +} diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index 4a8859cfc6..5a1753b239 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -1136,6 +1136,63 @@ Tensor jagged_index_select_2d_forward_cpu( return output; } +// v2 supports PT2 Dynamic Shapes. +// The problem with v1 is that it accepts a redundant num_dense_output_rows arg +// that we compute by peeking at output_offsets.data. +// PT2 has problems with data access, so we hide the data access inside +// the new operator. +Tensor jagged_index_select_2d_forward_v2_impl( + const Tensor& values, + const Tensor& indices, + const Tensor& input_offsets, + const Tensor& output_offsets) { + int64_t num_dense_output_rows = + output_offsets[output_offsets.numel() - 1].item(); + static auto v1_op = + c10::Dispatcher::singleton() + .findSchemaOrThrow("fbgemm::jagged_index_select_2d_forward", "") + .typed(); + return v1_op.call( + values, indices, input_offsets, output_offsets, num_dense_output_rows); +} + +// v2 supports PT2 Dynamic Shapes. +// The problem with v1 is that it accepts a redundant num_dense_output_rows arg +// that we compute by peeking at input_offsets.data. +// PT2 has problems with data access, so we hide the data access inside +// the new operator. 
+Tensor jagged_index_add_2d_forward_v2_impl( + const Tensor& values, + const Tensor& indices, + const Tensor& input_offsets, + const Tensor& output_offsets, + const int64_t num_output_rows) { + int64_t num_dense_output_rows = + input_offsets[input_offsets.numel() - 1].item(); + static auto v1_op = + c10::Dispatcher::singleton() + .findSchemaOrThrow("fbgemm::jagged_index_add_2d_forward", "") + .typed(); + return v1_op.call( + values, + indices, + input_offsets, + output_offsets, + num_dense_output_rows, + num_output_rows); +} + template void jagged_index_add_2d_kernel( at::TensorAccessor output, @@ -1642,11 +1699,18 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "batched_dense_vec_jagged_2d_mul_backward(Tensor grad_output, Tensor v, Tensor a_values, Tensor a_offsets) -> (Tensor, Tensor)", {PT2_COMPLIANT_TAG}); m.def( - "jagged_index_select(Tensor values, Tensor lengths, Tensor indices) -> Tensor[]"); + "jagged_index_select(Tensor values, Tensor lengths, Tensor indices) -> Tensor[]", + {PT2_COMPLIANT_TAG}); m.def( "jagged_index_select_2d_forward(Tensor values, Tensor indices, Tensor input_offsets, Tensor output_offsets, int num_dense_output_rows) -> Tensor"); + m.def( + "jagged_index_select_2d_forward_v2(Tensor values, Tensor indices, Tensor input_offsets, Tensor output_offsets) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "jagged_index_add_2d_forward(Tensor values, Tensor indices, Tensor input_offsets, Tensor output_offsets, int num_dense_input_rows, int num_output_rows) -> Tensor"); + m.def( + "jagged_index_add_2d_forward_v2(Tensor values, Tensor indices, Tensor input_offsets, Tensor output_offsets, SymInt num_output_rows) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "jagged_1d_to_truncated_values(Tensor values, Tensor lengths, int max_truncated_length) -> Tensor"); m.def( @@ -1728,7 +1792,6 @@ TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { DISPATCH_TO_CPU( "jagged_index_select_2d_forward", fbgemm_gpu::jagged_index_select_2d_forward_cpu); - DISPATCH_TO_CPU("jagged_index_select", fbgemm_gpu::jagged_index_select_2d); DISPATCH_TO_CPU( "jagged_index_add_2d_forward", fbgemm_gpu::jagged_index_add_2d_forward_cpu); @@ -1749,3 +1812,12 @@ TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { "jagged_dense_bmm_forward", fbgemm_gpu::jagged_dense_bmm_forward); DISPATCH_TO_CPU("jagged_slice_forward", fbgemm_gpu::jagged_slice_forward_cpu); } + +TORCH_LIBRARY_IMPL(fbgemm, CompositeExplicitAutograd, m) { + m.impl( + "jagged_index_select_2d_forward_v2", + fbgemm_gpu::jagged_index_select_2d_forward_v2_impl); + m.impl( + "jagged_index_add_2d_forward_v2", + fbgemm_gpu::jagged_index_add_2d_forward_v2_impl); +} diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index b82d5f3c86..1f8ebd34a8 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -268,27 +268,27 @@ "fbgemm::jagged_index_select": { "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_index_select_2d": { "comment": "", - "status": "xfail" + "status": "xsuccess" }, "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_jagged_index_select_2d_in_inference": { "comment": "", - "status": "xfail" + "status": "xsuccess" }, "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_keyed_jagged_index_select_dim1": { "comment": "", - "status": "xfail" + "status": "xsuccess" }, "JaggedTensorOpsTest.test_faketensor__test_jagged_index_select_2d": { "comment": "", - "status": "xfail" + "status": "xsuccess" }, "JaggedTensorOpsTest.test_faketensor__test_jagged_index_select_2d_in_inference": { "comment": "", - "status": "xfail" + 
"status": "xsuccess" }, "JaggedTensorOpsTest.test_faketensor__test_keyed_jagged_index_select_dim1": { "comment": "", - "status": "xfail" + "status": "xsuccess" } }, "fbgemm::jagged_jagged_bmm": {}, From 1c9307240ba8f3a7f00cc1738c747a16a9747956 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Thu, 30 Nov 2023 11:00:04 -0800 Subject: [PATCH 64/73] Make fbgemm::permute_1D_sparse_data, permute_2D_sparse_data pt2_compliant (#2180) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2180 - We fix fbgemm::permute_1d_sparse_data's implementation to never return views. The schema for the operator promises that it is functional. This was causing flaky tests. - fbgemm::permute_2d_sparse_data already has that fix applied, so we just remove the skip Reviewed By: williamwen42 Differential Revision: D51711257 fbshipit-source-id: 388aa89d7b7be3036ec3a620eb1c239e2e02d0dc --- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 6 ++++-- fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu | 6 +++++- fbgemm_gpu/test/failures_dict.json | 14 ++------------ 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 1be2fc8930..bf3b0f22b3 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2730,9 +2730,11 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "permute_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); m.def( - "permute_2D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); + "permute_2D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)", + {PT2_COMPLIANT_TAG}); m.def( - "permute_1D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)"); + "permute_1D_sparse_data(Tensor permute, Tensor lengths, Tensor values, Tensor? weights=None, SymInt? permuted_lengths_sum=None) -> (Tensor, Tensor, Tensor?)", + {PT2_COMPLIANT_TAG}); m.def("invert_permute(Tensor permute) -> Tensor"); m.def( "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, SymInt output_size) -> Tensor"); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu b/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu index 2c03165268..145d279c0d 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu @@ -85,7 +85,11 @@ permute_1D_sparse_data_cuda( if (permuted_lengths_size == 0 || lengths_size == 0) { // Permutation will not be performed. Return the input tensors - return {lengths.view({-1}), indices, weights}; + return { + lengths.view({-1}).clone(), + indices.clone(), + weights.has_value() ? 
c10::make_optional(weights->clone()) + : c10::nullopt}; } Tensor permuted_lengths; diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index 1f8ebd34a8..e29789f6b0 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -436,18 +436,8 @@ "status": "xfail" } }, - "fbgemm::permute_1D_sparse_data": { - "SparseOpsTest.test_schema__test_permute_indices": { - "comment": "flaky", - "status": "skip" - } - }, - "fbgemm::permute_2D_sparse_data": { - "SparseOpsTest.test_schema__test_permute_indices": { - "comment": "flaky", - "status": "skip" - } - }, + "fbgemm::permute_1D_sparse_data": {}, + "fbgemm::permute_2D_sparse_data": {}, "fbgemm::permute_sequence_embeddings": { "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_embeddings": { "comment": "", From 3d477a071d1a7cdabc91bb1f1f328caa7de9ae1c Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Thu, 30 Nov 2023 11:00:04 -0800 Subject: [PATCH 65/73] Mark some more ops as pt2_compliant (#2181) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2181 These passed all generated opcheck tests. Reviewed By: williamwen42 Differential Revision: D51711876 fbshipit-source-id: 26be06c5183c8de2036e455cf21df6a149bd6da8 --- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index bf3b0f22b3..a6dccc0fee 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2737,7 +2737,8 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { {PT2_COMPLIANT_TAG}); m.def("invert_permute(Tensor permute) -> Tensor"); m.def( - "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, SymInt output_size) -> Tensor"); + "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, SymInt output_size) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1, Tensor[]? block_bucketize_pos=None) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)"); m.def( @@ -2808,7 +2809,8 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "index_select_dim0(Tensor input, Tensor indices, SymInt? consecutive_range_start=0, SymInt? consecutive_range_length=0, bool? skip_indices_sorting_fwd=None) -> Tensor"); m.def( - "group_index_select_dim0(Tensor[] input_group, Tensor[] indices_group) -> Tensor[]"); + "group_index_select_dim0(Tensor[] input_group, Tensor[] indices_group) -> Tensor[]", + {PT2_COMPLIANT_TAG}); // This is an one-off op to be used in split_embedding_utils.py for zipf // generation w/o replacement along dim=-1. If requires_unique=True, find // smallest unique k. 
If the number of unique elements is less than k, From 121c20bd96b3f17b3290bf46c1ccc9ca88b68a78 Mon Sep 17 00:00:00 2001 From: generatedunixname89002005307016 Date: Thu, 30 Nov 2023 13:40:53 -0800 Subject: [PATCH 66/73] set strict as default typing mode in `deeplearning/fbgemm/fbgemm_gpu` (#2184) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2184 Reviewed By: azad-meta Differential Revision: D51710423 fbshipit-source-id: a282a011278cff953d19bf0f969d982240e554c4 --- fbgemm_gpu/test/input_combine_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fbgemm_gpu/test/input_combine_test.py b/fbgemm_gpu/test/input_combine_test.py index b8fb2a4a75..db1f0e8928 100644 --- a/fbgemm_gpu/test/input_combine_test.py +++ b/fbgemm_gpu/test/input_combine_test.py @@ -130,6 +130,7 @@ def forward( # noqa C901 # e.g. "test_faketensor__test_cumsum": [unittest.expectedFailure] # Please avoid putting tests here, you should put operator-specific # skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json +# pyre-fixme[24]: Generic type `Callable` expects 2 type parameters. additional_decorators: Dict[str, List[Callable]] = { "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [ # This operator has been grandfathered in. We need to fix this test failure. From 327dcf9a2386b4796b970a332399988ebc63be94 Mon Sep 17 00:00:00 2001 From: William Wen Date: Thu, 30 Nov 2023 14:34:01 -0800 Subject: [PATCH 67/73] Get fbgemm::FloatToFP8RowwiseQuantized opcheck tests passing Summary: Title Reviewed By: zou3519 Differential Revision: D51692611 fbshipit-source-id: 5b9353f8730352ce3a8d7f89124668a8450c8456 --- .../src/quantize_ops/quantize_ops_cpu.cpp | 4 +++- .../src/quantize_ops/quantize_ops_meta.cpp | 18 ++++++++++++++++++ fbgemm_gpu/test/failures_dict.json | 4 ++-- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp b/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp index 63988b4b0c..dc5ab72be3 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp +++ b/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp @@ -408,7 +408,9 @@ at::Tensor _hfp8_to_float_cpu( TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def("FloatToFused8BitRowwiseQuantized(Tensor t) -> Tensor"); - m.def("FloatToFP8RowwiseQuantized(Tensor t, bool forward) -> Tensor"); + m.def( + "FloatToFP8RowwiseQuantized(Tensor t, bool forward) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "FloatToPaddedFP8RowwiseQuantized(Tensor t, bool forward, int row_dim) -> Tensor"); m.def( diff --git a/fbgemm_gpu/src/quantize_ops/quantize_ops_meta.cpp b/fbgemm_gpu/src/quantize_ops/quantize_ops_meta.cpp index 471116942c..2cecb4ceac 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_ops_meta.cpp +++ b/fbgemm_gpu/src/quantize_ops/quantize_ops_meta.cpp @@ -49,10 +49,28 @@ Tensor FP8rowwise_to_float_meta( } } +Tensor FloatToFP8RowwiseQuantized_meta(const Tensor& input, bool forward) { + TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); + + const at::SymIntArrayRef input_sizes = input.sym_sizes(); + + const auto last_dim = input_sizes.size() - 1; + const at::SymInt ncols = input_sizes[last_dim]; + const at::SymInt ncols_aligned = (ncols + 4 - 1) / 4 * 4; + const at::SymInt output_columns = ncols_aligned + 2 * sizeof(float); + + auto output_dims = input_sizes.vec(); + output_dims[last_dim] = output_columns; + return at::empty_symint(output_dims, input.options().dtype(at::kByte)); +} + } // namespace fbgemm_gpu TORCH_LIBRARY_IMPL(fbgemm, Meta, m) { m.impl( "FP8RowwiseQuantizedToFloat", 
TORCH_FN(fbgemm_gpu::FP8rowwise_to_float_meta)); + m.impl( + "FloatToFP8RowwiseQuantized", + TORCH_FN(fbgemm_gpu::FloatToFP8RowwiseQuantized_meta)); } diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index e29789f6b0..43efa968c9 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -6,11 +6,11 @@ "fbgemm::FloatToFP8RowwiseQuantized": { "TestFP8RowwiseQuantizationConversion.test_aot_dispatch_dynamic__test_quantize_and_dequantize_op_fp8_rowwise": { "comment": "", - "status": "xfail" + "status": "xsuccess" }, "TestFP8RowwiseQuantizationConversion.test_faketensor__test_quantize_and_dequantize_op_fp8_rowwise": { "comment": "", - "status": "xfail" + "status": "xsuccess" } }, "fbgemm::FloatToPaddedFP8RowwiseQuantized": { From bf980a7205a2e0c68802c74be832e93c419928fe Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 30 Nov 2023 14:35:46 -0800 Subject: [PATCH 68/73] Refactor embedding_bounds_check (#2178) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2178 - Migrate embedding_bounds_check out of codegen and into split_embeddings_utils Reviewed By: sryap Differential Revision: D51688407 fbshipit-source-id: e021212f7c7123e770998cd6115f4a99461b1562 --- fbgemm_gpu/CMakeLists.txt | 6 +++--- fbgemm_gpu/docs/Doxyfile.in | 4 ++-- .../split_embeddings_utils}/embedding_bounds_check.cu | 3 +-- .../split_embeddings_utils/embedding_bounds_check_cpu.cpp} | 0 .../split_embeddings_utils/embedding_bounds_check_gpu.cpp} | 2 -- 5 files changed, 6 insertions(+), 9 deletions(-) rename fbgemm_gpu/{codegen => src/split_embeddings_utils}/embedding_bounds_check.cu (99%) rename fbgemm_gpu/{codegen/embedding_bounds_check_host_cpu.cpp => src/split_embeddings_utils/embedding_bounds_check_cpu.cpp} (100%) rename fbgemm_gpu/{codegen/embedding_bounds_check_host.cpp => src/split_embeddings_utils/embedding_bounds_check_gpu.cpp} (96%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 67c36f125f..16a80c13f4 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -572,7 +572,6 @@ set(fbgemm_gpu_sources_static_cpu codegen/embedding_forward_split_cpu.cpp codegen/embedding_forward_quantized_host_cpu.cpp codegen/embedding_backward_dense_host_cpu.cpp - codegen/embedding_bounds_check_host_cpu.cpp src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp @@ -592,13 +591,13 @@ set(fbgemm_gpu_sources_static_cpu src/split_embeddings_cache/lru_cache_populate_byte.cpp src/split_embeddings_cache/lxu_cache.cpp src/split_embeddings_cache/split_embeddings_cache_ops.cpp + src/split_embeddings_utils/embedding_bounds_check_cpu.cpp codegen/batch_index_select_dim0_cpu_host.cpp) if(NOT FBGEMM_CPU_ONLY) list(APPEND fbgemm_gpu_sources_static_cpu codegen/embedding_forward_quantized_host.cpp codegen/embedding_backward_dense_host.cpp - codegen/embedding_bounds_check_host.cpp src/memory_utils/memory_utils.cpp src/memory_utils/memory_utils_ops.cpp src/layout_transform_ops/layout_transform_ops_gpu.cpp @@ -610,6 +609,7 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/split_embeddings_cache_ops.cu src/metric_ops/metric_ops_host.cpp src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp + src/split_embeddings_utils/embedding_bounds_check_gpu.cpp src/input_combine_ops/input_combine_gpu.cpp codegen/batch_index_select_dim0_host.cpp) @@ -639,7 +639,6 @@ endif() if(NOT 
FBGEMM_CPU_ONLY) set(fbgemm_gpu_sources_static_gpu - codegen/embedding_bounds_check.cu codegen/embedding_forward_quantized_split_lookup.cu src/memory_utils/memory_utils.cu src/memory_utils/memory_utils_ops.cu @@ -703,6 +702,7 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/lxu_cache.cu src/split_embeddings_cache/linearize_cache_indices.cu src/split_embeddings_cache/reset_weight_momentum.cu + src/split_embeddings_utils/embedding_bounds_check.cu src/split_embeddings_utils/generate_vbe_metadata.cu src/split_embeddings_utils/get_infos_metadata.cu src/split_embeddings_utils/radix_sort_pairs.cu diff --git a/fbgemm_gpu/docs/Doxyfile.in b/fbgemm_gpu/docs/Doxyfile.in index 447ac5f62d..3e5c8c20c8 100644 --- a/fbgemm_gpu/docs/Doxyfile.in +++ b/fbgemm_gpu/docs/Doxyfile.in @@ -932,10 +932,10 @@ INPUT = "../include/fbgemm_gpu" \ "../codegen/embedding_backward_split_host_cpu_template.cpp" \ "../codegen/embedding_forward_quantized_host.cpp" \ "../codegen/embedding_forward_quantized_host_cpu.cpp" \ - "../codegen/embedding_bounds_check_host.cpp" \ - "../codegen/embedding_bounds_check_host_cpu.cpp" \ "../codegen/embedding_backward_dense_host.cpp" \ "../codegen/embedding_forward_template_helpers.cuh " \ + "../src/split_embeddings_utils/embedding_bounds_check_cpu.cpp" \ + "../src/split_embeddings_utils/embedding_bounds_check_gpu.cpp" \ "../src/permute_pooled_embedding_ops_gpu.cpp" \ # This tag can be used to specify the character encoding of the source files diff --git a/fbgemm_gpu/codegen/embedding_bounds_check.cu b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu similarity index 99% rename from fbgemm_gpu/codegen/embedding_bounds_check.cu rename to fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu index a93be2ba6c..700d8974d8 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check.cu +++ b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu @@ -6,10 +6,9 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include "fbgemm_gpu/embedding_backward_template_helpers.cuh" - #include #include +#include "fbgemm_gpu/embedding_backward_template_helpers.cuh" using Tensor = at::Tensor; using namespace fbgemm_gpu; diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_cpu.cpp similarity index 100% rename from fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp rename to fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_cpu.cpp diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp similarity index 96% rename from fbgemm_gpu/codegen/embedding_bounds_check_host.cpp rename to fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp index afa52729b7..77f68eb1df 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp +++ b/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp @@ -9,9 +9,7 @@ #include #include #include -#include #include - #include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; From c58679a2041c3b75fac5670a88afd3c640768d0f Mon Sep 17 00:00:00 2001 From: Gufan Yin Date: Thu, 30 Nov 2023 17:43:57 -0800 Subject: [PATCH 69/73] Revert D51688407: Refactor embedding_bounds_check Differential Revision: D51688407 Original commit changeset: e021212f7c71 Original Phabricator Diff: D51688407 fbshipit-source-id: fa2c01bbd0f48609561dc6a353f9088bb91f0177 --- fbgemm_gpu/CMakeLists.txt | 6 +++--- .../embedding_bounds_check.cu | 3 ++- .../embedding_bounds_check_host.cpp} | 2 ++ .../embedding_bounds_check_host_cpu.cpp} | 0 fbgemm_gpu/docs/Doxyfile.in | 4 ++-- 5 files changed, 9 insertions(+), 6 deletions(-) rename fbgemm_gpu/{src/split_embeddings_utils => codegen}/embedding_bounds_check.cu (99%) rename fbgemm_gpu/{src/split_embeddings_utils/embedding_bounds_check_gpu.cpp => codegen/embedding_bounds_check_host.cpp} (96%) rename fbgemm_gpu/{src/split_embeddings_utils/embedding_bounds_check_cpu.cpp => codegen/embedding_bounds_check_host_cpu.cpp} (100%) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 16a80c13f4..67c36f125f 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -572,6 +572,7 @@ set(fbgemm_gpu_sources_static_cpu codegen/embedding_forward_split_cpu.cpp codegen/embedding_forward_quantized_host_cpu.cpp codegen/embedding_backward_dense_host_cpu.cpp + codegen/embedding_bounds_check_host_cpu.cpp src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp @@ -591,13 +592,13 @@ set(fbgemm_gpu_sources_static_cpu src/split_embeddings_cache/lru_cache_populate_byte.cpp src/split_embeddings_cache/lxu_cache.cpp src/split_embeddings_cache/split_embeddings_cache_ops.cpp - src/split_embeddings_utils/embedding_bounds_check_cpu.cpp codegen/batch_index_select_dim0_cpu_host.cpp) if(NOT FBGEMM_CPU_ONLY) list(APPEND fbgemm_gpu_sources_static_cpu codegen/embedding_forward_quantized_host.cpp codegen/embedding_backward_dense_host.cpp + codegen/embedding_bounds_check_host.cpp src/memory_utils/memory_utils.cpp src/memory_utils/memory_utils_ops.cpp src/layout_transform_ops/layout_transform_ops_gpu.cpp @@ -609,7 +610,6 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/split_embeddings_cache_ops.cu src/metric_ops/metric_ops_host.cpp src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp - 
src/split_embeddings_utils/embedding_bounds_check_gpu.cpp src/input_combine_ops/input_combine_gpu.cpp codegen/batch_index_select_dim0_host.cpp) @@ -639,6 +639,7 @@ endif() if(NOT FBGEMM_CPU_ONLY) set(fbgemm_gpu_sources_static_gpu + codegen/embedding_bounds_check.cu codegen/embedding_forward_quantized_split_lookup.cu src/memory_utils/memory_utils.cu src/memory_utils/memory_utils_ops.cu @@ -702,7 +703,6 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/lxu_cache.cu src/split_embeddings_cache/linearize_cache_indices.cu src/split_embeddings_cache/reset_weight_momentum.cu - src/split_embeddings_utils/embedding_bounds_check.cu src/split_embeddings_utils/generate_vbe_metadata.cu src/split_embeddings_utils/get_infos_metadata.cu src/split_embeddings_utils/radix_sort_pairs.cu diff --git a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu b/fbgemm_gpu/codegen/embedding_bounds_check.cu similarity index 99% rename from fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu rename to fbgemm_gpu/codegen/embedding_bounds_check.cu index 700d8974d8..a93be2ba6c 100644 --- a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check.cu +++ b/fbgemm_gpu/codegen/embedding_bounds_check.cu @@ -6,9 +6,10 @@ * LICENSE file in the root directory of this source tree. */ +#include "fbgemm_gpu/embedding_backward_template_helpers.cuh" + #include #include -#include "fbgemm_gpu/embedding_backward_template_helpers.cuh" using Tensor = at::Tensor; using namespace fbgemm_gpu; diff --git a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp similarity index 96% rename from fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp rename to fbgemm_gpu/codegen/embedding_bounds_check_host.cpp index 77f68eb1df..afa52729b7 100644 --- a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_gpu.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp @@ -9,7 +9,9 @@ #include #include #include +#include #include + #include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; diff --git a/fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_cpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp similarity index 100% rename from fbgemm_gpu/src/split_embeddings_utils/embedding_bounds_check_cpu.cpp rename to fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp diff --git a/fbgemm_gpu/docs/Doxyfile.in b/fbgemm_gpu/docs/Doxyfile.in index 3e5c8c20c8..447ac5f62d 100644 --- a/fbgemm_gpu/docs/Doxyfile.in +++ b/fbgemm_gpu/docs/Doxyfile.in @@ -932,10 +932,10 @@ INPUT = "../include/fbgemm_gpu" \ "../codegen/embedding_backward_split_host_cpu_template.cpp" \ "../codegen/embedding_forward_quantized_host.cpp" \ "../codegen/embedding_forward_quantized_host_cpu.cpp" \ + "../codegen/embedding_bounds_check_host.cpp" \ + "../codegen/embedding_bounds_check_host_cpu.cpp" \ "../codegen/embedding_backward_dense_host.cpp" \ "../codegen/embedding_forward_template_helpers.cuh " \ - "../src/split_embeddings_utils/embedding_bounds_check_cpu.cpp" \ - "../src/split_embeddings_utils/embedding_bounds_check_gpu.cpp" \ "../src/permute_pooled_embedding_ops_gpu.cpp" \ # This tag can be used to specify the character encoding of the source files From 0fc0d4e0bfb465a82e51f8e1e48185eacd50b632 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Thu, 30 Nov 2023 23:26:17 -0800 Subject: [PATCH 70/73] Remove indices and offsets copying from prefetch (#2186) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2186 Since 
`linearize_cache_indices` did not support the case where `indices` and
`offsets` have different types, we cast both `indices` and `offsets` to the
same type, namely `int64_t`. In some cases this caused memory usage to surge,
increasing the peak memory requirement.

This diff modifies the `linearize_cache_indices` op to support `indices` and
`offsets` with different types.

Reviewed By: ehsanardestani

Differential Revision: D51723551

fbshipit-source-id: f40c9cc6e8e4435a8cc01702edd85cf843835c07
---
 ..._table_batched_embeddings_ops_inference.py |  4 --
 ...t_table_batched_embeddings_ops_training.py |  1 -
 .../linearize_cache_indices.cu                | 43 +++++++++++--------
 3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py
index 301dad837a..d5b0d66d23 100644
--- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py
+++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py
@@ -505,10 +505,6 @@ def prefetch(self, indices: Tensor, offsets: Tensor) -> None:
         if not self.lxu_cache_weights.numel():
             return

-        # FIXME: check the int32_t range failure in https://fburl.com/gdoc/kcdnrnvg .
-        # The real failure should be in cache handling in https://fburl.com/ox3f26r0 .
-        indices, offsets = indices.long(), offsets.long()
-
         linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices(
             self.cache_hash_size_cumsum,
             indices,
diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
index 378f3819f0..36bf5f43f9 100644
--- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
+++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
@@ -1158,7 +1158,6 @@ def _prefetch(self, indices: Tensor, offsets: Tensor) -> None:
         if not self.lxu_cache_weights.numel():
             return

-        (indices, offsets) = indices.long(), offsets.long()
         linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices(
             self.cache_hash_size_cumsum,
             indices,
diff --git a/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu b/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu
index 088ba930e5..9bb6fe5c86 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu
@@ -13,15 +13,15 @@ using namespace fbgemm_gpu;

 namespace {

-template <typename index_t>
+template <typename index_t, typename offset_t>
 __global__ __launch_bounds__(kMaxThreads) void linearize_cache_indices_kernel(
     const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits>
         cache_hash_size_cumsum,
     const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
         indices,
-    const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<offset_t, 1, at::RestrictPtrTraits>
         table_offsets,
-    pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+    pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits>
         linear_cache_indices) {
   const index_t index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index >= indices.size(0)) {
@@ -72,31 +72,36 @@ DLL_PUBLIC Tensor linearize_cache_indices_cuda(
   const auto B = (offsets.size(0) - 1) / T;
   TORCH_CHECK(B >= 0);

-  auto linear_cache_indices = at::empty_like(indices);
+  auto linear_cache_indices =
+      at::empty(indices.sizes(), indices.options().dtype(at::kLong));
   const auto num_indices = indices.numel();
   if (B == 0 || num_indices == 0) {
     return linear_cache_indices;
   }

-  auto table_offsets = offsets.slice(0, B, B * T, B);
+  const auto table_offsets = offsets.slice(0, B, B * T, B);

   AT_DISPATCH_INDEX_TYPES(
-      indices.scalar_type(),
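      // The nested dispatch that follows binds offset_t to the offsets dtype
      // (an outer AT_DISPATCH_INDEX_TYPES on table_offsets.scalar_type() with
      // `using offset_t = index_t`) and index_t to the indices dtype (an
      // inner dispatch on indices.scalar_type()), so every supported
      // (offset_t, index_t) combination gets its own kernel instantiation and
      // int32 indices can pair with int64 offsets without an upfront cast.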
"linearize_cache_indices_kernel", [&] { + table_offsets.scalar_type(), "linearize_cache_indices_kernel_1", [&] { + using offset_t = index_t; + AT_DISPATCH_INDEX_TYPES( + indices.scalar_type(), "linearize_cache_indices_kernel_2", [&] { #ifdef FBGEMM_GPU_MEMCHECK - const char* func_name = "linearize_cache_indices_kernel"; + const char* func_name = "linearize_cache_indices_kernel"; #endif - linearize_cache_indices_kernel<<< - div_round_up(num_indices, kMaxThreads), - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - MAKE_PTA_WITH_NAME( - func_name, cache_hash_size_cumsum, int64_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name, table_offsets, index_t, 1, 32), - MAKE_PTA_WITH_NAME( - func_name, linear_cache_indices, index_t, 1, 32)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); + linearize_cache_indices_kernel<<< + div_round_up(num_indices, kMaxThreads), + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + MAKE_PTA_WITH_NAME( + func_name, cache_hash_size_cumsum, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name, table_offsets, offset_t, 1, 32), + MAKE_PTA_WITH_NAME( + func_name, linear_cache_indices, int64_t, 1, 32)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); }); return linear_cache_indices; } From d5cdefdbfd212de3951c0a73200a9ad91bad4aab Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Fri, 1 Dec 2023 10:57:56 -0800 Subject: [PATCH 71/73] Add GPU kernel to support variable bucket size in block_bucketize_sparse_features (#2150) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2150 CPU version implementation D50868649 Reviewed By: sryap Differential Revision: D51414310 fbshipit-source-id: 6b447911c902feff5039bd6fc2a69d029621b1cf --- .../sparse_block_bucketize_features.cu | 166 +++++++++++++++++- fbgemm_gpu/test/sparse_ops_test.py | 33 ++++ 2 files changed, 190 insertions(+), 9 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu index 7e18862538..676ec9f4d8 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu @@ -48,13 +48,18 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_sparse_features_cuda_kernel const offset_t* const __restrict__ offsets_data, const index_t* const __restrict__ indices_data, offset_t* const __restrict__ new_lengths_data, - offset_t* __restrict__ length_to_feature_idx) { + offset_t* __restrict__ length_to_feature_idx, + const offset_t* const __restrict__ block_bucketize_pos_concat, + const offset_t* const __restrict__ block_bucketize_pos_offsets, + offset_t* __restrict__ indices_to_lb) { using uindex_t = std::make_unsigned_t; CUDA_KERNEL_LOOP(b_t, lengths_size) { const auto t = length_to_feature_idx ? length_to_feature_idx[b_t] : b_t / B; index_t blk_size = block_sizes_data[t]; offset_t rowstart = (b_t == 0 ? 0 : offsets_data[b_t - 1]); offset_t rowend = offsets_data[b_t]; + const auto use_block_bucketize_pos = + (block_bucketize_pos_concat != nullptr); for (index_t i = rowstart; i < rowend; ++i) { // We have use cases using none-hashed raw indices that can be either // negative or larger than embedding table hash_size (blk_size * @@ -63,7 +68,27 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_sparse_features_cuda_kernel // range of blk_size, we expect the later embedding module to take care // of hashing indices calculation. 
uindex_t idx = static_cast(indices_data[i]); - uindex_t p = idx < blk_size * my_size ? idx / blk_size : idx % my_size; + uindex_t p = 0; + if (!use_block_bucketize_pos) { + p = idx < blk_size * my_size ? idx / blk_size : idx % my_size; + } else { + index_t first = block_bucketize_pos_offsets[t]; + index_t last = block_bucketize_pos_offsets[t + 1]; + + while (first < last) { + index_t middle = first + ((last - first) / 2); + if (static_cast(block_bucketize_pos_concat[middle]) <= + idx) { + first = ++middle; + } else { + last = middle; + } + } + uindex_t lb = + static_cast(first - block_bucketize_pos_offsets[t] - 1); + indices_to_lb[i] = lb; + p = lb < my_size ? lb : idx % my_size; + } new_lengths_data[p * lengths_size + b_t]++; } } @@ -95,7 +120,10 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_sparse_features_cuda_kernel scalar_t* __restrict__ new_weights_data, index_t* __restrict__ new_pos_data, index_t* const __restrict__ unbucketize_permute_data, - const offset_t* const __restrict__ length_to_feature_idx) { + const offset_t* const __restrict__ length_to_feature_idx, + const offset_t* const __restrict__ block_bucketize_pos_concat, + const offset_t* const __restrict__ block_bucketize_pos_offsets, + const offset_t* const __restrict__ indices_to_lb) { using uindex_t = std::make_unsigned_t; using uoffset_t = std::make_unsigned_t; CUDA_KERNEL_LOOP(b_t, lengths_size) { @@ -103,6 +131,8 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_sparse_features_cuda_kernel index_t blk_size = block_sizes_data[t]; offset_t rowstart = (b_t == 0 ? 0 : offsets_data[b_t - 1]); offset_t rowend = offsets_data[b_t]; + const auto use_block_bucketize_pos = + (block_bucketize_pos_concat != nullptr); for (index_t i = rowstart; i < rowend; ++i) { // We have use cases using none-hashed raw indices that can be either // negative or larger than embedding table hash_size (blk_size * @@ -111,9 +141,18 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_sparse_features_cuda_kernel // range of blk_size, we expect the later embedding module to take care // of hashing indices calculation. uindex_t idx = static_cast(indices_data[i]); - uindex_t p = idx < blk_size * my_size ? idx / blk_size : idx % my_size; - uindex_t new_idx = - idx < blk_size * my_size ? idx % blk_size : idx / my_size; + uindex_t p = 0; + uindex_t new_idx = 0; + if (!use_block_bucketize_pos) { + p = idx < blk_size * my_size ? idx / blk_size : idx % my_size; + new_idx = idx < blk_size * my_size ? idx % blk_size : idx / my_size; + } else { + uindex_t lb = indices_to_lb[i]; + p = lb < my_size ? lb : idx % my_size; + new_idx = lb < my_size ? 
idx - + block_bucketize_pos_concat[lb + block_bucketize_pos_offsets[t]] + : idx / my_size; + } uoffset_t pos = new_offsets_data[p * lengths_size + b_t]; new_indices_data[pos] = new_idx; new_offsets_data[p * lengths_size + b_t]++; @@ -148,9 +187,7 @@ block_bucketize_sparse_features_cuda( const c10::optional& weights, const c10::optional& batch_size_per_feature, const int64_t max_B, - const c10::optional>& /*block_bucketize_pos*/ // Only used in CPU variant -) { + const c10::optional>& block_bucketize_pos) { TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(lengths, indices); at::cuda::OptionalCUDAGuard device_guard; @@ -184,6 +221,7 @@ block_bucketize_sparse_features_cuda( } auto length_to_feature_idx = at::empty({lengths_size}, lengths_contig.options()); + auto indices_to_lb = at::empty_like(indices); if (batch_size_per_feature.has_value()) { constexpr auto threads_per_block = 256; const auto num_blocks = @@ -207,6 +245,27 @@ block_bucketize_sparse_features_cuda( }); } + at::Tensor block_bucketize_pos_concat = + at::empty({1}, lengths_contig.options()); + at::Tensor block_bucketize_pos_offsets = + at::empty({1}, lengths_contig.options()); + + if (block_bucketize_pos.has_value()) { + block_bucketize_pos_concat = at::cat(block_bucketize_pos.value(), 0); + std::vector sizes_; + sizes_.reserve(block_bucketize_pos.value().size() + 1); + for (auto const& t : block_bucketize_pos.value()) { + sizes_.push_back(t.numel()); + } + sizes_.push_back(0); + at::Tensor sizes_vec = + at::tensor(sizes_, at::TensorOptions().dtype(lengths_contig.dtype())); + block_bucketize_pos_offsets = asynchronous_exclusive_cumsum_cpu( + sizes_vec); // expect sizes_vec to be a small tensor, using cpu instead + // of gpu for cumsum + block_bucketize_pos_offsets = block_bucketize_pos_offsets.to( + block_bucketize_pos_concat.device(), true); + } constexpr auto threads_per_block = 256; const auto num_blocks = cuda_calc_xblock_count(lengths_size, threads_per_block); @@ -233,6 +292,15 @@ block_bucketize_sparse_features_cuda( new_lengths.data_ptr(), batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_concat.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_offsets.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? indices_to_lb.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -286,6 +354,17 @@ block_bucketize_sparse_features_cuda( unbucketize_permute.data_ptr(), batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_concat + .data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_offsets + .data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? indices_to_lb.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -333,6 +412,17 @@ block_bucketize_sparse_features_cuda( unbucketize_permute.data_ptr(), batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_concat + .data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_offsets + .data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? 
indices_to_lb.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -374,6 +464,15 @@ block_bucketize_sparse_features_cuda( unbucketize_permute.data_ptr(), batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_concat.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_offsets.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? indices_to_lb.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -413,6 +512,15 @@ block_bucketize_sparse_features_cuda( unbucketize_permute.data_ptr(), batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_concat.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_offsets.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? indices_to_lb.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -462,6 +570,17 @@ block_bucketize_sparse_features_cuda( nullptr, batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_concat + .data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_offsets + .data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? indices_to_lb.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -509,6 +628,17 @@ block_bucketize_sparse_features_cuda( nullptr, batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_concat + .data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_offsets + .data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? indices_to_lb.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -550,6 +680,15 @@ block_bucketize_sparse_features_cuda( nullptr, batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_concat.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_offsets.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? indices_to_lb.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -589,6 +728,15 @@ block_bucketize_sparse_features_cuda( nullptr, batch_size_per_feature.has_value() ? length_to_feature_idx.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_concat.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? block_bucketize_pos_offsets.data_ptr() + : static_cast(nullptr), + block_bucketize_pos.has_value() + ? 
indices_to_lb.data_ptr() : static_cast(nullptr)); C10_CUDA_KERNEL_LAUNCH_CHECK(); });
diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py
index 27730f418b..d06b7988d0 100644
--- a/fbgemm_gpu/test/sparse_ops_test.py
+++ b/fbgemm_gpu/test/sparse_ops_test.py
@@ -1052,6 +1052,39 @@ def test_block_bucketize_sparse_features_with_block_bucketize_pos(
         if has_weight:
             torch.testing.assert_close(new_weights_cpu, new_weights_ref)

+        if gpu_available:
+            block_bucketize_pos = [
+                torch.tensor([0, 2, 8], dtype=index_type, device="cuda"),
+                torch.tensor([0, 5, 10], dtype=index_type, device="cuda"),
+                torch.tensor([0, 7, 12], dtype=index_type, device="cuda"),
+            ]
+            (
+                new_lengths_gpu,
+                new_indices_gpu,
+                new_weights_gpu,
+                new_pos_gpu,
+                unbucketize_permute,
+            ) = torch.ops.fbgemm.block_bucketize_sparse_features(
+                lengths.cuda(),
+                indices.cuda(),
+                bucketize_pos,
+                sequence,
+                block_sizes.cuda(),
+                my_size,
+                weights.cuda() if weights is not None else None,
+                batch_sizes.cuda(),
+                max_B,
+                block_bucketize_pos,
+            )
+            torch.testing.assert_close(
+                new_lengths_gpu.cpu(), new_lengths_ref, rtol=0, atol=0
+            )
+            torch.testing.assert_close(
+                new_indices_gpu.cpu(), new_indices_ref, rtol=0, atol=0
+            )
+            if has_weight:
+                torch.testing.assert_close(new_weights_gpu.cpu(), new_weights_ref)
+
     @given(
         index_type=st.sampled_from([torch.int, torch.long]),
         has_weight=st.booleans(),
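To make the new behavior concrete, a minimal sketch of the call exercised
above (hypothetical values; assumes a CUDA build of fbgemm_gpu; each
per-feature boundary list defines half-open bucket ranges):

    import torch

    # One feature (T=1), two samples (B=2), my_size=2 ranks.
    # Boundaries [0, 2, 8]: bucket 0 covers [0, 2), bucket 1 covers [2, 8).
    lengths = torch.tensor([2, 1], dtype=torch.long, device="cuda")
    indices = torch.tensor([1, 5, 3], dtype=torch.long, device="cuda")
    block_sizes = torch.tensor([4], dtype=torch.long, device="cuda")  # bypassed when boundaries are given
    block_bucketize_pos = [torch.tensor([0, 2, 8], dtype=torch.long, device="cuda")]

    new_lengths, new_indices, _, _, _ = torch.ops.fbgemm.block_bucketize_sparse_features(
        lengths, indices, False, True, block_sizes, 2, None, None, -1,
        block_bucketize_pos,
    )
    # Expected new_lengths: [1, 0, 1, 1] -- index 1 falls in bucket 0; 5 and 3 in bucket 1.
    # Expected new_indices: [1, 3, 1] -- local index = raw index minus its bucket's lower bound.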
From 88fc6e741bc03e09fcdc3cd365fa3aafddb7ec24 Mon Sep 17 00:00:00 2001
From: Qiang Zhang
Date: Fri, 1 Dec 2023 12:16:52 -0800
Subject: [PATCH 72/73] Fix block_bucketize_features with variable bucket size when using torch.int64 for block_bucketize_pos (#2188)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2188

When `block_bucketize_pos` used torch.int64, it would fail the TorchRec uneven sharding test. This fix resolves that issue.

Reviewed By: sryap

Differential Revision: D51713675

fbshipit-source-id: 240e836587d89ed118a9d24e9bf4b267031333b8
---
 fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
index a6dccc0fee..ae17a393c5 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
+++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
@@ -355,7 +355,7 @@ void _block_bucketize_sparse_features_cpu(
       if (variable_bucket_sizes) {
         int64_t lb = std::upper_bound(
                          bucketize_offset,
-                         bucketize_offset + bucket_size,
+                         bucketize_offset + static_cast(bucket_size),
                          indices_data[i]) -
             bucketize_offset - 1;
         lower_bounds[i] = lb;

From dbc3157bf256f1339b3fa1fef2be89ac4078be0e Mon Sep 17 00:00:00 2001
From: Qiang Zhang
Date: Mon, 4 Dec 2023 14:00:48 -0800
Subject: [PATCH 73/73] Benchmark block_bucketize_sparse_features uneven sharding for GPU (#2169)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2169

CPU implementation: D51288847

Reviewed By: tissue3

Differential Revision: D51533599

fbshipit-source-id: cf9c7fcbe7043f385e97901a95916287c9f618a5
---
 fbgemm_gpu/bench/sparse_ops_benchmark.py | 27 +++++++++++++++++-------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/fbgemm_gpu/bench/sparse_ops_benchmark.py b/fbgemm_gpu/bench/sparse_ops_benchmark.py
index afca7d4678..a578f3f40a 100644
--- a/fbgemm_gpu/bench/sparse_ops_benchmark.py
+++ b/fbgemm_gpu/bench/sparse_ops_benchmark.py
@@ -874,9 +874,11 @@ def ben(fn, name, ad_indices, ad_lengths, batch_offsets, num_ads_in_batch):
 @click.option("--batch-size", default=4096)
 @click.option("--bucket-num", default=16)
 @click.option("--input-precision", type=str, default="long")
+@click.option("--device", type=click.Choice(["cpu", "cuda"]), default="cpu")
 def block_bucketize_sparse_features_bench(
-    row_size: int, batch_size: int, bucket_num: int, input_precision: str
+    row_size: int, batch_size: int, bucket_num: int, input_precision: str, device: str
 ) -> None:
+    dtype = torch.int
     if input_precision == "int":
         dtype = torch.int
@@ -900,25 +902,34 @@ def block_bucketize_sparse_features_bench(
     block_sizes = torch.tensor([bucket_size] * lengths.numel(), dtype=dtype)
     bucket_pos = [j * bucket_size for j in range(bucket_num + 1)]
-    block_bucketize_pos = [torch.tensor(bucket_pos)] * lengths.numel()
+    block_bucketize_pos = [torch.tensor(bucket_pos, device=device)] * lengths.numel()
     test_param = {"uneven": block_bucketize_pos, "even": None}
+    print(f"device {device}")
     for name, is_block_bucketize_pos in test_param.items():
         time, output = benchmark_torch_function(
             torch.ops.fbgemm.block_bucketize_sparse_features,
             (
-                lengths,
-                indices,
+                lengths if device == "cpu" else lengths.to(device),
+                indices if device == "cpu" else indices.to(device),
                 False,
                 True,
-                block_sizes,
+                block_sizes if device == "cpu" else block_sizes.to(device),
                 bucket_num,
-                weights,
+                weights
+                if device == "cpu"
+                else (weights.to(device) if weights is not None else None),
                 None,
                 -1,  # unused
-                is_block_bucketize_pos,
+                is_block_bucketize_pos
+                if device == "cpu"
+                else (
+                    [i.to(device) for i in is_block_bucketize_pos]
+                    if is_block_bucketize_pos is not None
+                    else None
+                ),
             ),
             iters=100,
-            device="cpu",
+            device=device,
         )
         num_bytes = 0
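        # A hypothetical invocation of the extended benchmark (flag names are
        # the click options defined above; the script path and subcommand
        # registration are assumptions, not shown in this diff):
        #
        #   python fbgemm_gpu/bench/sparse_ops_benchmark.py \
        #       block_bucketize_sparse_features_bench \
        #       --batch-size 4096 --bucket-num 16 \
        #       --input-precision long --device cuda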