From 7af4de11f43c3dfc12dac35dedd27db0b4a798a1 Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Mon, 30 Sep 2024 17:01:36 -0700
Subject: [PATCH] Add support for int64_t indices and offsets in TBE inference
 [6/N] (#3182)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/3182

X-link: https://github.com/facebookresearch/FBGEMM/pull/278

- Convert `PrunedMapCPU::lookup` to use `index_t`, so that the map lookup
  works with both int32_t and int64_t indices and offsets (see the sketch
  below)
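The conversion follows the standard ATen dispatch pattern: the lookup body
is wrapped in `AT_DISPATCH_INDEX_TYPES`, which instantiates the lambda once
per supported index type and binds `index_t` to `int32_t` or `int64_t`
according to the tensor's scalar type. A minimal standalone sketch of the
pattern (illustrative only, not part of this diff; `copy_indices` is a
hypothetical function name):

    #include <ATen/ATen.h>
    #include <ATen/Dispatch.h>

    // Inside the dispatched lambda, index_t resolves to int32_t for
    // at::kInt inputs and to int64_t for at::kLong inputs.
    at::Tensor copy_indices(const at::Tensor& indices) {
      auto out = at::empty_like(indices);
      AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "copy_indices", [&] {
        const auto* src = indices.data_ptr<index_t>();
        auto* dst = out.data_ptr<index_t>();
        for (int64_t i = 0; i < indices.numel(); ++i) {
          dst[i] = src[i]; // stands in for the per-element map lookup
        }
      });
      return out;
    }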
Differential Revision: D62602764
---
 .../embedding_forward_quantized_host_cpu.cpp  | 39 ++++++++++++-------
 .../codegen/utils/embedding_bounds_check.cu   | 38 +++++++++---------
 .../utils/embedding_bounds_check_host_cpu.cpp |  2 +-
 3 files changed, 45 insertions(+), 34 deletions(-)

diff --git a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp
index 41fd137dd..b6f55b961 100644
--- a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp
+++ b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp
@@ -21,6 +21,7 @@
 #include "fbgemm_gpu/embedding_common.h"
 #include "fbgemm_gpu/utils/dispatch_macros.h"
 #include "fbgemm_gpu/utils/ops_utils.h"
+#include "fbgemm_gpu/utils/tensor_utils.h"
 
 using Tensor = at::Tensor;
 using namespace fbgemm_gpu;
@@ -374,29 +375,37 @@ class PrunedMapCPU : public torch::jit::CustomClassHolder {
   }
 
   Tensor lookup(Tensor indices, Tensor offsets) const {
+    TENSORS_HAVE_SAME_SCALAR_TYPE(indices, offsets);
+
     int32_t T = maps_.size();
     TORCH_CHECK(T > 0);
     int32_t B = (offsets.size(0) - 1) / T;
     TORCH_CHECK(B > 0);
     TORCH_CHECK(maps_.size() == T);
+
     auto dense_indices = empty_like(indices);
-    const auto* indices_acc = indices.data_ptr<int32_t>();
-    auto* dense_indices_acc = dense_indices.data_ptr<int32_t>();
-    const auto* offsets_acc = offsets.data_ptr<int32_t>();
-    for (const auto t : c10::irange(T)) {
-      auto& map = maps_[t];
-      for (const auto b : c10::irange(B)) {
-        int32_t indices_start = offsets_acc[t * B + b];
-        int32_t indices_end = offsets_acc[t * B + b + 1];
-        int32_t L = indices_end - indices_start;
-        for (const auto l : c10::irange(L)) {
-          int32_t slot_sparse_index = indices_acc[indices_start + l];
-          auto it = map.find(slot_sparse_index);
-          dense_indices_acc[indices_start + l] =
-              it != map.end() ? it->second : -1;
+
+    AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "PrunedMapCPU::lookup", [&] {
+      const auto* indices_acc = indices.data_ptr<index_t>();
+      auto* dense_indices_acc = dense_indices.data_ptr<index_t>();
+      const auto* offsets_acc = offsets.data_ptr<index_t>();
+
+      for (const auto t : c10::irange(T)) {
+        auto& map = maps_[t];
+        for (const auto b : c10::irange(B)) {
+          const auto indices_start = offsets_acc[t * B + b];
+          const auto indices_end = offsets_acc[t * B + b + 1];
+          const auto L = indices_end - indices_start;
+          for (const auto l : c10::irange(L)) {
+            const auto slot_sparse_index = indices_acc[indices_start + l];
+            const auto it = map.find(slot_sparse_index);
+            dense_indices_acc[indices_start + l] =
+                it != map.end() ? it->second : -1;
+          }
         }
       }
-    }
+    });
+
     return dense_indices;
   }
 
diff --git a/fbgemm_gpu/codegen/utils/embedding_bounds_check.cu b/fbgemm_gpu/codegen/utils/embedding_bounds_check.cu
index 08e22baa9..8d8ee6ab5 100644
--- a/fbgemm_gpu/codegen/utils/embedding_bounds_check.cu
+++ b/fbgemm_gpu/codegen/utils/embedding_bounds_check.cu
@@ -233,22 +233,24 @@ void bounds_check_indices_cuda(
   constexpr size_t kNumThreads = 256;
   const auto max_B_ = vbe ? max_B : B;
 
-  AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "bounds_check_indices", [&] {
-    const auto bounds_check_kernel =
-        (vbe ? bounds_check_indices_kernel<index_t, true>
-             : bounds_check_indices_kernel<index_t, false>);
-    TORCH_DSA_KERNEL_LAUNCH(
-        bounds_check_kernel,
-        div_round_up(max_B_ * T, kNumThreads / fbgemm_gpu::kWarpSize),
-        dim3(fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize),
-        0,
-        at::cuda::getCurrentCUDAStream(),
-        rows_per_table.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
-        indices.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-        offsets.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-        vbe ? B_offsets.value().data_ptr<int32_t>() : nullptr,
-        bounds_check_mode_,
-        warning.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
-        FixedDivisor<int32_t>(max_B_));
-  });
+  AT_DISPATCH_INDEX_TYPES(
+      indices.scalar_type(), "bounds_check_indices_cuda", [&] {
+        const auto bounds_check_kernel =
+            (vbe ? bounds_check_indices_kernel<index_t, true>
+                 : bounds_check_indices_kernel<index_t, false>);
+        TORCH_DSA_KERNEL_LAUNCH(
+            bounds_check_kernel,
+            div_round_up(max_B_ * T, kNumThreads / fbgemm_gpu::kWarpSize),
+            dim3(fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize),
+            0,
+            at::cuda::getCurrentCUDAStream(),
+            rows_per_table
+                .packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
+            indices.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+            offsets.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+            vbe ? B_offsets.value().data_ptr<int32_t>() : nullptr,
+            bounds_check_mode_,
+            warning.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
+            FixedDivisor<int32_t>(max_B_));
+      });
 }
diff --git a/fbgemm_gpu/codegen/utils/embedding_bounds_check_host_cpu.cpp b/fbgemm_gpu/codegen/utils/embedding_bounds_check_host_cpu.cpp
index 1098378d0..1d0cd1348 100644
--- a/fbgemm_gpu/codegen/utils/embedding_bounds_check_host_cpu.cpp
+++ b/fbgemm_gpu/codegen/utils/embedding_bounds_check_host_cpu.cpp
@@ -70,7 +70,7 @@ void bounds_check_indices_cpu(
   const auto rows_per_table_acc = rows_per_table.accessor<int64_t, 1>();
   auto warning_acc = warning.data_ptr<int64_t>();
 
-  AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "bounds_check_indices", [&] {
+  AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "bounds_check_indices_cpu", [&] {
     auto offsets_acc = offsets.accessor<index_t, 1>();
     auto indices_acc = indices.accessor<index_t, 1>();
     auto num_indices = indices.numel();
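Note: `TENSORS_HAVE_SAME_SCALAR_TYPE` comes from the newly included
`fbgemm_gpu/utils/tensor_utils.h`. Conceptually (a sketch of the intent,
not the macro's actual definition), the guard behaves like the following
hypothetical helper:

    #include <ATen/ATen.h>
    #include <c10/util/Exception.h>

    // Reject mixed index/offset dtypes up front, before the dispatch
    // binds a single index_t that must fit both tensors.
    void check_same_scalar_type(
        const at::Tensor& indices,
        const at::Tensor& offsets) {
      TORCH_CHECK(
          indices.scalar_type() == offsets.scalar_type(),
          "indices and offsets must have the same scalar type");
    }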