From 33033b34d66aad5bec6636ba80c7852274297346 Mon Sep 17 00:00:00 2001
From: Jing Shan
Date: Thu, 30 May 2024 10:38:31 -0700
Subject: [PATCH] Add a check on the grid size before launching CUDA kernels (#2639)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2639

We had errors like P1382114867:

```
c10::Error: CUDA error: invalid configuration argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Device-side assertion tracking was not enabled by user.
Exception raised from c10_cuda_check_implementation at fbcode/caffe2/c10/cuda/CUDAException.cpp:43 (most recent call first):
# 7 c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
# 8 c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool)
# 9 fbgemm_gpu::reorder_batched_ad_lengths_gpu(at::Tensor const&, at::Tensor const&, long, bool, long)
```

`CUDA error: invalid configuration argument` usually means that the configuration used to launch the CUDA kernel is invalid (see https://fburl.com/wiki/cssgu0vs), but the message alone is not very helpful for debugging. This diff adds a check on the grid size before launching the kernel and includes the variables that determine the grid size in the check message, so that the error and its cause can be identified quickly.

Reviewed By: sryap, gnahzg

Differential Revision: D57932413

fbshipit-source-id: d558a3eb2f0d7404ff5cda2eb9a80f8ffd17471d
---
 .../src/sparse_ops/sparse_reorder_batched_ad.cu | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu b/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu
index f32301e47..6239d914f 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu
@@ -88,8 +88,18 @@ DLL_PUBLIC Tensor reorder_batched_ad_lengths_gpu(
           ? at::empty({T * num_ads_in_batch}, cat_ad_lengths.options())
           : at::empty_like(cat_ad_lengths);
 
+  const int64_t grid_size = (B * T + 32 - 1) / 32;
+  TORCH_CHECK(
+      grid_size > 0,
+      "grid_size must be positive, got ",
+      grid_size,
+      " where B = ",
+      B,
+      " and T = ",
+      T);
+
   const dim3 threads(32, 32);
-  const dim3 blocks((B * T + 32 - 1) / 32);
+  const dim3 blocks(grid_size);
 
   FBGEMM_DISPATCH_ALL_TYPES(
       cat_ad_lengths.scalar_type(),
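
As a minimal standalone sketch, not part of the patch above, the snippet below illustrates the failure mode the new `TORCH_CHECK` guards against: CUDA rejects any kernel launch whose grid dimension is zero with `cudaErrorInvalidConfiguration`, the "invalid configuration argument" seen in the trace, without reporting the shapes involved. The no-op kernel and the values of `B` and `T` here are hypothetical.

```
// Hedged sketch, not FBGEMM code: reproduces the failure mode that the
// TORCH_CHECK added in this diff catches before the kernel launch.
#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for the real reorder kernel; only the launch configuration matters.
__global__ void noop_kernel() {}

int main() {
  // Hypothetical shapes: an empty batch (B == 0) drives the grid size to 0.
  const long long B = 0;
  const long long T = 16;
  const long long grid_size = (B * T + 32 - 1) / 32; // same formula as the diff

  if (grid_size <= 0) {
    // Host-side equivalent of the new TORCH_CHECK: fail before the launch
    // and report the inputs that produced the bad grid size.
    std::fprintf(
        stderr,
        "grid_size must be positive, got %lld where B = %lld and T = %lld\n",
        grid_size, B, T);
    return 1;
  }

  // Without the check, this launch fails with cudaErrorInvalidConfiguration
  // ("invalid configuration argument") and no hint about B or T.
  noop_kernel<<<dim3(static_cast<unsigned int>(grid_size)), dim3(32, 32)>>>();
  const cudaError_t err = cudaGetLastError();
  std::printf("launch status: %s\n", cudaGetErrorString(err));
  return err == cudaSuccess ? 0 : 1;
}
```

Validating the computed grid size on the host, as the diff does, turns an opaque asynchronous launch failure into an immediate, actionable message that names `B` and `T`.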