From 33033b34d66aad5bec6636ba80c7852274297346 Mon Sep 17 00:00:00 2001
From: Jing Shan
Date: Thu, 30 May 2024 10:38:31 -0700
Subject: [PATCH] Add a check on the grid size before launching CUDA kernels (#2639)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2639

We had errors like P1382114867:

```
c10::Error: CUDA error: invalid configuration argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Device-side assertion tracking was not enabled by user.
Exception raised from c10_cuda_check_implementation at fbcode/caffe2/c10/cuda/CUDAException.cpp:43 (most recent call first):
# 7 c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
# 8 c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool)
# 9 fbgemm_gpu::reorder_batched_ad_lengths_gpu(at::Tensor const&, at::Tensor const&, long, bool, long)
```

`CUDA error: invalid configuration argument` usually means that the configuration used to launch the CUDA kernel is invalid (see https://fburl.com/wiki/cssgu0vs), but the message alone is not very helpful for debugging. This diff adds a check on the grid size before launching the kernel and includes the variables that determine the grid size in the check message, so that the error and its cause can be identified quickly.

Reviewed By: sryap, gnahzg

Differential Revision: D57932413

fbshipit-source-id: d558a3eb2f0d7404ff5cda2eb9a80f8ffd17471d
---
 .../src/sparse_ops/sparse_reorder_batched_ad.cu | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu b/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu
index f32301e47..6239d914f 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu
@@ -88,8 +88,18 @@ DLL_PUBLIC Tensor reorder_batched_ad_lengths_gpu(
           ? at::empty({T * num_ads_in_batch}, cat_ad_lengths.options())
           : at::empty_like(cat_ad_lengths);
 
+  const int64_t grid_size = (B * T + 32 - 1) / 32;
+  TORCH_CHECK(
+      grid_size > 0,
+      "grid_size must be positive, got ",
+      grid_size,
+      " where B = ",
+      B,
+      " and T = ",
+      T);
+
   const dim3 threads(32, 32);
-  const dim3 blocks((B * T + 32 - 1) / 32);
+  const dim3 blocks(grid_size);
 
   FBGEMM_DISPATCH_ALL_TYPES(
       cat_ad_lengths.scalar_type(),
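
As a minimal standalone sketch, not part of the patch above, the snippet below illustrates the failure mode the new `TORCH_CHECK` guards against: CUDA rejects any kernel launch whose grid dimension is zero with `cudaErrorInvalidConfiguration`, the "invalid configuration argument" seen in the trace, without reporting the shapes involved. The no-op kernel and the values of `B` and `T` here are hypothetical.

```
// Hedged sketch, not FBGEMM code: reproduces the failure mode that the
// TORCH_CHECK added in this diff catches before the kernel launch.
#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for the real reorder kernel; only the launch configuration matters.
__global__ void noop_kernel() {}

int main() {
  // Hypothetical shapes: an empty batch (B == 0) drives the grid size to 0.
  const long long B = 0;
  const long long T = 16;
  const long long grid_size = (B * T + 32 - 1) / 32; // same formula as the diff

  if (grid_size <= 0) {
    // Host-side equivalent of the new TORCH_CHECK: fail before the launch
    // and report the inputs that produced the bad grid size.
    std::fprintf(
        stderr,
        "grid_size must be positive, got %lld where B = %lld and T = %lld\n",
        grid_size, B, T);
    return 1;
  }

  // Without the check, this launch fails with cudaErrorInvalidConfiguration
  // ("invalid configuration argument") and no hint about B or T.
  noop_kernel<<<dim3(static_cast<unsigned int>(grid_size)), dim3(32, 32)>>>();
  const cudaError_t err = cudaGetLastError();
  std::printf("launch status: %s\n", cudaGetErrorString(err));
  return err == cudaSuccess ? 0 : 1;
}
```

Validating the computed grid size on the host, as the diff does, turns an opaque asynchronous launch failure into an immediate, actionable message that names `B` and `T`.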