diff --git a/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu b/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu
index bb88dee3ce..f32301e47f 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu
@@ -37,11 +37,11 @@ __global__ __launch_bounds__(kMaxThreads) void reorder_batched_ad_lengths_kernel(
     // reorder lengths from (ragged) [B x T x #num_ads_b)] to
     // [T][B][#num_ads_b], i.e. [T][sum(#num_ads_b)].
-    const at::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
         cat_ad_lengths,
-    const at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
         batch_offsets,
-    at::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
+    pta::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
         reordered_cat_ad_lengths,
     const int32_t T,
     const bool broadcast_lengths) {
@@ -95,14 +95,15 @@ DLL_PUBLIC Tensor reorder_batched_ad_lengths_gpu(
       cat_ad_lengths.scalar_type(),
       "reorder_batched_ad_lengths_gpu_kernel",
       [&] {
+#ifdef FBGEMM_GPU_MEMCHECK
+        const auto func_name = "reorder_batched_ad_lengths_kernel";
+#endif
         reorder_batched_ad_lengths_kernel<scalar_t>
             <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
-                cat_ad_lengths
-                    .packed_accessor32<scalar_t, 1, at::RestrictPtrTraits>(),
-                batch_offsets
-                    .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>(),
-                reordered_cat_ad_lengths
-                    .packed_accessor32<scalar_t, 1, at::RestrictPtrTraits>(),
+                MAKE_PTA_WITH_NAME(func_name, cat_ad_lengths, scalar_t, 1, 32),
+                MAKE_PTA_WITH_NAME(func_name, batch_offsets, int32_t, 1, 32),
+                MAKE_PTA_WITH_NAME(
+                    func_name, reordered_cat_ad_lengths, scalar_t, 1, 32),
                 T,
                 broadcast_lengths);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
@@ -112,11 +113,11 @@ DLL_PUBLIC Tensor reorder_batched_ad_lengths_gpu(
 
 template <typename index_t, typename scalar_t>
 __global__ __launch_bounds__(kMaxThreads) void narrow_broadcast_indices_kernel(
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
         cat_ad_offsets,
-    const at::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
         cat_ad_indices,
-    at::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
+    pta::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
         reordered_cat_ad_indices,
     const int num_ads_in_batch,
     const int reordered_cat_ad_batches) {
@@ -139,15 +140,15 @@ __global__ __launch_bounds__(kMaxThreads) void narrow_broadcast_indices_kernel(
 template <typename index_t, typename scalar_t>
 __global__
 __launch_bounds__(kMaxThreads) void narrow_batched_broadcast_indices_kernel(
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
         cat_ad_offsets,
-    const at::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
         cat_ad_indices,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
         reordered_cat_ad_offsets,
-    at::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
+    pta::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
         reordered_cat_ad_indices,
-    const at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
         batch_offsets,
     const int32_t T) {
   const auto B = batch_offsets.size(0) - 1;
@@ -196,15 +197,15 @@ __launch_bounds__(kMaxThreads) void reorder_batched_ad_indices_kernel(
     // if broadcast_indices is enabled, all the indices will be copies of the
     // first batch of the cat_ad_indices, this is useful for request-only
     // broadcast
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
         cat_ad_offsets,
-    const at::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
         cat_ad_indices,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
         reordered_cat_ad_offsets,
-    at::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
+    pta::PackedTensorAccessor32<scalar_t, 1, at::RestrictPtrTraits>
         reordered_cat_ad_indices,
-    const at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
         batch_offsets,
     const int32_t T,
     const bool broadcast_indices) {
@@ -291,23 +292,24 @@ DLL_PUBLIC Tensor reorder_batched_ad_indices_gpu(
           cat_ad_offsets.scalar_type(),
           "narrow_broadcast_indices_kernel_2",
           [&] {
+#ifdef FBGEMM_GPU_MEMCHECK
+            const auto func_name = "narrow_broadcast_indices_kernel";
+#endif
             narrow_broadcast_indices_kernel<index_t, scalar_t>
                 <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
-                    cat_ad_offsets.packed_accessor32<
-                        index_t,
-                        1,
-                        at::RestrictPtrTraits>(),
-                    cat_ad_indices.packed_accessor32<
+                    MAKE_PTA_WITH_NAME(
+                        func_name, cat_ad_offsets, index_t, 1, 32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name, cat_ad_indices, scalar_t, 1, 32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name,
+                        reordered_cat_ad_indices,
                         scalar_t,
                         1,
-                        at::RestrictPtrTraits>(),
-                    reordered_cat_ad_indices.packed_accessor32<
-                        scalar_t,
-                        1,
-                        at::RestrictPtrTraits>(),
+                        32),
                     num_ads_in_batch,
                     reordered_cat_ad_offsets.numel() - 1);
             C10_CUDA_KERNEL_LAUNCH_CHECK();
@@ -329,31 +331,33 @@ DLL_PUBLIC Tensor reorder_batched_ad_indices_gpu(
           cat_ad_offsets.scalar_type(),
           "narrow_batched_broadcast_indices_kernel_2",
           [&] {
+#ifdef FBGEMM_GPU_MEMCHECK
+            const auto func_name =
+                "narrow_batched_broadcast_indices_kernel";
+#endif
             narrow_batched_broadcast_indices_kernel<index_t, scalar_t>
                 <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
-                    cat_ad_offsets.packed_accessor32<
+                    MAKE_PTA_WITH_NAME(
+                        func_name, cat_ad_offsets, index_t, 1, 32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name, cat_ad_indices, scalar_t, 1, 32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name,
+                        reordered_cat_ad_offsets,
                         index_t,
                         1,
-                        at::RestrictPtrTraits>(),
-                    cat_ad_indices.packed_accessor32<
+                        32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name,
+                        reordered_cat_ad_indices,
                         scalar_t,
                         1,
-                        at::RestrictPtrTraits>(),
-                    reordered_cat_ad_offsets.packed_accessor32<
-                        index_t,
-                        1,
-                        at::RestrictPtrTraits>(),
-                    reordered_cat_ad_indices.packed_accessor32<
-                        scalar_t,
-                        1,
-                        at::RestrictPtrTraits>(),
-                    batch_offsets.packed_accessor32<
-                        int32_t,
-                        1,
-                        at::RestrictPtrTraits>(),
+                        32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name, batch_offsets, int32_t, 1, 32),
                     T);
             C10_CUDA_KERNEL_LAUNCH_CHECK();
           });
@@ -374,23 +378,23 @@ DLL_PUBLIC Tensor reorder_batched_ad_indices_gpu(
           cat_ad_offsets.scalar_type(),
           "reorder_batched_ad_indices_gpu_kernel_2",
           [&] {
-            reorder_batched_ad_indices_kernel<index_t, scalar_t><<<
-                blocks,
-                threads,
-                0,
-                at::cuda::getCurrentCUDAStream()>>>(
-                cat_ad_offsets
-                    .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                cat_ad_indices
-                    .packed_accessor32<scalar_t, 1, at::RestrictPtrTraits>(),
-                reordered_cat_ad_offsets
-                    .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                reordered_cat_ad_indices
-                    .packed_accessor32<scalar_t, 1, at::RestrictPtrTraits>(),
-                batch_offsets
-                    .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>(),
-                T,
-                broadcast_indices);
+#ifdef FBGEMM_GPU_MEMCHECK
+            const auto func_name = "reorder_batched_ad_indices_kernel";
+#endif
+            reorder_batched_ad_indices_kernel<index_t, scalar_t>
+                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+                    MAKE_PTA_WITH_NAME(
+                        func_name, cat_ad_offsets, index_t, 1, 32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name, cat_ad_indices, scalar_t, 1, 32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name, reordered_cat_ad_offsets, index_t, 1, 32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name, reordered_cat_ad_indices, scalar_t, 1, 32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name, batch_offsets, int32_t, 1, 32),
+                    T,
+                    broadcast_indices);
             C10_CUDA_KERNEL_LAUNCH_CHECK();
           });
       });
@@ -403,15 +407,15 @@ __launch_bounds__(kMaxThreads) void reorder_batched_sequence_embeddings_kernel(
     // reorder embeddings from (ragged) [B x T x #num_ads_B_{i} x length_{B_{i},
    // t, a})x D] to [T][B][#num_ads_b][length_{b, t, a}][D], i.e.
     // [sum(length_{B_{i}, t, a}), D]
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
         cat_sequence_embeddings_offsets,
-    const at::PackedTensorAccessor32<scalar_t, 2, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<scalar_t, 2, at::RestrictPtrTraits>
         cat_sequence_embeddings,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
         reordered_cat_sequence_embeddings_offsets,
-    at::PackedTensorAccessor32<scalar_t, 2, at::RestrictPtrTraits>
+    pta::PackedTensorAccessor32<scalar_t, 2, at::RestrictPtrTraits>
         reordered_cat_sequence_embeddings,
-    const at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
         batch_offsets,
     const int32_t T,
     const int32_t D) {
@@ -485,23 +489,40 @@ DLL_PUBLIC Tensor reorder_batched_sequence_embeddings_gpu(
           cat_sequence_embeddings_offsets.scalar_type(),
           "reorder_batched_sequence_embeddings_gpu_kernel_2",
           [&] {
-            reorder_batched_sequence_embeddings_kernel<index_t, scalar_t><<<
-                blocks,
-                threads,
-                0,
-                at::cuda::getCurrentCUDAStream()>>>(
-                cat_sequence_embeddings_offsets
-                    .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                cat_sequence_embeddings_contig
-                    ->packed_accessor32<scalar_t, 2, at::RestrictPtrTraits>(),
-                reordered_cat_sequence_embeddings_offsets
-                    .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                reordered_cat_sequence_embeddings
-                    .packed_accessor32<scalar_t, 2, at::RestrictPtrTraits>(),
-                batch_offsets
-                    .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>(),
-                T,
-                D);
+#ifdef FBGEMM_GPU_MEMCHECK
+            const auto func_name =
+                "reorder_batched_sequence_embeddings_kernel";
+#endif
+            reorder_batched_sequence_embeddings_kernel<index_t, scalar_t>
+                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+                    MAKE_PTA_WITH_NAME(
+                        func_name,
+                        cat_sequence_embeddings_offsets,
+                        index_t,
+                        1,
+                        32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name,
+                        (*cat_sequence_embeddings_contig),
+                        scalar_t,
+                        2,
+                        32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name,
+                        reordered_cat_sequence_embeddings_offsets,
+                        index_t,
+                        1,
+                        32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name,
+                        reordered_cat_sequence_embeddings,
+                        scalar_t,
+                        2,
+                        32),
+                    MAKE_PTA_WITH_NAME(
+                        func_name, batch_offsets, int32_t, 1, 32),
+                    T,
+                    D);
             C10_CUDA_KERNEL_LAUNCH_CHECK();
           });
       });
diff --git a/fbgemm_gpu/test/sparse/common.py b/fbgemm_gpu/test/sparse/common.py
index c067cfa8d7..5a3fba27c8 100644
--- a/fbgemm_gpu/test/sparse/common.py
+++ b/fbgemm_gpu/test/sparse/common.py
@@ -127,11 +127,14 @@ def extend_test_class(
         "", os.path.dirname(__file__), "failures_dict.json"
     )
 
-    additional_decorators = (additional_decorators or {}) | {
-        "test_pt2_compliant_tag_fbgemm_permute_2D_sparse_data": [
-            # This operator has been grandfathered in. We need to fix this test failure.
-            unittest.expectedFailure,
-        ],
+    additional_decorators = {
+        **(additional_decorators or {}),
+        **{
+            "test_pt2_compliant_tag_fbgemm_permute_2D_sparse_data": [
+                # This operator has been grandfathered in. We need to fix this test failure.
+                unittest.expectedFailure,
+            ]
+        },
     }
 
     # Only generate tests for PyTorch 2.2+
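
Note on the reordering these kernels perform: the comment in
reorder_batched_ad_lengths_kernel describes taking lengths laid out per batch b
as a [T x #num_ads_b] block (concatenated over batches) and rewriting them so
that, for each table t, the ads of all batches are contiguous, i.e.
[T][sum(#num_ads_b)]. Below is a minimal pure-Python sketch of that layout
change; it only illustrates the indexing and is not FBGEMM's implementation.
The function name, list-based layout, and num_ads_per_batch parameter are
assumptions made for the example.

    from typing import List

    def reorder_batched_ad_lengths_ref(
        cat_ad_lengths: List[int],
        num_ads_per_batch: List[int],
        T: int,
    ) -> List[int]:
        # batch_offsets[b] = number of ads in batches 0..b-1 (prefix sum),
        # mirroring the batch_offsets tensor passed to the CUDA kernel.
        B = len(num_ads_per_batch)
        batch_offsets = [0] * (B + 1)
        for b in range(B):
            batch_offsets[b + 1] = batch_offsets[b] + num_ads_per_batch[b]
        total_ads = batch_offsets[B]

        out = [0] * (T * total_ads)
        in_pos = 0
        for b in range(B):
            for t in range(T):
                for a in range(num_ads_per_batch[b]):
                    # Source is batch-major [b][t][a]; destination is
                    # table-major [t][batch_offsets[b] + a].
                    out[t * total_ads + batch_offsets[b] + a] = cat_ad_lengths[in_pos]
                    in_pos += 1
        return out

    # Two batches with 2 and 1 ads over T = 2 tables:
    # batch 0 contributes [[1, 2], [3, 4]], batch 1 contributes [[5], [6]].
    print(reorder_batched_ad_lengths_ref([1, 2, 3, 4, 5, 6], [2, 1], T=2))
    # -> [1, 2, 5, 3, 4, 6]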
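
Note on the fbgemm_gpu/test/sparse/common.py hunk: the removed line merges the
dictionaries with the dict-union operator from PEP 584, which requires Python
3.9 or newer; the replacement spells the same merge with **-unpacking, which
also runs on older interpreters. A small stand-alone illustration (the
dictionaries here are made up for the example):

    defaults = {"test_a": ["dec1"], "test_b": ["dec2"]}
    overrides = {"test_b": ["dec3"]}

    # **-unpacking merge: later entries win, same as `defaults | overrides`.
    merged = {**defaults, **overrides}
    print(merged)  # {'test_a': ['dec1'], 'test_b': ['dec3']}

    import sys
    if sys.version_info >= (3, 9):
        assert (defaults | overrides) == merged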