From 0ac664b0e2b1128816be1b78f9f66346f6dec36f Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 23 May 2024 16:36:59 -0700 Subject: [PATCH] Add NCCL integration (#2624) Summary: - Add example triton test to exercise triton integration - Add NCCL integration Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2624 Reviewed By: sryap, jianyuh Differential Revision: D57741140 Pulled By: q10 fbshipit-source-id: 3529139b907893d92231902b3ae34d4914142b6b --- .github/scripts/fbgemm_gpu_build.bash | 32 +++++--- cmake/modules/CudaSetup.cmake | 7 ++ fbgemm_gpu/CMakeLists.txt | 4 +- fbgemm_gpu/FbgemmGpu.cmake | 13 +++- .../experimental/example/CMakeLists.txt | 17 ++++- .../experimental/example/src/nccl_example.cpp | 23 ++++++ .../example/test/triton_example_test.py | 74 +++++++++++++++++++ fbgemm_gpu/experimental/gen_ai/CMakeLists.txt | 23 +++--- .../gen_ai/test/quantize/quantize_test.py | 1 - 9 files changed, 164 insertions(+), 30 deletions(-) create mode 100644 fbgemm_gpu/experimental/example/src/nccl_example.cpp create mode 100644 fbgemm_gpu/experimental/example/test/triton_example_test.py diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index cbb41af29..d258cd796 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -77,6 +77,7 @@ __configure_fbgemm_gpu_build_nvcc () { # shellcheck disable=SC2155 local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[BUILD] Looking up CUDA version ..." # shellcheck disable=SC2155,SC2086 local cxx_path=$(conda run ${env_prefix} which c++) # shellcheck disable=SC2155,SC2086 @@ -84,6 +85,14 @@ __configure_fbgemm_gpu_build_nvcc () { # shellcheck disable=SC2206 local cuda_version_arr=(${cuda_version//./ }) + echo "[BUILD] Looking up NCCL path ..." + # shellcheck disable=SC2155,SC2086 + local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX) + # shellcheck disable=SC2155,SC2086 + local nccl_lib=$(conda run ${env_prefix} find ${conda_prefix} -name "libnccl.so*") + # shellcheck disable=SC2155,SC2086 + local nccl_path=$(dirname "$(dirname ${nccl_lib})") + # Only NVCC 12+ supports C++20 if [[ ${cuda_version_arr[0]} -lt 12 ]]; then local cppstd_ver=17 @@ -109,11 +118,13 @@ __configure_fbgemm_gpu_build_nvcc () { # shellcheck disable=SC2086 print_exec conda env config vars set ${env_prefix} NVCC_PREPEND_FLAGS=\"${nvcc_prepend_flags}\" + echo "[BUILD] Setting CUDA build args ..." # shellcheck disable=SC2206 build_args+=( # Override CMake configuration -DCMAKE_CXX_STANDARD="${cppstd_ver}" - -DHIP_STANDARD="${cppstd_ver}" + -DNCCL_INCLUDE_DIR=${nccl_path}/include + -DNCCL_LIB_DIR=${nccl_path}/lib ) } @@ -158,14 +169,17 @@ __configure_fbgemm_gpu_build_rocm () { print_exec conda env config vars set ${env_prefix} PYTORCH_ROCM_ARCH="${arch_list}" echo "[BUILD] Setting ROCm build args ..." 
+  # shellcheck disable=SC2155
+  local cxx_flags="-DTORCH_USE_HIP_DSA"
+
   build_args=(
     --package_variant=rocm
     # HIP_ROOT_DIR now required for HIP to be correctly detected by CMake
     -DHIP_ROOT_DIR=/opt/rocm
     # Enable device-side assertions in HIP
     # https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line
-    -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA"
-    -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
+    -DCMAKE_C_FLAGS="'${cxx_flags}'"
+    -DCMAKE_CXX_FLAGS="'${cxx_flags}'"
   )
 }
 
@@ -473,18 +487,18 @@ build_fbgemm_gpu_package () {
   # shellcheck disable=SC2086
   print_exec conda run --no-capture-output ${env_prefix} \
     python -m build --wheel --no-isolation \
-      "${build_args[@]}"
+      "${build_args[@]}" || return 1
 
   # Run checks on the built libraries
   (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1
 
   echo "[BUILD] Enumerating the built wheels ..."
-  print_exec ls -lth dist/*.whl
+  print_exec ls -lth dist/*.whl || return 1
 
   echo "[BUILD] Enumerating the wheel SHAs ..."
-  print_exec sha1sum dist/*.whl
-  print_exec sha256sum dist/*.whl
-  print_exec md5sum dist/*.whl
+  print_exec sha1sum dist/*.whl || return 1
+  print_exec sha256sum dist/*.whl || return 1
+  print_exec md5sum dist/*.whl || return 1
 
   echo "[BUILD] FBGEMM-GPU build + package completed"
 }
@@ -524,7 +538,7 @@ build_fbgemm_gpu_install () {
   # shellcheck disable=SC2086
   print_exec conda run --no-capture-output ${env_prefix} \
     python setup.py "${run_multicore}" install \
-      "${build_args[@]}"
+      "${build_args[@]}" || return 1
 
   # Run checks on the built libraries
   (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1
 
diff --git a/cmake/modules/CudaSetup.cmake b/cmake/modules/CudaSetup.cmake
index d86963109..f24316e07 100644
--- a/cmake/modules/CudaSetup.cmake
+++ b/cmake/modules/CudaSetup.cmake
@@ -11,6 +11,13 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake)
 # CUDA Setup
 ################################################################################
 
+BLOCK_PRINT(
+  "NCCL flags"
+  ""
+  "NCCL_INCLUDE_DIR=${NCCL_INCLUDE_DIR}"
+  "NCCL_LIB_DIR=${NCCL_LIB_DIR}"
+)
+
 # Set NVML_LIB_PATH if provided, or detect the default lib path
 if(NOT NVML_LIB_PATH)
   set(DEFAULT_NVML_LIB_PATH
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
index 1146b2866..c4d47c84e 100644
--- a/fbgemm_gpu/CMakeLists.txt
+++ b/fbgemm_gpu/CMakeLists.txt
@@ -95,10 +95,12 @@ endif()
 ################################################################################
 
 if(NOT FBGEMM_CPU_ONLY)
-  add_subdirectory(experimental/example)
   add_subdirectory(experimental/gemm)
 
   if(NOT USE_ROCM)
+    # TODO: Figure out NCCL/RCCL integration with ROCm
+    add_subdirectory(experimental/example)
+
     # CUTLASS currently doesn't build on ROCm and CK hasn't yet been added:
     #
     # 2024-05-06T23:09:35.5730483Z /__w/FBGEMM/FBGEMM/fbgemm_gpu/../third_party/cutlass/include/cutlass/half.h:73:10: fatal error: 'cuda_fp16.h' file not found
diff --git a/fbgemm_gpu/FbgemmGpu.cmake b/fbgemm_gpu/FbgemmGpu.cmake
index a6824b57c..ee32edd14 100644
--- a/fbgemm_gpu/FbgemmGpu.cmake
+++ b/fbgemm_gpu/FbgemmGpu.cmake
@@ -24,11 +24,14 @@ set(fbgemm_sources_include_directories
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${CMAKE_CURRENT_SOURCE_DIR}/include
   ${CMAKE_CURRENT_SOURCE_DIR}/../include
+  # PyTorch
+  ${TORCH_INCLUDE_DIRS}
   # Third-party
   ${THIRDPARTY}/asmjit/src
   ${THIRDPARTY}/cpuinfo/include
   ${THIRDPARTY}/cutlass/include
-  ${THIRDPARTY}/cutlass/tools/util/include)
+  ${THIRDPARTY}/cutlass/tools/util/include
+  ${NCCL_INCLUDE_DIR})
 ################################################################################
@@ -624,13 +627,17 @@ else()
 endif()
 
 # Add PyTorch include/
-target_include_directories(fbgemm_gpu_py PRIVATE ${TORCH_INCLUDE_DIRS})
+target_include_directories(fbgemm_gpu_py PRIVATE
+  ${TORCH_INCLUDE_DIRS}
+  ${NCCL_INCLUDE_DIR})
 
 # Remove `lib` from the output artifact name `libfbgemm_gpu_py.so`
 set_target_properties(fbgemm_gpu_py PROPERTIES PREFIX "")
 
 # Link to PyTorch
-target_link_libraries(fbgemm_gpu_py ${TORCH_LIBRARIES})
+target_link_libraries(fbgemm_gpu_py
+  ${TORCH_LIBRARIES}
+  ${NCCL_LIB_DIR})
 
 # Link to NVML
 if(NVML_LIB_PATH)
diff --git a/fbgemm_gpu/experimental/example/CMakeLists.txt b/fbgemm_gpu/experimental/example/CMakeLists.txt
index 457ab86b1..473f62749 100644
--- a/fbgemm_gpu/experimental/example/CMakeLists.txt
+++ b/fbgemm_gpu/experimental/example/CMakeLists.txt
@@ -18,11 +18,14 @@ if(FBGEMM_GENAI_ONLY)
     ${CMAKE_CURRENT_SOURCE_DIR}../..
     ${CMAKE_CURRENT_SOURCE_DIR}../../include
     ${CMAKE_CURRENT_SOURCE_DIR}../../../include
+    # PyTorch
+    ${TORCH_INCLUDE_DIRS}
     # Third-party
     ${THIRDPARTY}/asmjit/src
     ${THIRDPARTY}/cpuinfo/include
     ${THIRDPARTY}/cutlass/include
-    ${THIRDPARTY}/cutlass/tools/util/include)
+    ${THIRDPARTY}/cutlass/tools/util/include
+    ${NCCL_INCLUDE_DIR})
 
   set(third_party_include_directories
     ${THIRDPARTY}/asmjit/src
@@ -32,7 +35,8 @@ endif()
 
 set(experimental_example_cpp_source_files
     src/cutlass_sgemm_nn.cu
-    src/example_ops.cpp)
+    src/example_ops.cpp
+    src/nccl_example.cpp)
 
 set_source_files_properties(${experimental_example_cpp_source_files}
   PROPERTIES INCLUDE_DIRECTORIES
@@ -50,8 +54,13 @@ set(experimental_example_python_source_files
 add_library(fbgemm_gpu_experimental_example_py MODULE
   ${experimental_example_cpp_source_files})
 
-target_include_directories(fbgemm_gpu_experimental_example_py PRIVATE ${TORCH_INCLUDE_DIRS})
-target_link_libraries(fbgemm_gpu_experimental_example_py ${TORCH_LIBRARIES})
+target_include_directories(fbgemm_gpu_experimental_example_py PRIVATE
+  ${TORCH_INCLUDE_DIRS}
+  ${NCCL_INCLUDE_DIR})
+
+target_link_libraries(fbgemm_gpu_experimental_example_py
+  ${TORCH_LIBRARIES}
+  ${NCCL_LIB_DIR})
 
 # Remove `lib` from the output artifact name
 set_target_properties(fbgemm_gpu_experimental_example_py PROPERTIES PREFIX "")
diff --git a/fbgemm_gpu/experimental/example/src/nccl_example.cpp b/fbgemm_gpu/experimental/example/src/nccl_example.cpp
new file mode 100644
index 000000000..12bd72018
--- /dev/null
+++ b/fbgemm_gpu/experimental/example/src/nccl_example.cpp
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <nccl.h>
+
+namespace fbgemm_gpu::experimental {
+
+void example_nccl_code() {
+  ncclComm_t comms[4];
+  int devs[4] = {0, 1, 2, 3};
+  ncclCommInitAll(comms, 4, devs);
+
+  for (int i = 0; i < 4; i++) {
+    ncclCommDestroy(comms[i]);
+  }
+}
+
+} // namespace fbgemm_gpu::experimental
diff --git a/fbgemm_gpu/experimental/example/test/triton_example_test.py b/fbgemm_gpu/experimental/example/test/triton_example_test.py
new file mode 100644
index 000000000..8e043656a
--- /dev/null
+++ b/fbgemm_gpu/experimental/example/test/triton_example_test.py
@@ -0,0 +1,74 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+# fmt: off
+def triton_add_kernel(x_ptr, y_ptr, z_ptr, n_elements, BLOCK_SIZE: tl.constexpr) -> None:
+# fmt: on
+    # We use a 1D launch grid so axis is 0.
+    pid = tl.program_id(axis=0)
+
+    # Compute the offsets in BLOCK_SIZE chunks.
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+
+    # Create a mask to guard memory operations against out-of-bounds accesses.
+    mask = offsets < n_elements
+
+    # Load x and y from DRAM.
+    x = tl.load(x_ptr + offsets, mask=mask)
+    y = tl.load(y_ptr + offsets, mask=mask)
+
+    # Sum and write back to DRAM.
+    output = x + y
+    tl.store(z_ptr + offsets, output, mask=mask)
+
+
+def triton_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    # Pre-allocate the output.
+    output = torch.empty_like(x)
+    assert x.is_cuda and y.is_cuda and output.is_cuda
+
+    # Create the SPMD launch grid. It can be either Tuple[int], or
+    # Callable(metaparameters) -> Tuple[int]. In this case, we use a 1D grid
+    # where the size is the number of blocks:
+    n_elements = output.numel()
+    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)  # noqa: E731
+
+    # Launch the kernel.
+    #
+    # Each torch.tensor object is implicitly converted into a pointer to its
+    # first element.
+    #
+    # `triton.jit`'ed functions can be indexed with a launch grid to obtain a
+    # callable GPU kernel.
+    #
+    # Pass meta-parameters as keyword arguments.
+    # pyre-ignore
+    triton_add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
+
+    # We return a handle to the output but, since `torch.cuda.synchronize()`
+    # hasn't been called, the kernel is still running asynchronously at this point.
+    return output
+
+
+@unittest.skipIf(
+    not torch.cuda.is_available(),
+    "Requires CUDA to run",
+)
+class TestTriton(unittest.TestCase):
+    def test_triton_example(self) -> None:
+        size = 98432
+        X = torch.rand(size, device="cuda")
+        Y = torch.rand(size, device="cuda")
+
+        torch.testing.assert_close(triton_add(X, Y).cpu(), (X + Y).cpu())
diff --git a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
index 50027a871..a82236cbb 100644
--- a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
+++ b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
@@ -16,16 +16,14 @@ if(FBGEMM_GENAI_ONLY)
     ${CMAKE_CURRENT_SOURCE_DIR}/../..
${CMAKE_CURRENT_SOURCE_DIR}/../../include ${CMAKE_CURRENT_SOURCE_DIR}/../../../include + # PyTorch + ${TORCH_INCLUDE_DIRS} # Third-party ${THIRDPARTY}/asmjit/src ${THIRDPARTY}/cpuinfo/include ${THIRDPARTY}/cutlass/include - ${THIRDPARTY}/cutlass/tools/util/include) - - set(third_party_include_directories - ${THIRDPARTY}/asmjit/src - ${THIRDPARTY}/cpuinfo/include - ${THIRDPARTY}/cutlass/include) + ${THIRDPARTY}/cutlass/tools/util/include + ${NCCL_INCLUDE_DIR}) endif() set(attention_ops_sources @@ -98,18 +96,19 @@ if(USE_ROCM) ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE}) - list(GET TORCH_INCLUDE_DIRS 0 TORCH_PATH) - else() # Else create a CUDA library add_library(fbgemm_gpu_experimental_gen_ai_py MODULE ${experimental_gen_ai_cpp_source_files}) endif() -# Link to PyTorch -target_include_directories(fbgemm_gpu_experimental_gen_ai_py - PRIVATE ${TORCH_INCLUDE_DIRS}) -target_link_libraries(fbgemm_gpu_experimental_gen_ai_py ${TORCH_LIBRARIES}) +target_include_directories(fbgemm_gpu_experimental_gen_ai_py PRIVATE + ${TORCH_INCLUDE_DIRS} + ${NCCL_INCLUDE_DIR}) + +target_link_libraries(fbgemm_gpu_experimental_gen_ai_py + ${TORCH_LIBRARIES} + ${NCCL_LIB_DIR}) # Remove `lib` from the output artifact name set_target_properties(fbgemm_gpu_experimental_gen_ai_py PROPERTIES PREFIX "") diff --git a/fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py b/fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py index 4511fc965..8d96dc0f3 100644 --- a/fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py +++ b/fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. # pyre-strict - # pyre-ignore-all-errors[56] import unittest
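
Usage note: outside of the CI scripts, the same NCCL hints can be forwarded to CMake by hand. The snippet below is only a rough sketch that mirrors the lookup added to __configure_fbgemm_gpu_build_nvcc above; it assumes a conda environment whose nccl package installs libnccl.so under ${CONDA_PREFIX}, and the exact build invocation (working directory, package variant, extra flags) will depend on the local setup.

    # Locate the conda-provided NCCL and derive its install prefix
    # (same dirname-of-dirname trick used in fbgemm_gpu_build.bash above);
    # head -n 1 is added here only to pick a single match.
    nccl_lib=$(find "${CONDA_PREFIX}" -name "libnccl.so*" | head -n 1)
    nccl_path=$(dirname "$(dirname "${nccl_lib}")")

    # Forward the paths through the new NCCL_INCLUDE_DIR / NCCL_LIB_DIR CMake flags,
    # the same way build_fbgemm_gpu_package passes them via build_args.
    cd fbgemm_gpu
    python -m build --wheel --no-isolation \
      -DNCCL_INCLUDE_DIR="${nccl_path}/include" \
      -DNCCL_LIB_DIR="${nccl_path}/lib"

CudaSetup.cmake prints the resolved values via the new BLOCK_PRINT("NCCL flags" ...) call, which is a quick way to confirm that the paths were picked up by the configure step.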