From 85c3ea8269c6088172ebb0b97f7d5dd0a8fad217 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 22 May 2024 16:30:27 -0700 Subject: [PATCH] wip --- .github/scripts/fbgemm_gpu_build.bash | 35 ++++++--- .../experimental/example/CMakeLists.txt | 13 ++-- .../experimental/example/src/nccl_example.cpp | 23 ++++++ .../example/test/triton_example_test.py | 71 +++++++++++++++++++ fbgemm_gpu/experimental/gen_ai/CMakeLists.txt | 20 +++--- 5 files changed, 137 insertions(+), 25 deletions(-) create mode 100644 fbgemm_gpu/experimental/example/src/nccl_example.cpp create mode 100644 fbgemm_gpu/experimental/example/test/triton_example_test.py diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index f7217fd664..ed40d483c8 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -77,6 +77,7 @@ __configure_fbgemm_gpu_build_nvcc () { # shellcheck disable=SC2155 local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[BUILD] Looking up CUDA version ..." # shellcheck disable=SC2155,SC2086 local cxx_path=$(conda run ${env_prefix} which c++) # shellcheck disable=SC2155,SC2086 @@ -84,6 +85,14 @@ __configure_fbgemm_gpu_build_nvcc () { # shellcheck disable=SC2206 local cuda_version_arr=(${cuda_version//./ }) + echo "[BUILD] Looking up NCCL path ..." + # shellcheck disable=SC2155,SC2086 + local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX) + # shellcheck disable=SC2155,SC2086 + local nccl_lib=$(conda run ${env_prefix} find ${conda_prefix} -name "libnccl.so*") + # shellcheck disable=SC2155,SC2086 + local nccl_path=$(dirname "$(dirname ${nccl_lib})") + # Only NVCC 12+ supports C++20 if [[ ${cuda_version_arr[0]} -lt 12 ]]; then local cppstd_ver=17 @@ -109,11 +118,16 @@ __configure_fbgemm_gpu_build_nvcc () { # shellcheck disable=SC2086 print_exec conda env config vars set ${env_prefix} NVCC_PREPEND_FLAGS=\"${nvcc_prepend_flags}\" + echo "[BUILD] Setting CUDA build args ..." 
+ # shellcheck disable=SC2155 + local cxx_flags="-DNCCL_INCLUDE_DIR=${nccl_path}/include -DNCCL_LIB_DIR=${nccl_path}/lib" + # shellcheck disable=SC2206 build_args+=( # Override CMake configuration -DCMAKE_CXX_STANDARD="${cppstd_ver}" - -DHIP_STANDARD="${cppstd_ver}" + -DCMAKE_C_FLAGS="'${cxx_flags}'" + -DCMAKE_CXX_FLAGS="'${cxx_flags}'" ) } @@ -158,14 +172,17 @@ __configure_fbgemm_gpu_build_rocm () { print_exec conda env config vars set ${env_prefix} PYTORCH_ROCM_ARCH="${arch_list}" echo "[BUILD] Setting ROCm build args ..." + # shellcheck disable=SC2155 + local cxx_flags="-DTORCH_USE_HIP_DSA" + build_args=( --package_variant=rocm # HIP_ROOT_DIR now required for HIP to be correctly detected by CMake -DHIP_ROOT_DIR=/opt/rocm # Enable device-side assertions in HIP # https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line - -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" - -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA" + -DCMAKE_C_FLAGS="'${cxx_flags}'" + -DCMAKE_CXX_FLAGS="'${cxx_flags}'" ) } @@ -473,18 +490,18 @@ build_fbgemm_gpu_package () { # shellcheck disable=SC2086 print_exec conda run --no-capture-output ${env_prefix} \ python -m build --wheel --no-isolation \ - "${build_args[@]}" + "${build_args[@]}" || return 1 # Run checks on the built libraries (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 echo "[BUILD] Enumerating the built wheels ..." - print_exec ls -lth dist/*.whl + print_exec ls -lth dist/*.whl || return 1 echo "[BUILD] Enumerating the wheel SHAs ..." 
- print_exec sha1sum dist/*.whl - print_exec sha256sum dist/*.whl - print_exec md5sum dist/*.whl + print_exec sha1sum dist/*.whl || return 1 + print_exec sha256sum dist/*.whl || return 1 + print_exec md5sum dist/*.whl || return 1 echo "[BUILD] FBGEMM-GPU build + package completed" } @@ -524,7 +541,7 @@ build_fbgemm_gpu_install () { # shellcheck disable=SC2086 print_exec conda run --no-capture-output ${env_prefix} \ python setup.py "${run_multicore}" install \ - "${build_args[@]}" + "${build_args[@]}" || return 1 # Run checks on the built libraries (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 diff --git a/fbgemm_gpu/experimental/example/CMakeLists.txt b/fbgemm_gpu/experimental/example/CMakeLists.txt index 457ab86b1f..9cdf4a7315 100644 --- a/fbgemm_gpu/experimental/example/CMakeLists.txt +++ b/fbgemm_gpu/experimental/example/CMakeLists.txt @@ -18,11 +18,14 @@ if(FBGEMM_GENAI_ONLY) ${CMAKE_CURRENT_SOURCE_DIR}../.. ${CMAKE_CURRENT_SOURCE_DIR}../../include ${CMAKE_CURRENT_SOURCE_DIR}../../../include + # PyTorch + ${TORCH_INCLUDE_DIRS} # Third-party ${THIRDPARTY}/asmjit/src ${THIRDPARTY}/cpuinfo/include ${THIRDPARTY}/cutlass/include - ${THIRDPARTY}/cutlass/tools/util/include) + ${THIRDPARTY}/cutlass/tools/util/include + ${NCCL_INCLUDE_DIR}) set(third_party_include_directories ${THIRDPARTY}/asmjit/src @@ -32,7 +35,8 @@ endif() set(experimental_example_cpp_source_files src/cutlass_sgemm_nn.cu - src/example_ops.cpp) + src/example_ops.cpp + src/nccl_example.cpp) set_source_files_properties(${experimental_example_cpp_source_files} PROPERTIES INCLUDE_DIRECTORIES @@ -50,8 +54,9 @@ set(experimental_example_python_source_files add_library(fbgemm_gpu_experimental_example_py MODULE ${experimental_example_cpp_source_files}) -target_include_directories(fbgemm_gpu_experimental_example_py PRIVATE ${TORCH_INCLUDE_DIRS}) -target_link_libraries(fbgemm_gpu_experimental_example_py ${TORCH_LIBRARIES}) +target_link_libraries(fbgemm_gpu_experimental_example_py + 
${TORCH_LIBRARIES} + ${NCCL_LIB_DIR}) # Remove `lib` from the output artifact name set_target_properties(fbgemm_gpu_experimental_example_py PROPERTIES PREFIX "") diff --git a/fbgemm_gpu/experimental/example/src/nccl_example.cpp b/fbgemm_gpu/experimental/example/src/nccl_example.cpp new file mode 100644 index 0000000000..240a20e574 --- /dev/null +++ b/fbgemm_gpu/experimental/example/src/nccl_example.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include <nccl.h> + +namespace fbgemm_gpu::experimental { + +void example_nccl_code() { + ncclComm_t comms[4]; + int devs[4] = { 0, 1, 2, 3 }; + ncclCommInitAll(comms, 4, devs); + + for (int i=0; i<4; i++) { + ncclCommDestroy(comms[i]); + } +} + +} // namespace fbgemm_gpu::experimental diff --git a/fbgemm_gpu/experimental/example/test/triton_example_test.py b/fbgemm_gpu/experimental/example/test/triton_example_test.py new file mode 100644 index 0000000000..d524440203 --- /dev/null +++ b/fbgemm_gpu/experimental/example/test/triton_example_test.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import unittest + +import torch +import triton +import triton.language as tl + +@triton.jit +def softmax_triton(Y, stride_ym, stride_yn, X, stride_xm, stride_xn, M, N): + # Row index + m = tl.program_id(0) + + # Column indices. 
This specific kernel only works for matrices that have + # less than BLOCK_SIZE columns + BLOCK_SIZE = 1024 + n = tl.arange(0, BLOCK_SIZE) + + # Compute the memory address of all the elements that we want to load + X = X + m * stride_xm + n * stride_xn + + # Load input data; pad out-of-bounds elements with 0 + x = tl.load(X, mask=n < N, other=-float('inf')) + + # Compute numerically-stable softmax + z = x - tl.max(x, axis=0) + num = tl.exp(z) + denom = tl.sum(num, axis=0) + y = num / denom + + # write back to Y + Y = Y + m * stride_ym + n * stride_yn + tl.store(Y, y, mask=n < N) + + +@torch.jit.script +def softmax_torch(x): + x_max = x.max(dim=1)[0] + z = x - x_max[:, None] + numerator = torch.exp(z) + denominator = numerator.sum(dim=1) + return numerator / denominator[:, None] + + +@unittest.skipIf( + not torch.cuda.is_available(), + "Requires CUDA to run", +) +class TestTriton(unittest.TestCase): + def test_triton_example(self) -> None: + # Allocate input/output tensors + X = torch.normal(0, 1, size=(583, 931), device='cuda') + Y = torch.empty_like(X) + + # SPMD launch grid + grid = (X.shape[0], ) + + # Enqueue GPU kernel + softmax_triton[grid]( + Y, Y.stride(0), Y.stride(1), + X, X.stride(0), X.stride(1), + X.shape[0] , X.shape[1] + ) + + torch.testing.assert_close(Y.cpu(), softmax_torch(X).cpu()) diff --git a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt index a5051e04b3..01fd04ae09 100644 --- a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt +++ b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt @@ -16,16 +16,14 @@ if(FBGEMM_GENAI_ONLY) ${CMAKE_CURRENT_SOURCE_DIR}/../.. 
${CMAKE_CURRENT_SOURCE_DIR}/../../include ${CMAKE_CURRENT_SOURCE_DIR}/../../../include + # PyTorch + ${TORCH_INCLUDE_DIRS} # Third-party ${THIRDPARTY}/asmjit/src ${THIRDPARTY}/cpuinfo/include ${THIRDPARTY}/cutlass/include - ${THIRDPARTY}/cutlass/tools/util/include) - - set(third_party_include_directories - ${THIRDPARTY}/asmjit/src - ${THIRDPARTY}/cpuinfo/include - ${THIRDPARTY}/cutlass/include) + ${THIRDPARTY}/cutlass/tools/util/include + ${NCCL_INCLUDE_DIR}) endif() set(attention_ops_sources @@ -93,18 +91,16 @@ if(USE_ROCM) ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE}) - list(GET TORCH_INCLUDE_DIRS 0 TORCH_PATH) - else() # Else create a CUDA library add_library(fbgemm_gpu_experimental_gen_ai_py MODULE ${experimental_gen_ai_cpp_source_files}) endif() -# Link to PyTorch -target_include_directories(fbgemm_gpu_experimental_gen_ai_py - PRIVATE ${TORCH_INCLUDE_DIRS}) -target_link_libraries(fbgemm_gpu_experimental_gen_ai_py ${TORCH_LIBRARIES}) +# Link to PyTorch and NCCL +target_link_libraries(fbgemm_gpu_experimental_gen_ai_py + ${TORCH_LIBRARIES} + ${NCCL_LIB_DIR}) # Remove `lib` from the output artifact name set_target_properties(fbgemm_gpu_experimental_gen_ai_py PROPERTIES PREFIX "")