From 85c3ea8269c6088172ebb0b97f7d5dd0a8fad217 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 22 May 2024 16:30:27 -0700 Subject: [PATCH] wip --- .github/scripts/fbgemm_gpu_build.bash | 35 ++++++--- .../experimental/example/CMakeLists.txt | 13 ++-- .../experimental/example/src/nccl_example.cpp | 23 ++++++ .../example/test/triton_example_test.py | 71 +++++++++++++++++++ fbgemm_gpu/experimental/gen_ai/CMakeLists.txt | 20 +++--- 5 files changed, 137 insertions(+), 25 deletions(-) create mode 100644 fbgemm_gpu/experimental/example/src/nccl_example.cpp create mode 100644 fbgemm_gpu/experimental/example/test/triton_example_test.py diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index f7217fd664..ed40d483c8 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -77,6 +77,7 @@ __configure_fbgemm_gpu_build_nvcc () { # shellcheck disable=SC2155 local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[BUILD] Looking up CUDA version ..." # shellcheck disable=SC2155,SC2086 local cxx_path=$(conda run ${env_prefix} which c++) # shellcheck disable=SC2155,SC2086 @@ -84,6 +85,14 @@ __configure_fbgemm_gpu_build_nvcc () { # shellcheck disable=SC2206 local cuda_version_arr=(${cuda_version//./ }) + echo "[BUILD] Looking up NCCL path ..." + # shellcheck disable=SC2155,SC2086 + local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX) + # shellcheck disable=SC2155,SC2086 + local nccl_lib=$(conda run ${env_prefix} find ${conda_prefix} -name "libnccl.so*") + # shellcheck disable=SC2155,SC2086 + local nccl_path=$(dirname "$(dirname ${nccl_lib})") + # Only NVCC 12+ supports C++20 if [[ ${cuda_version_arr[0]} -lt 12 ]]; then local cppstd_ver=17 @@ -109,11 +118,16 @@ __configure_fbgemm_gpu_build_nvcc () { # shellcheck disable=SC2086 print_exec conda env config vars set ${env_prefix} NVCC_PREPEND_FLAGS=\"${nvcc_prepend_flags}\" + echo "[BUILD] Setting CUDA build args ..." 
+ # shellcheck disable=SC2155 + local cxx_flags="-DNCCL_INCLUDE_DIR=${nccl_path}/include -DNCCL_LIB_DIR=${nccl_path}/lib" + # shellcheck disable=SC2206 build_args+=( # Override CMake configuration -DCMAKE_CXX_STANDARD="${cppstd_ver}" - -DHIP_STANDARD="${cppstd_ver}" + -DCMAKE_C_FLAGS="'${cxx_flags}'" + -DCMAKE_CXX_FLAGS="'${cxx_flags}'" ) } @@ -158,14 +172,17 @@ __configure_fbgemm_gpu_build_rocm () { print_exec conda env config vars set ${env_prefix} PYTORCH_ROCM_ARCH="${arch_list}" echo "[BUILD] Setting ROCm build args ..." + # shellcheck disable=SC2155 + local cxx_flags="-DTORCH_USE_HIP_DSA" + build_args=( --package_variant=rocm # HIP_ROOT_DIR now required for HIP to be correctly detected by CMake -DHIP_ROOT_DIR=/opt/rocm # Enable device-side assertions in HIP # https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line - -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" - -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA" + -DCMAKE_C_FLAGS="'${cxx_flags}'" + -DCMAKE_CXX_FLAGS="'${cxx_flags}'" ) } @@ -473,18 +490,18 @@ build_fbgemm_gpu_package () { # shellcheck disable=SC2086 print_exec conda run --no-capture-output ${env_prefix} \ python -m build --wheel --no-isolation \ - "${build_args[@]}" + "${build_args[@]}" || return 1 # Run checks on the built libraries (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 echo "[BUILD] Enumerating the built wheels ..." - print_exec ls -lth dist/*.whl + print_exec ls -lth dist/*.whl || return 1 echo "[BUILD] Enumerating the wheel SHAs ..." 
- print_exec sha1sum dist/*.whl - print_exec sha256sum dist/*.whl - print_exec md5sum dist/*.whl + print_exec sha1sum dist/*.whl || return 1 + print_exec sha256sum dist/*.whl || return 1 + print_exec md5sum dist/*.whl || return 1 echo "[BUILD] FBGEMM-GPU build + package completed" } @@ -524,7 +541,7 @@ build_fbgemm_gpu_install () { # shellcheck disable=SC2086 print_exec conda run --no-capture-output ${env_prefix} \ python setup.py "${run_multicore}" install \ - "${build_args[@]}" + "${build_args[@]}" || return 1 # Run checks on the built libraries (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 diff --git a/fbgemm_gpu/experimental/example/CMakeLists.txt b/fbgemm_gpu/experimental/example/CMakeLists.txt index 457ab86b1f..9cdf4a7315 100644 --- a/fbgemm_gpu/experimental/example/CMakeLists.txt +++ b/fbgemm_gpu/experimental/example/CMakeLists.txt @@ -18,11 +18,14 @@ if(FBGEMM_GENAI_ONLY) ${CMAKE_CURRENT_SOURCE_DIR}../.. ${CMAKE_CURRENT_SOURCE_DIR}../../include ${CMAKE_CURRENT_SOURCE_DIR}../../../include + # PyTorch + ${TORCH_INCLUDE_DIRS} # Third-party ${THIRDPARTY}/asmjit/src ${THIRDPARTY}/cpuinfo/include ${THIRDPARTY}/cutlass/include - ${THIRDPARTY}/cutlass/tools/util/include) + ${THIRDPARTY}/cutlass/tools/util/include + ${NCCL_INCLUDE_DIR}) set(third_party_include_directories ${THIRDPARTY}/asmjit/src @@ -32,7 +35,8 @@ endif() set(experimental_example_cpp_source_files src/cutlass_sgemm_nn.cu - src/example_ops.cpp) + src/example_ops.cpp + src/nccl_example.cpp) set_source_files_properties(${experimental_example_cpp_source_files} PROPERTIES INCLUDE_DIRECTORIES @@ -50,8 +54,9 @@ set(experimental_example_python_source_files add_library(fbgemm_gpu_experimental_example_py MODULE ${experimental_example_cpp_source_files}) -target_include_directories(fbgemm_gpu_experimental_example_py PRIVATE ${TORCH_INCLUDE_DIRS}) -target_link_libraries(fbgemm_gpu_experimental_example_py ${TORCH_LIBRARIES}) +target_link_libraries(fbgemm_gpu_experimental_example_py + 
${TORCH_LIBRARIES} + ${NCCL_LIB_DIR}) # Remove `lib` from the output artifact name set_target_properties(fbgemm_gpu_experimental_example_py PROPERTIES PREFIX "") diff --git a/fbgemm_gpu/experimental/example/src/nccl_example.cpp b/fbgemm_gpu/experimental/example/src/nccl_example.cpp new file mode 100644 index 0000000000..240a20e574 --- /dev/null +++ b/fbgemm_gpu/experimental/example/src/nccl_example.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include <nccl.h> + +namespace fbgemm_gpu::experimental { + +void example_nccl_code() { + ncclComm_t comms[4]; + int devs[4] = { 0, 1, 2, 3 }; + ncclCommInitAll(comms, 4, devs); + + for (int i=0; i<4; i++) { + ncclCommDestroy(comms[i]); + } +} + +} // namespace fbgemm_gpu::experimental diff --git a/fbgemm_gpu/experimental/example/test/triton_example_test.py b/fbgemm_gpu/experimental/example/test/triton_example_test.py new file mode 100644 index 0000000000..d524440203 --- /dev/null +++ b/fbgemm_gpu/experimental/example/test/triton_example_test.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import unittest + +import torch +import triton +import triton.language as tl + +@triton.jit +def softmax_triton(Y, stride_ym, stride_yn, X, stride_xm, stride_xn, M, N): + # Row index + m = tl.program_id(0) + + # Column indices. 
This specific kernel only works for matrices that have + # less than BLOCK_SIZE columns + BLOCK_SIZE = 1024 + n = tl.arange(0, BLOCK_SIZE) + + # Compute the memory address of all the elements that we want to load + X = X + m * stride_xm + n * stride_xn + + # Load input data; pad out-of-bounds elements with 0 + x = tl.load(X, mask=n < N, other=-float('inf')) + + # Compute numerically-stable softmax + z = x - tl.max(x, axis=0) + num = tl.exp(z) + denom = tl.sum(num, axis=0) + y = num / denom + + # write back to Y + Y = Y + m * stride_ym + n * stride_yn + tl.store(Y, y, mask=n < N) + + +@torch.jit.script +def softmax_torch(x): + x_max = x.max(dim=1)[0] + z = x - x_max[:, None] + numerator = torch.exp(z) + denominator = numerator.sum(dim=1) + return numerator / denominator[:, None] + + +@unittest.skipIf( + not torch.cuda.is_available(), + "Requires CUDA to run", +) +class TestTriton(unittest.TestCase): + def test_triton_example(self) -> None: + # Allocate input/output tensors + X = torch.normal(0, 1, size=(583, 931), device='cuda') + Y = torch.empty_like(X) + + # SPMD launch grid + grid = (X.shape[0], ) + + # Enqueue GPU kernel + softmax_triton[grid]( + Y, Y.stride(0), Y.stride(1), + X, X.stride(0), X.stride(1), + X.shape[0] , X.shape[1] + ) + + torch.testing.assert_close(Y.cpu(), softmax_torch(X).cpu()) diff --git a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt index a5051e04b3..01fd04ae09 100644 --- a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt +++ b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt @@ -16,16 +16,14 @@ if(FBGEMM_GENAI_ONLY) ${CMAKE_CURRENT_SOURCE_DIR}/../.. 
${CMAKE_CURRENT_SOURCE_DIR}/../../include ${CMAKE_CURRENT_SOURCE_DIR}/../../../include + # PyTorch + ${TORCH_INCLUDE_DIRS} # Third-party ${THIRDPARTY}/asmjit/src ${THIRDPARTY}/cpuinfo/include ${THIRDPARTY}/cutlass/include - ${THIRDPARTY}/cutlass/tools/util/include) - - set(third_party_include_directories - ${THIRDPARTY}/asmjit/src - ${THIRDPARTY}/cpuinfo/include - ${THIRDPARTY}/cutlass/include) + ${THIRDPARTY}/cutlass/tools/util/include + ${NCCL_INCLUDE_DIR}) endif() set(attention_ops_sources @@ -93,18 +91,16 @@ if(USE_ROCM) ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE}) - list(GET TORCH_INCLUDE_DIRS 0 TORCH_PATH) - else() # Else create a CUDA library add_library(fbgemm_gpu_experimental_gen_ai_py MODULE ${experimental_gen_ai_cpp_source_files}) endif() -# Link to PyTorch -target_include_directories(fbgemm_gpu_experimental_gen_ai_py - PRIVATE ${TORCH_INCLUDE_DIRS}) -target_link_libraries(fbgemm_gpu_experimental_gen_ai_py ${TORCH_LIBRARIES}) +# Link to PyTorch and NCCL +target_link_libraries(fbgemm_gpu_experimental_gen_ai_py + ${TORCH_LIBRARIES} + ${NCCL_LIB_DIR}) # Remove `lib` from the output artifact name set_target_properties(fbgemm_gpu_experimental_gen_ai_py PROPERTIES PREFIX "")