bazel-contrib · cloudhan · Sep 4, 2023 · Jul 30, 2023 · Sep 2, 2023 · Sep 3, 2023
@@ -17,3 +17,31 @@ load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains",
 rules_cuda_dependencies()
 
 register_detected_cuda_toolchains()
+
+#################################
+# Dependencies for nccl example #
+#################################
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(
+    name = "nccl",
+    add_prefix = "nccl",  # extracted sources are placed under nccl/ inside the repository
+    build_file = "@rules_cuda_examples//nccl:nccl.BUILD",  # BUILD file supplied by the examples workspace
+    sha256 = "83b299cfc2dfe63887dadf3590b3ac2b8b2fd68ec5515b6878774eda39a697d2",  # checksum of the archive in `urls`; change it together with the commit
+    strip_prefix = "nccl-9814c75eea18fc7374cde884592233b6b7dc055b",  # remove the tarball's top-level directory
+    urls = ["https://github.com/nvidia/nccl/archive/9814c75eea18fc7374cde884592233b6b7dc055b.tar.gz"],  # pinned to one commit
+)
+
+http_archive(
+    name = "nccl-tests",
+    add_prefix = "nccl-tests",  # extracted sources are placed under nccl-tests/ inside the repository
+    build_file = "@rules_cuda_examples//nccl:nccl-tests.BUILD",  # BUILD file supplied by the examples workspace
+    patch_args = [  # NOTE(review): per the Bazel docs, any entry other than -pN makes Bazel use the external `patch` tool
+        "--directory=nccl-tests",  # resolve patch paths relative to the add_prefix directory
+        "-p1",  # strip the leading a/ and b/ components of the git-style paths
+    ],
+    patches = ["@rules_cuda_examples//nccl:nccl-tests-clang.patch"],  # presumably needed for clang builds, judging by the file name
+    sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0",  # checksum of the archive in `urls`; change it together with the commit
+    strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa",  # remove the tarball's top-level directory
+    urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"],  # pinned to one commit
+)
@@ -0,0 +1,22 @@
+# Forwards the `nccl_shared` target of the external `@nccl` repository so it
+# can be requested through a label in this package.
+filegroup(
+    name = "nccl_shared",
+    srcs = ["@nccl//:nccl_shared"],
+)
+
+# Groups the `<name>_perf` executables from `@nccl-tests` under a single label.
+filegroup(
+    name = "perf_binaries",
+    srcs = ["@nccl-tests//:{}_perf".format(collective) for collective in [
+        "all_gather",
+        "all_reduce",
+        "alltoall",
+        "broadcast",
+        "gather",
+        "hypercube",
+        "reduce",
+        "reduce_scatter",
+        "scatter",
+        "sendrecv",
+    ]],
+)
@@ -0,0 +1,172 @@
+diff --git a/src/all_gather.cu b/src/all_gather.cu
+index 0831207..941ec1b 100644
+--- a/src/all_gather.cu
++++ b/src/all_gather.cu
+@@ -85,9 +85,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+
+-struct testEngine allGatherEngine = {
++struct testEngine ncclTestEngine = {
+   AllGatherGetBuffSize,
+   AllGatherRunTest
+ };
+-
+-#pragma weak ncclTestEngine=allGatherEngine
+diff --git a/src/all_reduce.cu b/src/all_reduce.cu
+index a38eabe..acb66a8 100644
+--- a/src/all_reduce.cu
++++ b/src/all_reduce.cu
+@@ -93,9 +93,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+
+-struct testEngine allReduceEngine = {
++struct testEngine ncclTestEngine = {
+   AllReduceGetBuffSize,
+   AllReduceRunTest
+ };
+-
+-#pragma weak ncclTestEngine=allReduceEngine
+diff --git a/src/alltoall.cu b/src/alltoall.cu
+index 41c7c4a..712e664 100644
+--- a/src/alltoall.cu
++++ b/src/alltoall.cu
+@@ -99,9 +99,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
+   return testSuccess;
+ }
+
+-struct testEngine alltoAllEngine = {
++struct testEngine ncclTestEngine = {
+   AlltoAllGetBuffSize,
+   AlltoAllRunTest
+ };
+-
+-#pragma weak ncclTestEngine=alltoAllEngine
+diff --git a/src/broadcast.cu b/src/broadcast.cu
+index 903066a..778c664 100644
+--- a/src/broadcast.cu
++++ b/src/broadcast.cu
+@@ -99,9 +99,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+
+-struct testEngine broadcastEngine = {
++struct testEngine ncclTestEngine = {
+   BroadcastGetBuffSize,
+   BroadcastRunTest
+ };
+-
+-#pragma weak ncclTestEngine=broadcastEngine
+diff --git a/src/common.cu b/src/common.cu
+index 48a629c..d888edc 100644
+--- a/src/common.cu
++++ b/src/common.cu
+@@ -330,7 +330,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
+   size_t count = args->nbytes / wordSize(type);
+
+   // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
+-  size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
++  size_t totalnbytes = std::max(args->sendBytes, args->expectedBytes);
+   size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+   size_t shift = totalnbytes * (iter % steps);
+
+@@ -597,7 +597,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
+       setupArgs(size, type, args);
+       char rootName[100];
+       sprintf(rootName, "%6i", root);
+-      PRINT("%12li  %12li  %8s  %6s  %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
++      PRINT("%12li  %12li  %8s  %6s  %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
+       TESTCHECK(BenchTime(args, type, op, root, 0));
+       TESTCHECK(BenchTime(args, type, op, root, 1));
+       PRINT("\n");
+diff --git a/src/gather.cu b/src/gather.cu
+index 03ef4d9..242a298 100644
+--- a/src/gather.cu
++++ b/src/gather.cu
+@@ -108,9 +108,7 @@ testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t typ
+   return testSuccess;
+ }
+
+-struct testEngine gatherEngine = {
++struct testEngine ncclTestEngine = {
+   GatherGetBuffSize,
+   GatherRunTest
+ };
+-
+-#pragma weak ncclTestEngine=gatherEngine
+diff --git a/src/hypercube.cu b/src/hypercube.cu
+index 5c1456f..9aadfc5 100644
+--- a/src/hypercube.cu
++++ b/src/hypercube.cu
+@@ -110,9 +110,7 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+
+-struct testEngine hyperCubeEngine = {
++struct testEngine ncclTestEngine = {
+   HyperCubeGetBuffSize,
+   HyperCubeRunTest
+ };
+-
+-#pragma weak ncclTestEngine=hyperCubeEngine
+diff --git a/src/reduce.cu b/src/reduce.cu
+index f2fa80d..80aadc5 100644
+--- a/src/reduce.cu
++++ b/src/reduce.cu
+@@ -102,9 +102,7 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
+   return testSuccess;
+ }
+
+-struct testEngine reduceEngine = {
++struct testEngine ncclTestEngine = {
+   ReduceGetBuffSize,
+   ReduceRunTest
+ };
+-
+-#pragma weak ncclTestEngine=reduceEngine
+diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
+index ed372e3..212a6f0 100644
+--- a/src/reduce_scatter.cu
++++ b/src/reduce_scatter.cu
+@@ -97,9 +97,7 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
+   return testSuccess;
+ }
+
+-struct testEngine reduceScatterEngine = {
++struct testEngine ncclTestEngine = {
+   ReduceScatterGetBuffSize,
+   ReduceScatterRunTest
+ };
+-
+-#pragma weak ncclTestEngine=reduceScatterEngine
+diff --git a/src/scatter.cu b/src/scatter.cu
+index 49d20e1..56f5ede 100644
+--- a/src/scatter.cu
++++ b/src/scatter.cu
+@@ -104,9 +104,7 @@ testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t ty
+   return testSuccess;
+ }
+
+-struct testEngine scatterEngine = {
++struct testEngine ncclTestEngine = {
+   ScatterGetBuffSize,
+   ScatterRunTest
+ };
+-
+-#pragma weak ncclTestEngine=scatterEngine
+diff --git a/src/sendrecv.cu b/src/sendrecv.cu
+index c9eb5bb..316a449 100644
+--- a/src/sendrecv.cu
++++ b/src/sendrecv.cu
+@@ -106,9 +106,7 @@ testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t t
+   return testSuccess;
+ }
+
+-struct testEngine sendRecvEngine = {
++struct testEngine ncclTestEngine = {
+   SendRecvGetBuffSize,
+   SendRecvRunTest
+ };
+-
+-#pragma weak ncclTestEngine=sendRecvEngine
@@ -0,0 +1,50 @@
+load("@rules_cuda//cuda:defs.bzl", "cuda_library")
+load("@rules_cuda_examples//nccl:nccl-tests.bzl", "nccl_tests_binary")
+
+cc_library(
+    name = "nccl_tests_include",  # header-only target exposing the nccl-tests/src headers
+    hdrs = glob(include = ["nccl-tests/src/*.h"]),
+    includes = ["nccl-tests/src"],  # put that directory on the include path of dependents
+)
+
+# Builds common.cu and verifiable.cu once; every library declared through
+# `nccl_tests_binary` lists this target in its deps.
+cuda_library(
+    name = "common_cuda",
+    srcs = [
+        "nccl-tests/src/common.cu",
+        "nccl-tests/verifiable/verifiable.cu",
+    ] + glob(include = ["nccl-tests/**/*.h"]),
+    deps = [
+        ":nccl_tests_include",
+        "@nccl",
+    ],
+)
+
+cc_library(
+    name = "common_cc",  # timer sources; every `<name>_perf` binary depends on this target
+    srcs = ["nccl-tests/src/timer.cc"],
+    hdrs = ["nccl-tests/src/timer.h"],
+    alwayslink = True,  # keep all object files at link time, even if nothing references them
+)
+
+# :common_cuda, :common_cc and @nccl//:nccl_shared are implicitly hardcoded in `nccl_tests_binary`
+#
+# Each name below expands to two targets: a `<name>` cuda_library built from
+# nccl-tests/src/<name>.cu, and a publicly visible `<name>_perf` cc_binary
+# that links it.
+[
+    nccl_tests_binary(name = collective)
+    for collective in [
+        "all_reduce",
+        "all_gather",
+        "broadcast",
+        "reduce_scatter",
+        "reduce",
+        "alltoall",
+        "scatter",
+        "gather",
+        "sendrecv",
+        "hypercube",
+    ]
+]
@@ -0,0 +1,31 @@
+load("@rules_cuda//cuda:defs.bzl", "cuda_library")
+
+def nccl_tests_binary(name, cc_deps = [], cuda_deps = []):
+    """Declares one nccl-tests benchmark: a `cuda_library` plus a `cc_binary`.
+
+    Creates `:<name>`, compiled from `nccl-tests/src/<name>.cu`, and the
+    publicly visible executable `:<name>_perf` that links it together with
+    `:common_cc`.
+
+    Args:
+        name: Basename (without `.cu`) of the source file under `nccl-tests/src`;
+            also used as the name of the generated `cuda_library`.
+        cc_deps: Extra labels appended to the `deps` of the `<name>_perf` binary.
+        cuda_deps: Extra labels appended to the `deps` of the `<name>` library.
+    """
+    cuda_library(
+        name = name,
+        srcs = ["nccl-tests/src/{}.cu".format(name)],
+        deps = [
+            "@nccl//:nccl_shared",
+            ":common_cuda",
+        ] + cuda_deps,
+        # Keep every object file at link time, even ones nothing references.
+        alwayslink = 1,
+    )
+
+    native.cc_binary(
+        name = name + "_perf",
+        deps = [":common_cc", ":" + name] + cc_deps,
+        visibility = ["//visibility:public"],
+    )