Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add nccl as an example #157

Merged
merged 4 commits into from
Sep 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions examples/WORKSPACE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,31 @@ load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains",
# NOTE(review): presumably sets up the external repositories rules_cuda itself
# relies on; the load() that brings this symbol in is cut off above — confirm.
rules_cuda_dependencies()

# Symbol loaded from @rules_cuda//cuda:repositories.bzl (see the load() above).
register_detected_cuda_toolchains()

#################################
# Dependencies for nccl example #
#################################
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

# NCCL sources, pinned to a single upstream commit and checked against a
# sha256. The archive's top-level `nccl-<commit>/` directory is stripped and
# the tree is re-rooted under `nccl/`; the BUILD file overlaid on it lives in
# this examples workspace.
http_archive(
    name = "nccl",
    add_prefix = "nccl",
    build_file = "@rules_cuda_examples//nccl:nccl.BUILD",
    sha256 = "83b299cfc2dfe63887dadf3590b3ac2b8b2fd68ec5515b6878774eda39a697d2",
    strip_prefix = "nccl-9814c75eea18fc7374cde884592233b6b7dc055b",
    urls = [
        "https://github.com/nvidia/nccl/archive/9814c75eea18fc7374cde884592233b6b7dc055b.tar.gz",
    ],
)

# nccl-tests sources, pinned to a single upstream commit and re-rooted under
# `nccl-tests/`. The clang patch uses paths relative to the upstream tree
# (`src/*.cu`), so `patch` is pointed at the prefixed directory with
# `--directory` and strips the leading `a/` / `b/` component with `-p1`.
http_archive(
    name = "nccl-tests",
    add_prefix = "nccl-tests",
    build_file = "@rules_cuda_examples//nccl:nccl-tests.BUILD",
    patch_args = ["--directory=nccl-tests", "-p1"],
    patches = [
        "@rules_cuda_examples//nccl:nccl-tests-clang.patch",
    ],
    sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0",
    strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa",
    urls = [
        "https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz",
    ],
)
22 changes: 22 additions & 0 deletions examples/nccl/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Local handle for the `nccl_shared` target of the external @nccl repository.
filegroup(
    name = "nccl_shared",
    srcs = ["@nccl//:nccl_shared"],
)

# Groups every `<collective>_perf` binary declared in the external
# @nccl-tests repository under a single label.
filegroup(
    name = "perf_binaries",
    srcs = [
        "@nccl-tests//:{}_perf".format(collective)
        for collective in [
            "all_gather",
            "all_reduce",
            "alltoall",
            "broadcast",
            "gather",
            "hypercube",
            "reduce",
            "reduce_scatter",
            "scatter",
            "sendrecv",
        ]
    ],
)
172 changes: 172 additions & 0 deletions examples/nccl/nccl-tests-clang.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
diff --git a/src/all_gather.cu b/src/all_gather.cu
index 0831207..941ec1b 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -85,9 +85,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
return testSuccess;
}

-struct testEngine allGatherEngine = {
+struct testEngine ncclTestEngine = {
AllGatherGetBuffSize,
AllGatherRunTest
};
-
-#pragma weak ncclTestEngine=allGatherEngine
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index a38eabe..acb66a8 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -93,9 +93,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
return testSuccess;
}

-struct testEngine allReduceEngine = {
+struct testEngine ncclTestEngine = {
AllReduceGetBuffSize,
AllReduceRunTest
};
-
-#pragma weak ncclTestEngine=allReduceEngine
diff --git a/src/alltoall.cu b/src/alltoall.cu
index 41c7c4a..712e664 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -99,9 +99,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
return testSuccess;
}

-struct testEngine alltoAllEngine = {
+struct testEngine ncclTestEngine = {
AlltoAllGetBuffSize,
AlltoAllRunTest
};
-
-#pragma weak ncclTestEngine=alltoAllEngine
diff --git a/src/broadcast.cu b/src/broadcast.cu
index 903066a..778c664 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -99,9 +99,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
return testSuccess;
}

-struct testEngine broadcastEngine = {
+struct testEngine ncclTestEngine = {
BroadcastGetBuffSize,
BroadcastRunTest
};
-
-#pragma weak ncclTestEngine=broadcastEngine
diff --git a/src/common.cu b/src/common.cu
index 48a629c..d888edc 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -330,7 +330,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
size_t count = args->nbytes / wordSize(type);

// Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
- size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
+ size_t totalnbytes = std::max(args->sendBytes, args->expectedBytes);
size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
size_t shift = totalnbytes * (iter % steps);

@@ -597,7 +597,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
setupArgs(size, type, args);
char rootName[100];
sprintf(rootName, "%6i", root);
- PRINT("%12li %12li %8s %6s %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
+ PRINT("%12li %12li %8s %6s %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
TESTCHECK(BenchTime(args, type, op, root, 0));
TESTCHECK(BenchTime(args, type, op, root, 1));
PRINT("\n");
diff --git a/src/gather.cu b/src/gather.cu
index 03ef4d9..242a298 100644
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -108,9 +108,7 @@ testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t typ
return testSuccess;
}

-struct testEngine gatherEngine = {
+struct testEngine ncclTestEngine = {
GatherGetBuffSize,
GatherRunTest
};
-
-#pragma weak ncclTestEngine=gatherEngine
diff --git a/src/hypercube.cu b/src/hypercube.cu
index 5c1456f..9aadfc5 100644
--- a/src/hypercube.cu
+++ b/src/hypercube.cu
@@ -110,9 +110,7 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t
return testSuccess;
}

-struct testEngine hyperCubeEngine = {
+struct testEngine ncclTestEngine = {
HyperCubeGetBuffSize,
HyperCubeRunTest
};
-
-#pragma weak ncclTestEngine=hyperCubeEngine
diff --git a/src/reduce.cu b/src/reduce.cu
index f2fa80d..80aadc5 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -102,9 +102,7 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
return testSuccess;
}

-struct testEngine reduceEngine = {
+struct testEngine ncclTestEngine = {
ReduceGetBuffSize,
ReduceRunTest
};
-
-#pragma weak ncclTestEngine=reduceEngine
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index ed372e3..212a6f0 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -97,9 +97,7 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
return testSuccess;
}

-struct testEngine reduceScatterEngine = {
+struct testEngine ncclTestEngine = {
ReduceScatterGetBuffSize,
ReduceScatterRunTest
};
-
-#pragma weak ncclTestEngine=reduceScatterEngine
diff --git a/src/scatter.cu b/src/scatter.cu
index 49d20e1..56f5ede 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -104,9 +104,7 @@ testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t ty
return testSuccess;
}

-struct testEngine scatterEngine = {
+struct testEngine ncclTestEngine = {
ScatterGetBuffSize,
ScatterRunTest
};
-
-#pragma weak ncclTestEngine=scatterEngine
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
index c9eb5bb..316a449 100644
--- a/src/sendrecv.cu
+++ b/src/sendrecv.cu
@@ -106,9 +106,7 @@ testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t t
return testSuccess;
}

-struct testEngine sendRecvEngine = {
+struct testEngine ncclTestEngine = {
SendRecvGetBuffSize,
SendRecvRunTest
};
-
-#pragma weak ncclTestEngine=sendRecvEngine
50 changes: 50 additions & 0 deletions examples/nccl/nccl-tests.BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
load("@rules_cuda//cuda:defs.bzl", "cuda_library")
load("@rules_cuda_examples//nccl:nccl-tests.bzl", "nccl_tests_binary")

# Header-only target: publishes the headers found directly in
# `nccl-tests/src` and adds that directory to the include search path of
# anything depending on it.
cc_library(
    name = "nccl_tests_include",
    hdrs = glob(
        ["nccl-tests/src/*.h"],
    ),
    includes = [
        "nccl-tests/src",
    ],
)

# CUDA code shared by every test: the common driver (`common.cu`) and the
# `verifiable.cu` helper. All headers under `nccl-tests/` are listed as
# sources so the .cu files can include them.
cuda_library(
    name = "common_cuda",
    srcs = [
        "nccl-tests/src/common.cu",
        "nccl-tests/verifiable/verifiable.cu",
    ] + glob(["nccl-tests/**/*.h"]),
    deps = [":nccl_tests_include", "@nccl"],
)

# Host-side C++ shared by every test (the timer). `alwayslink` makes the
# object part of every binary that depends on this target, whether or not a
# symbol from it is referenced.
cc_library(
    name = "common_cc",
    srcs = [
        "nccl-tests/src/timer.cc",
    ],
    hdrs = [
        "nccl-tests/src/timer.h",
    ],
    alwayslink = 1,
)

# One `nccl_tests_binary` per collective. Each call yields a `<name>`
# cuda_library and a `<name>_perf` binary.
# :common_cuda, :common_cc and @nccl//:nccl_shared are hardcoded inside
# `nccl_tests_binary`, which is why no deps are passed here.
[
    nccl_tests_binary(name = collective)
    for collective in [
        "all_reduce",
        "all_gather",
        "broadcast",
        "reduce_scatter",
        "reduce",
        "alltoall",
        "scatter",
        "gather",
        "sendrecv",
        "hypercube",
    ]
]
19 changes: 19 additions & 0 deletions examples/nccl/nccl-tests.bzl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
load("@rules_cuda//cuda:defs.bzl", "cuda_library")

def nccl_tests_binary(name, cc_deps = [], cuda_deps = []):
    """Declares one nccl-tests benchmark: a CUDA library and its `<name>_perf` binary.

    Two targets are created in the calling package:
      * `<name>`: a `cuda_library` compiled from `nccl-tests/src/<name>.cu`.
      * `<name>_perf`: a publicly visible `cc_binary` linking that library
        together with `:common_cc`.

    Args:
        name: Base name of the test. Selects the `.cu` source file and names
            both generated targets.
        cc_deps: Extra labels appended to the `cc_binary`'s `deps`.
        cuda_deps: Extra labels appended to the `cuda_library`'s `deps`.
    """
    cuda_library(
        name = name,
        srcs = ["nccl-tests/src/{}.cu".format(name)],
        deps = [
            "@nccl//:nccl_shared",
            ":common_cuda",
        ] + cuda_deps,
        # Keep this object in the final link unconditionally.
        # NOTE(review): presumably required so the test's `ncclTestEngine`
        # definition is not dropped by the linker — confirm.
        alwayslink = 1,
    )

    native.cc_binary(
        name = name + "_perf",
        deps = [":common_cc", ":" + name] + cc_deps,
        visibility = ["//visibility:public"],
    )
Loading