From cb0bf63b20f5c440addd8ec51e4c31f9574d7f58 Mon Sep 17 00:00:00 2001 From: Cloud Han Date: Sun, 30 Jul 2023 11:23:00 +0800 Subject: [PATCH 1/4] Add example --- examples/nccl/BUILD.bazel | 22 +++++ examples/nccl/WORKSPACE.bazel | 46 ++++++++++ examples/nccl/nccl-tests.BUILD | 48 ++++++++++ examples/nccl/nccl-tests.bzl | 19 ++++ examples/nccl/nccl.BUILD | 162 +++++++++++++++++++++++++++++++++ examples/nccl/nccl.bzl | 43 +++++++++ 6 files changed, 340 insertions(+) create mode 100644 examples/nccl/BUILD.bazel create mode 100644 examples/nccl/WORKSPACE.bazel create mode 100644 examples/nccl/nccl-tests.BUILD create mode 100644 examples/nccl/nccl-tests.bzl create mode 100644 examples/nccl/nccl.BUILD create mode 100644 examples/nccl/nccl.bzl diff --git a/examples/nccl/BUILD.bazel b/examples/nccl/BUILD.bazel new file mode 100644 index 00000000..b5446b9a --- /dev/null +++ b/examples/nccl/BUILD.bazel @@ -0,0 +1,22 @@ +filegroup( + name = "nccl_shared", + srcs = [ + "@nccl//:nccl_shared", + ], +) + +filegroup( + name = "perf_binaries", + srcs = [ + "@nccl-tests//:all_gather_perf", + "@nccl-tests//:all_reduce_perf", + "@nccl-tests//:alltoall_perf", + "@nccl-tests//:broadcast_perf", + "@nccl-tests//:gather_perf", + "@nccl-tests//:hypercube_perf", + "@nccl-tests//:reduce_perf", + "@nccl-tests//:reduce_scatter_perf", + "@nccl-tests//:scatter_perf", + "@nccl-tests//:sendrecv_perf", + ], +) diff --git a/examples/nccl/WORKSPACE.bazel b/examples/nccl/WORKSPACE.bazel new file mode 100644 index 00000000..bde03daf --- /dev/null +++ b/examples/nccl/WORKSPACE.bazel @@ -0,0 +1,46 @@ +workspace(name = "rules_cuda_examples_nccl") + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +local_repository( + name = "rules_cuda", + path = "../../", +) + +load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains", "rules_cuda_dependencies") + +rules_cuda_dependencies() + +register_detected_cuda_toolchains() + +################################# +# Dependencies for this example # +################################# +http_archive( + name = "bazel_skylib", + sha256 = "2e6fa9a61db799266072df115a719a14a9af0e8a630b1f770ef0bd757e68cd71", + strip_prefix = "bazel-skylib-de3035d605b4c89a62d6da060188e4ab0c5034b9", + urls = ["https://github.com/bazelbuild/bazel-skylib/archive/de3035d605b4c89a62d6da060188e4ab0c5034b9.tar.gz"], +) + +load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace") + +bazel_skylib_workspace() + +http_archive( + name = "nccl", + add_prefix = "nccl", + build_file = "@rules_cuda_examples_nccl//:nccl.BUILD", + sha256 = "83b299cfc2dfe63887dadf3590b3ac2b8b2fd68ec5515b6878774eda39a697d2", + strip_prefix = "nccl-9814c75eea18fc7374cde884592233b6b7dc055b", + urls = ["https://github.com/nvidia/nccl/archive/9814c75eea18fc7374cde884592233b6b7dc055b.tar.gz"], +) + +http_archive( + name = "nccl-tests", + add_prefix = "nccl-tests", + build_file = "@rules_cuda_examples_nccl//:nccl-tests.BUILD", + sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0", + strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa", + urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"], +) diff --git a/examples/nccl/nccl-tests.BUILD b/examples/nccl/nccl-tests.BUILD new file mode 100644 index 00000000..4098ca0b --- /dev/null +++ b/examples/nccl/nccl-tests.BUILD @@ -0,0 +1,48 @@ +load("@rules_cuda//cuda:defs.bzl", "cuda_library") +load("@rules_cuda_examples_nccl//:nccl-tests.bzl", "nccl_tests_binary") + +cc_library( + name = "nccl_tests_include", + hdrs = glob(["nccl-tests/src/*.h"]), + includes = ["nccl-tests/src"], +) + +cuda_library( + name = "common_cuda", + srcs = [ + "nccl-tests/src/common.cu", + "nccl-tests/verifiable/verifiable.cu", + ], + deps = [ + ":nccl_tests_include", + "@nccl", + ], +) + +cc_library( + name = "common_cc", + srcs = ["nccl-tests/src/timer.cc"], + hdrs = ["nccl-tests/src/timer.h"], + alwayslink = 1, +) + +# :common_cuda, :common_cc and @nccl//:nccl_shared are implicitly hardcoded in `nccl_tests_binary` +nccl_tests_binary(name = "all_reduce") + +nccl_tests_binary(name = "all_gather") + +nccl_tests_binary(name = "broadcast") + +nccl_tests_binary(name = "reduce_scatter") + +nccl_tests_binary(name = "reduce") + +nccl_tests_binary(name = "alltoall") + +nccl_tests_binary(name = "scatter") + +nccl_tests_binary(name = "gather") + +nccl_tests_binary(name = "sendrecv") + +nccl_tests_binary(name = "hypercube") diff --git a/examples/nccl/nccl-tests.bzl b/examples/nccl/nccl-tests.bzl new file mode 100644 index 00000000..48229031 --- /dev/null +++ b/examples/nccl/nccl-tests.bzl @@ -0,0 +1,19 @@ +load("@rules_cuda//cuda:defs.bzl", "cuda_library") + +def nccl_tests_binary(name, cc_deps = [], cuda_deps = []): + cuda_library( + name = name, + srcs = ["nccl-tests/src/{}.cu".format(name)], + deps = [ + "@nccl//:nccl_shared", + ":common_cuda", + ], + alwayslink = 1, + ) + + bin_name = name + "_perf" + native.cc_binary( + name = bin_name, + deps = [":common_cc", ":" + name], + visibility = ["//visibility:public"], + ) diff --git a/examples/nccl/nccl.BUILD b/examples/nccl/nccl.BUILD new file mode 100644 index 00000000..d0cb6bbc --- /dev/null +++ b/examples/nccl/nccl.BUILD @@ -0,0 +1,162 @@ +load("@bazel_skylib//rules:expand_template.bzl", "expand_template") +load("@rules_cuda//cuda:defs.bzl", "cuda_library", "cuda_objects") +load("@rules_cuda_examples_nccl//:nccl.bzl", "if_cuda_clang", "if_cuda_nvcc", "nccl_primitive") + +expand_template( + name = "nccl_h", + out = "nccl/src/include/nccl.h", + substitutions = { + "${nccl:Major}": "2", + "${nccl:Minor}": "18", + "${nccl:Patch}": "3", + "${nccl:Suffix}": "", + # NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z)) + "${nccl:Version}": "21803", + }, + template = "nccl/src/nccl.h.in", +) + +cc_library( + name = "nccl_include", + hdrs = [ + ":nccl_h", + ] + glob([ + "nccl/src/include/**/*.h", + "nccl/src/include/**/*.hpp", + ]), + includes = [ + # this will add both nccl/src/include in repo and + # bazel-out//bin/nccl/src/include to include paths + # so the previous expand_template generate nccl.h to the very path! + "nccl/src/include", + ], +) + +cuda_objects( + name = "nccl_device_common", + srcs = [ + "nccl/src/collectives/device/functions.cu", + "nccl/src/collectives/device/onerank_reduce.cu", + ], + # hdrs = hdrs, + copts = if_cuda_nvcc(["--extended-lambda"]), + ptxasopts = ["-maxrregcount=96"], + deps = [":nccl_include"], +) + +# must be manually disabled if cuda version is lower than 11. +USE_BF16 = True + +filegroup( + name = "collective_dev_hdrs", + srcs = [ + "nccl/src/collectives/device/all_gather.h", + "nccl/src/collectives/device/all_reduce.h", + "nccl/src/collectives/device/broadcast.h", + "nccl/src/collectives/device/common.h", + "nccl/src/collectives/device/common_kernel.h", + "nccl/src/collectives/device/gen_rules.sh", + "nccl/src/collectives/device/op128.h", + "nccl/src/collectives/device/primitives.h", + "nccl/src/collectives/device/prims_ll.h", + "nccl/src/collectives/device/prims_ll128.h", + "nccl/src/collectives/device/prims_simple.h", + "nccl/src/collectives/device/reduce.h", + "nccl/src/collectives/device/reduce_kernel.h", + "nccl/src/collectives/device/reduce_scatter.h", + "nccl/src/collectives/device/sendrecv.h", + ], +) + +# cuda_objects for each type of primitive +nccl_primitive( + name = "all_gather", + hdrs = ["collective_dev_hdrs"], + use_bf16 = USE_BF16, + deps = [":nccl_device_common"], +) + +nccl_primitive( + name = "all_reduce", + hdrs = ["collective_dev_hdrs"], + use_bf16 = USE_BF16, + deps = [":nccl_device_common"], +) + +nccl_primitive( + name = "broadcast", + hdrs = ["collective_dev_hdrs"], + use_bf16 = USE_BF16, + deps = [":nccl_device_common"], +) + +nccl_primitive( + name = "reduce", + hdrs = ["collective_dev_hdrs"], + use_bf16 = USE_BF16, + deps = [":nccl_device_common"], +) + +nccl_primitive( + name = "reduce_scatter", + hdrs = ["collective_dev_hdrs"], + use_bf16 = USE_BF16, + deps = [":nccl_device_common"], +) + +nccl_primitive( + name = "sendrecv", + hdrs = ["collective_dev_hdrs"], + use_bf16 = USE_BF16, + deps = [":nccl_device_common"], +) + +# device link +cuda_library( + name = "collectives", + rdc = 1, + deps = [ + ":all_gather", + ":all_reduce", + ":broadcast", + ":reduce", + ":reduce_scatter", + ":sendrecv", + ], + alwayslink = 1, +) + +cc_binary( + name = "nccl", + srcs = glob( + [ + "nccl/src/*.cc", + "nccl/src/collectives/*.cc", + "nccl/src/graph/*.cc", + "nccl/src/graph/*.h", + "nccl/src/misc/*.cc", + "nccl/src/transport/*.cc", + ], + exclude = [ + # https://github.com/NVIDIA/nccl/issues/658 + "nccl/src/enhcompat.cc", + ], + ), + copts = if_cuda_clang(["-xcu"]), + linkshared = 1, + linkstatic = 1, + deps = [ + ":collectives", + ":nccl_include", + "@rules_cuda//cuda:runtime", + ], + visibility = ["//visibility:public"], +) + +# To allow downstream targets to link with the nccl shared library, we need to `cc_import` it again. +# See https://groups.google.com/g/bazel-discuss/c/RtbidPdVFyU/m/TsUDOVHIAwAJ +cc_import( + name = "nccl_shared", + shared_library = ":nccl", + visibility = ["//visibility:public"], +) diff --git a/examples/nccl/nccl.bzl b/examples/nccl/nccl.bzl new file mode 100644 index 00000000..8a8553c8 --- /dev/null +++ b/examples/nccl/nccl.bzl @@ -0,0 +1,43 @@ +load("@bazel_skylib//rules:copy_file.bzl", "copy_file") +load("@rules_cuda//cuda:defs.bzl", "cuda_library", "cuda_objects") + +def if_cuda_nvcc(if_true, if_false = []): + return select({ + "@rules_cuda//cuda:compiler_is_nvcc": if_true, + "//conditions:default": if_false, + }) + +def if_cuda_clang(if_true, if_false = []): + return select({ + "@rules_cuda//cuda:compiler_is_clang": if_true, + "//conditions:default": if_false, + }) + +def nccl_primitive(name, hdrs = [], deps = [], use_bf16 = True): + ops = ["sum", "prod", "min", "max", "premulsum", "sumpostdiv"] + datatypes = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"] + if use_bf16: + datatypes.append("bf16") + + intermediate_targets = [] + for opn, op in enumerate(ops): + for dtn, dt in enumerate(datatypes): + name_op_dt = "{}_{}_{}".format(name, op, dt) + copy_file( + name = name_op_dt + "_rename", + src = "nccl/src/collectives/device/{}.cu".format(name), + out = "nccl/src/collectives/device/{}.cu".format(name_op_dt), + ) + + cuda_objects( + name = name_op_dt, + srcs = [":{}_rename".format(name_op_dt)], + hdrs = hdrs, + deps = deps, + ptxasopts = ["-maxrregcount=96"], + defines = ["NCCL_OP={}".format(opn), "NCCL_TYPE={}".format(dtn)], + includes = ["nccl/src/collectives/device"], + ) + intermediate_targets.append(":" + name_op_dt) + + cuda_objects(name = name, deps = intermediate_targets) From 282da47c3e8dc7412c2799b9e1fb72003e12787f Mon Sep 17 00:00:00 2001 From: Cloud Han Date: Sat, 2 Sep 2023 22:01:57 +0800 Subject: [PATCH 2/4] Move nccl deps to examples repo --- examples/WORKSPACE.bazel | 23 +++++++++++++++++ examples/nccl/WORKSPACE.bazel | 46 ---------------------------------- examples/nccl/nccl-tests.BUILD | 6 +++-- examples/nccl/nccl.BUILD | 9 ++++--- 4 files changed, 32 insertions(+), 52 deletions(-) delete mode 100644 examples/nccl/WORKSPACE.bazel diff --git a/examples/WORKSPACE.bazel b/examples/WORKSPACE.bazel index f79eaaf7..b35a9c23 100644 --- a/examples/WORKSPACE.bazel +++ b/examples/WORKSPACE.bazel @@ -17,3 +17,26 @@ load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains", rules_cuda_dependencies() register_detected_cuda_toolchains() + +################################# +# Dependencies for nccl example # +################################# +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +http_archive( + name = "nccl", + add_prefix = "nccl", + build_file = "@rules_cuda_examples//nccl:nccl.BUILD", + sha256 = "83b299cfc2dfe63887dadf3590b3ac2b8b2fd68ec5515b6878774eda39a697d2", + strip_prefix = "nccl-9814c75eea18fc7374cde884592233b6b7dc055b", + urls = ["https://github.com/nvidia/nccl/archive/9814c75eea18fc7374cde884592233b6b7dc055b.tar.gz"], +) + +http_archive( + name = "nccl-tests", + add_prefix = "nccl-tests", + build_file = "@rules_cuda_examples//nccl:nccl-tests.BUILD", + sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0", + strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa", + urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"], +) diff --git a/examples/nccl/WORKSPACE.bazel b/examples/nccl/WORKSPACE.bazel deleted file mode 100644 index bde03daf..00000000 --- a/examples/nccl/WORKSPACE.bazel +++ /dev/null @@ -1,46 +0,0 @@ -workspace(name = "rules_cuda_examples_nccl") - -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") - -local_repository( - name = "rules_cuda", - path = "../../", -) - -load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains", "rules_cuda_dependencies") - -rules_cuda_dependencies() - -register_detected_cuda_toolchains() - -################################# -# Dependencies for this example # -################################# -http_archive( - name = "bazel_skylib", - sha256 = "2e6fa9a61db799266072df115a719a14a9af0e8a630b1f770ef0bd757e68cd71", - strip_prefix = "bazel-skylib-de3035d605b4c89a62d6da060188e4ab0c5034b9", - urls = ["https://github.com/bazelbuild/bazel-skylib/archive/de3035d605b4c89a62d6da060188e4ab0c5034b9.tar.gz"], -) - -load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace") - -bazel_skylib_workspace() - -http_archive( - name = "nccl", - add_prefix = "nccl", - build_file = "@rules_cuda_examples_nccl//:nccl.BUILD", - sha256 = "83b299cfc2dfe63887dadf3590b3ac2b8b2fd68ec5515b6878774eda39a697d2", - strip_prefix = "nccl-9814c75eea18fc7374cde884592233b6b7dc055b", - urls = ["https://github.com/nvidia/nccl/archive/9814c75eea18fc7374cde884592233b6b7dc055b.tar.gz"], -) - -http_archive( - name = "nccl-tests", - add_prefix = "nccl-tests", - build_file = "@rules_cuda_examples_nccl//:nccl-tests.BUILD", - sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0", - strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa", - urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"], -) diff --git a/examples/nccl/nccl-tests.BUILD b/examples/nccl/nccl-tests.BUILD index 4098ca0b..f482e6db 100644 --- a/examples/nccl/nccl-tests.BUILD +++ b/examples/nccl/nccl-tests.BUILD @@ -1,5 +1,5 @@ load("@rules_cuda//cuda:defs.bzl", "cuda_library") -load("@rules_cuda_examples_nccl//:nccl-tests.bzl", "nccl_tests_binary") +load("@rules_cuda_examples//nccl:nccl-tests.bzl", "nccl_tests_binary") cc_library( name = "nccl_tests_include", @@ -12,7 +12,9 @@ cuda_library( srcs = [ "nccl-tests/src/common.cu", "nccl-tests/verifiable/verifiable.cu", - ], + ] + glob([ + "nccl-tests/**/*.h", + ]), deps = [ ":nccl_tests_include", "@nccl", diff --git a/examples/nccl/nccl.BUILD b/examples/nccl/nccl.BUILD index d0cb6bbc..98f36117 100644 --- a/examples/nccl/nccl.BUILD +++ b/examples/nccl/nccl.BUILD @@ -1,6 +1,6 @@ load("@bazel_skylib//rules:expand_template.bzl", "expand_template") load("@rules_cuda//cuda:defs.bzl", "cuda_library", "cuda_objects") -load("@rules_cuda_examples_nccl//:nccl.bzl", "if_cuda_clang", "if_cuda_nvcc", "nccl_primitive") +load("@rules_cuda_examples//nccl:nccl.bzl", "if_cuda_clang", "if_cuda_nvcc", "nccl_primitive") expand_template( name = "nccl_h", @@ -37,8 +37,9 @@ cuda_objects( srcs = [ "nccl/src/collectives/device/functions.cu", "nccl/src/collectives/device/onerank_reduce.cu", - ], - # hdrs = hdrs, + ] + glob([ + "nccl/src/collectives/device/**/*.h", + ]), copts = if_cuda_nvcc(["--extended-lambda"]), ptxasopts = ["-maxrregcount=96"], deps = [":nccl_include"], @@ -145,12 +146,12 @@ cc_binary( copts = if_cuda_clang(["-xcu"]), linkshared = 1, linkstatic = 1, + visibility = ["//visibility:public"], deps = [ ":collectives", ":nccl_include", "@rules_cuda//cuda:runtime", ], - visibility = ["//visibility:public"], ) # To allow downstream targets to link with the nccl shared library, we need to `cc_import` it again. From 0216f8c0e3af3106b5f519bf74de9cfa696f42d6 Mon Sep 17 00:00:00 2001 From: Cloud Han Date: Mon, 4 Sep 2023 01:20:04 +0800 Subject: [PATCH 3/4] Fix clang --- examples/WORKSPACE.bazel | 2 + examples/nccl/nccl-tests-clang.patch | 172 +++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 examples/nccl/nccl-tests-clang.patch diff --git a/examples/WORKSPACE.bazel b/examples/WORKSPACE.bazel index b35a9c23..51346df9 100644 --- a/examples/WORKSPACE.bazel +++ b/examples/WORKSPACE.bazel @@ -36,6 +36,8 @@ http_archive( name = "nccl-tests", add_prefix = "nccl-tests", build_file = "@rules_cuda_examples//nccl:nccl-tests.BUILD", + patches = ["@rules_cuda_examples//nccl:nccl-tests-clang.patch"], + patch_args = ["--directory=nccl-tests", "-p1"], sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0", strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa", urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"], diff --git a/examples/nccl/nccl-tests-clang.patch b/examples/nccl/nccl-tests-clang.patch new file mode 100644 index 00000000..9d6d60d6 --- /dev/null +++ b/examples/nccl/nccl-tests-clang.patch @@ -0,0 +1,172 @@ +diff --git a/src/all_gather.cu b/src/all_gather.cu +index 0831207..941ec1b 100644 +--- a/src/all_gather.cu ++++ b/src/all_gather.cu +@@ -85,9 +85,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t + return testSuccess; + } + +-struct testEngine allGatherEngine = { ++struct testEngine ncclTestEngine = { + AllGatherGetBuffSize, + AllGatherRunTest + }; +- +-#pragma weak ncclTestEngine=allGatherEngine +diff --git a/src/all_reduce.cu b/src/all_reduce.cu +index a38eabe..acb66a8 100644 +--- a/src/all_reduce.cu ++++ b/src/all_reduce.cu +@@ -93,9 +93,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t + return testSuccess; + } + +-struct testEngine allReduceEngine = { ++struct testEngine ncclTestEngine = { + AllReduceGetBuffSize, + AllReduceRunTest + }; +- +-#pragma weak ncclTestEngine=allReduceEngine +diff --git a/src/alltoall.cu b/src/alltoall.cu +index 41c7c4a..712e664 100644 +--- a/src/alltoall.cu ++++ b/src/alltoall.cu +@@ -99,9 +99,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t + return testSuccess; + } + +-struct testEngine alltoAllEngine = { ++struct testEngine ncclTestEngine = { + AlltoAllGetBuffSize, + AlltoAllRunTest + }; +- +-#pragma weak ncclTestEngine=alltoAllEngine +diff --git a/src/broadcast.cu b/src/broadcast.cu +index 903066a..778c664 100644 +--- a/src/broadcast.cu ++++ b/src/broadcast.cu +@@ -99,9 +99,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t + return testSuccess; + } + +-struct testEngine broadcastEngine = { ++struct testEngine ncclTestEngine = { + BroadcastGetBuffSize, + BroadcastRunTest + }; +- +-#pragma weak ncclTestEngine=broadcastEngine +diff --git a/src/common.cu b/src/common.cu +index 48a629c..d888edc 100644 +--- a/src/common.cu ++++ b/src/common.cu +@@ -330,7 +330,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t + size_t count = args->nbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange +- size_t totalnbytes = max(args->sendBytes, args->expectedBytes); ++ size_t totalnbytes = std::max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + +@@ -597,7 +597,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* + setupArgs(size, type, args); + char rootName[100]; + sprintf(rootName, "%6i", root); +- PRINT("%12li %12li %8s %6s %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName); ++ PRINT("%12li %12li %8s %6s %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName); + TESTCHECK(BenchTime(args, type, op, root, 0)); + TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); +diff --git a/src/gather.cu b/src/gather.cu +index 03ef4d9..242a298 100644 +--- a/src/gather.cu ++++ b/src/gather.cu +@@ -108,9 +108,7 @@ testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t typ + return testSuccess; + } + +-struct testEngine gatherEngine = { ++struct testEngine ncclTestEngine = { + GatherGetBuffSize, + GatherRunTest + }; +- +-#pragma weak ncclTestEngine=gatherEngine +diff --git a/src/hypercube.cu b/src/hypercube.cu +index 5c1456f..9aadfc5 100644 +--- a/src/hypercube.cu ++++ b/src/hypercube.cu +@@ -110,9 +110,7 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t + return testSuccess; + } + +-struct testEngine hyperCubeEngine = { ++struct testEngine ncclTestEngine = { + HyperCubeGetBuffSize, + HyperCubeRunTest + }; +- +-#pragma weak ncclTestEngine=hyperCubeEngine +diff --git a/src/reduce.cu b/src/reduce.cu +index f2fa80d..80aadc5 100644 +--- a/src/reduce.cu ++++ b/src/reduce.cu +@@ -102,9 +102,7 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ + return testSuccess; + } + +-struct testEngine reduceEngine = { ++struct testEngine ncclTestEngine = { + ReduceGetBuffSize, + ReduceRunTest + }; +- +-#pragma weak ncclTestEngine=reduceEngine +diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu +index ed372e3..212a6f0 100644 +--- a/src/reduce_scatter.cu ++++ b/src/reduce_scatter.cu +@@ -97,9 +97,7 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp + return testSuccess; + } + +-struct testEngine reduceScatterEngine = { ++struct testEngine ncclTestEngine = { + ReduceScatterGetBuffSize, + ReduceScatterRunTest + }; +- +-#pragma weak ncclTestEngine=reduceScatterEngine +diff --git a/src/scatter.cu b/src/scatter.cu +index 49d20e1..56f5ede 100644 +--- a/src/scatter.cu ++++ b/src/scatter.cu +@@ -104,9 +104,7 @@ testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t ty + return testSuccess; + } + +-struct testEngine scatterEngine = { ++struct testEngine ncclTestEngine = { + ScatterGetBuffSize, + ScatterRunTest + }; +- +-#pragma weak ncclTestEngine=scatterEngine +diff --git a/src/sendrecv.cu b/src/sendrecv.cu +index c9eb5bb..316a449 100644 +--- a/src/sendrecv.cu ++++ b/src/sendrecv.cu +@@ -106,9 +106,7 @@ testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t t + return testSuccess; + } + +-struct testEngine sendRecvEngine = { ++struct testEngine ncclTestEngine = { + SendRecvGetBuffSize, + SendRecvRunTest + }; +- +-#pragma weak ncclTestEngine=sendRecvEngine From 948eae48c39b06eeb994e6807fe919df278f004d Mon Sep 17 00:00:00 2001 From: Cloud Han Date: Mon, 4 Sep 2023 01:31:09 +0800 Subject: [PATCH 4/4] Address formatting --- examples/WORKSPACE.bazel | 5 ++++- examples/nccl/nccl.bzl | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/WORKSPACE.bazel b/examples/WORKSPACE.bazel index 51346df9..7b6b5cb3 100644 --- a/examples/WORKSPACE.bazel +++ b/examples/WORKSPACE.bazel @@ -36,8 +36,11 @@ http_archive( name = "nccl-tests", add_prefix = "nccl-tests", build_file = "@rules_cuda_examples//nccl:nccl-tests.BUILD", + patch_args = [ + "--directory=nccl-tests", + "-p1", + ], patches = ["@rules_cuda_examples//nccl:nccl-tests-clang.patch"], - patch_args = ["--directory=nccl-tests", "-p1"], sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0", strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa", urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"], diff --git a/examples/nccl/nccl.bzl b/examples/nccl/nccl.bzl index 8a8553c8..e2758f27 100644 --- a/examples/nccl/nccl.bzl +++ b/examples/nccl/nccl.bzl @@ -13,7 +13,7 @@ def if_cuda_clang(if_true, if_false = []): "//conditions:default": if_false, }) -def nccl_primitive(name, hdrs = [], deps = [], use_bf16 = True): +def nccl_primitive(name, hdrs = [], deps = [], use_bf16 = True): ops = ["sum", "prod", "min", "max", "premulsum", "sumpostdiv"] datatypes = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"] if use_bf16: