From cb0bf63b20f5c440addd8ec51e4c31f9574d7f58 Mon Sep 17 00:00:00 2001
From: Cloud Han <cloudhan@outlook.com>
Date: Sun, 30 Jul 2023 11:23:00 +0800
Subject: [PATCH 1/4] Add example

---
 examples/nccl/BUILD.bazel      |  22 +++++
 examples/nccl/WORKSPACE.bazel  |  46 ++++++++++
 examples/nccl/nccl-tests.BUILD |  48 ++++++++++
 examples/nccl/nccl-tests.bzl   |  19 ++++
 examples/nccl/nccl.BUILD       | 162 +++++++++++++++++++++++++++++++++
 examples/nccl/nccl.bzl         |  43 +++++++++
 6 files changed, 340 insertions(+)
 create mode 100644 examples/nccl/BUILD.bazel
 create mode 100644 examples/nccl/WORKSPACE.bazel
 create mode 100644 examples/nccl/nccl-tests.BUILD
 create mode 100644 examples/nccl/nccl-tests.bzl
 create mode 100644 examples/nccl/nccl.BUILD
 create mode 100644 examples/nccl/nccl.bzl

diff --git a/examples/nccl/BUILD.bazel b/examples/nccl/BUILD.bazel
new file mode 100644
index 00000000..b5446b9a
--- /dev/null
+++ b/examples/nccl/BUILD.bazel
@@ -0,0 +1,22 @@
+filegroup(
+    name = "nccl_shared",
+    srcs = [
+        "@nccl//:nccl_shared",
+    ],
+)
+
+filegroup(
+    name = "perf_binaries",
+    srcs = [
+        "@nccl-tests//:all_gather_perf",
+        "@nccl-tests//:all_reduce_perf",
+        "@nccl-tests//:alltoall_perf",
+        "@nccl-tests//:broadcast_perf",
+        "@nccl-tests//:gather_perf",
+        "@nccl-tests//:hypercube_perf",
+        "@nccl-tests//:reduce_perf",
+        "@nccl-tests//:reduce_scatter_perf",
+        "@nccl-tests//:scatter_perf",
+        "@nccl-tests//:sendrecv_perf",
+    ],
+)
diff --git a/examples/nccl/WORKSPACE.bazel b/examples/nccl/WORKSPACE.bazel
new file mode 100644
index 00000000..bde03daf
--- /dev/null
+++ b/examples/nccl/WORKSPACE.bazel
@@ -0,0 +1,46 @@
+workspace(name = "rules_cuda_examples_nccl")
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+local_repository(
+    name = "rules_cuda",
+    path = "../../",
+)
+
+load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains", "rules_cuda_dependencies")
+
+rules_cuda_dependencies()
+
+register_detected_cuda_toolchains()
+
+#################################
+# Dependencies for this example #
+#################################
+http_archive(
+    name = "bazel_skylib",
+    sha256 = "2e6fa9a61db799266072df115a719a14a9af0e8a630b1f770ef0bd757e68cd71",
+    strip_prefix = "bazel-skylib-de3035d605b4c89a62d6da060188e4ab0c5034b9",
+    urls = ["https://github.com/bazelbuild/bazel-skylib/archive/de3035d605b4c89a62d6da060188e4ab0c5034b9.tar.gz"],
+)
+
+load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace")
+
+bazel_skylib_workspace()
+
+http_archive(
+    name = "nccl",
+    add_prefix = "nccl",
+    build_file = "@rules_cuda_examples_nccl//:nccl.BUILD",
+    sha256 = "83b299cfc2dfe63887dadf3590b3ac2b8b2fd68ec5515b6878774eda39a697d2",
+    strip_prefix = "nccl-9814c75eea18fc7374cde884592233b6b7dc055b",
+    urls = ["https://github.com/nvidia/nccl/archive/9814c75eea18fc7374cde884592233b6b7dc055b.tar.gz"],
+)
+
+http_archive(
+    name = "nccl-tests",
+    add_prefix = "nccl-tests",
+    build_file = "@rules_cuda_examples_nccl//:nccl-tests.BUILD",
+    sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0",
+    strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa",
+    urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"],
+)
diff --git a/examples/nccl/nccl-tests.BUILD b/examples/nccl/nccl-tests.BUILD
new file mode 100644
index 00000000..4098ca0b
--- /dev/null
+++ b/examples/nccl/nccl-tests.BUILD
@@ -0,0 +1,48 @@
+load("@rules_cuda//cuda:defs.bzl", "cuda_library")
+load("@rules_cuda_examples_nccl//:nccl-tests.bzl", "nccl_tests_binary")
+
+cc_library(
+    name = "nccl_tests_include",
+    hdrs = glob(["nccl-tests/src/*.h"]),
+    includes = ["nccl-tests/src"],
+)
+
+cuda_library(
+    name = "common_cuda",
+    srcs = [
+        "nccl-tests/src/common.cu",
+        "nccl-tests/verifiable/verifiable.cu",
+    ],
+    deps = [
+        ":nccl_tests_include",
+        "@nccl",
+    ],
+)
+
+cc_library(
+    name = "common_cc",
+    srcs = ["nccl-tests/src/timer.cc"],
+    hdrs = ["nccl-tests/src/timer.h"],
+    alwayslink = 1,
+)
+
+# :common_cuda, :common_cc and @nccl//:nccl_shared are hardcoded as deps inside `nccl_tests_binary`.
+nccl_tests_binary(name = "all_reduce")
+
+nccl_tests_binary(name = "all_gather")
+
+nccl_tests_binary(name = "broadcast")
+
+nccl_tests_binary(name = "reduce_scatter")
+
+nccl_tests_binary(name = "reduce")
+
+nccl_tests_binary(name = "alltoall")
+
+nccl_tests_binary(name = "scatter")
+
+nccl_tests_binary(name = "gather")
+
+nccl_tests_binary(name = "sendrecv")
+
+nccl_tests_binary(name = "hypercube")
diff --git a/examples/nccl/nccl-tests.bzl b/examples/nccl/nccl-tests.bzl
new file mode 100644
index 00000000..48229031
--- /dev/null
+++ b/examples/nccl/nccl-tests.bzl
@@ -0,0 +1,19 @@
+load("@rules_cuda//cuda:defs.bzl", "cuda_library")
+
+def nccl_tests_binary(name, cc_deps = [], cuda_deps = []):
+    """Defines cuda_library `name` (nccl-tests/src/<name>.cu) and public cc_binary `<name>_perf`.
+
+    `cuda_deps` are appended to the cuda_library deps, `cc_deps` to the cc_binary deps.
+    """
+    cuda_library(
+        name = name,
+        srcs = ["nccl-tests/src/{}.cu".format(name)],
+        deps = ["@nccl//:nccl_shared", ":common_cuda"] + cuda_deps,
+        alwayslink = 1,
+    )
+
+    native.cc_binary(
+        name = name + "_perf",
+        deps = [":common_cc", ":" + name] + cc_deps,
+        visibility = ["//visibility:public"],
+    )
diff --git a/examples/nccl/nccl.BUILD b/examples/nccl/nccl.BUILD
new file mode 100644
index 00000000..d0cb6bbc
--- /dev/null
+++ b/examples/nccl/nccl.BUILD
@@ -0,0 +1,162 @@
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("@rules_cuda//cuda:defs.bzl", "cuda_library", "cuda_objects")
+load("@rules_cuda_examples_nccl//:nccl.bzl", "if_cuda_clang", "if_cuda_nvcc", "nccl_primitive")
+
+expand_template(
+    name = "nccl_h",
+    out = "nccl/src/include/nccl.h",
+    substitutions = {
+        "${nccl:Major}": "2",
+        "${nccl:Minor}": "18",
+        "${nccl:Patch}": "3",
+        "${nccl:Suffix}": "",
+        # NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z))
+        "${nccl:Version}": "21803",
+    },
+    template = "nccl/src/nccl.h.in",
+)
+
+cc_library(
+    name = "nccl_include",
+    hdrs = [
+        ":nccl_h",
+    ] + glob([
+        "nccl/src/include/**/*.h",
+        "nccl/src/include/**/*.hpp",
+    ]),
+    includes = [
+        # This adds both nccl/src/include in the repo and
+        # bazel-out/<compilation_mode>/bin/nccl/src/include to the include paths,
+        # which is why the expand_template above generates nccl.h at that very path.
+        "nccl/src/include",
+    ],
+)
+
+cuda_objects(
+    name = "nccl_device_common",
+    srcs = [
+        "nccl/src/collectives/device/functions.cu",
+        "nccl/src/collectives/device/onerank_reduce.cu",
+    ],
+    # hdrs = hdrs,
+    copts = if_cuda_nvcc(["--extended-lambda"]),
+    ptxasopts = ["-maxrregcount=96"],
+    deps = [":nccl_include"],
+)
+
+# Must be manually set to False if the CUDA version is lower than 11.
+USE_BF16 = True
+
+filegroup(
+    name = "collective_dev_hdrs",
+    srcs = [
+        "nccl/src/collectives/device/all_gather.h",
+        "nccl/src/collectives/device/all_reduce.h",
+        "nccl/src/collectives/device/broadcast.h",
+        "nccl/src/collectives/device/common.h",
+        "nccl/src/collectives/device/common_kernel.h",
+        "nccl/src/collectives/device/gen_rules.sh",
+        "nccl/src/collectives/device/op128.h",
+        "nccl/src/collectives/device/primitives.h",
+        "nccl/src/collectives/device/prims_ll.h",
+        "nccl/src/collectives/device/prims_ll128.h",
+        "nccl/src/collectives/device/prims_simple.h",
+        "nccl/src/collectives/device/reduce.h",
+        "nccl/src/collectives/device/reduce_kernel.h",
+        "nccl/src/collectives/device/reduce_scatter.h",
+        "nccl/src/collectives/device/sendrecv.h",
+    ],
+)
+
+# cuda_objects for each type of primitive
+nccl_primitive(
+    name = "all_gather",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "all_reduce",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "broadcast",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "reduce",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "reduce_scatter",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "sendrecv",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+# device link
+cuda_library(
+    name = "collectives",
+    rdc = 1,
+    deps = [
+        ":all_gather",
+        ":all_reduce",
+        ":broadcast",
+        ":reduce",
+        ":reduce_scatter",
+        ":sendrecv",
+    ],
+    alwayslink = 1,
+)
+
+cc_binary(
+    name = "nccl",
+    srcs = glob(
+        [
+            "nccl/src/*.cc",
+            "nccl/src/collectives/*.cc",
+            "nccl/src/graph/*.cc",
+            "nccl/src/graph/*.h",
+            "nccl/src/misc/*.cc",
+            "nccl/src/transport/*.cc",
+        ],
+        exclude = [
+            # https://github.com/NVIDIA/nccl/issues/658
+            "nccl/src/enhcompat.cc",
+        ],
+    ),
+    copts = if_cuda_clang(["-xcu"]),
+    linkshared = 1,
+    linkstatic = 1,
+    deps = [
+        ":collectives",
+        ":nccl_include",
+        "@rules_cuda//cuda:runtime",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+# To allow downstream targets to link with the nccl shared library, we need to `cc_import` it again.
+# See https://groups.google.com/g/bazel-discuss/c/RtbidPdVFyU/m/TsUDOVHIAwAJ
+cc_import(
+    name = "nccl_shared",
+    shared_library = ":nccl",
+    visibility = ["//visibility:public"],
+)
diff --git a/examples/nccl/nccl.bzl b/examples/nccl/nccl.bzl
new file mode 100644
index 00000000..8a8553c8
--- /dev/null
+++ b/examples/nccl/nccl.bzl
@@ -0,0 +1,43 @@
+load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
+load("@rules_cuda//cuda:defs.bzl", "cuda_library", "cuda_objects")
+
+def if_cuda_nvcc(if_true, if_false = []):
+    return select({
+        "@rules_cuda//cuda:compiler_is_nvcc": if_true,
+        "//conditions:default": if_false,
+    })
+
+def if_cuda_clang(if_true, if_false = []):
+    return select({
+        "@rules_cuda//cuda:compiler_is_clang": if_true,
+        "//conditions:default": if_false,
+    })
+
+def nccl_primitive(name, hdrs = [],  deps = [], use_bf16 = True):
+    ops = ["sum", "prod", "min", "max", "premulsum", "sumpostdiv"]
+    datatypes = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
+    if use_bf16:
+        datatypes.append("bf16")
+
+    intermediate_targets = []
+    for opn, op in enumerate(ops):
+        for dtn, dt in enumerate(datatypes):
+            name_op_dt = "{}_{}_{}".format(name, op, dt)
+            copy_file(
+                name = name_op_dt + "_rename",
+                src = "nccl/src/collectives/device/{}.cu".format(name),
+                out = "nccl/src/collectives/device/{}.cu".format(name_op_dt),
+            )
+
+            cuda_objects(
+                name = name_op_dt,
+                srcs = [":{}_rename".format(name_op_dt)],
+                hdrs = hdrs,
+                deps = deps,
+                ptxasopts = ["-maxrregcount=96"],
+                defines = ["NCCL_OP={}".format(opn), "NCCL_TYPE={}".format(dtn)],
+                includes = ["nccl/src/collectives/device"],
+            )
+            intermediate_targets.append(":" + name_op_dt)
+
+    cuda_objects(name = name, deps = intermediate_targets)

From 282da47c3e8dc7412c2799b9e1fb72003e12787f Mon Sep 17 00:00:00 2001
From: Cloud Han <cloudhan@outlook.com>
Date: Sat, 2 Sep 2023 22:01:57 +0800
Subject: [PATCH 2/4] Move nccl deps to examples repo

---
 examples/WORKSPACE.bazel       | 23 +++++++++++++++++
 examples/nccl/WORKSPACE.bazel  | 46 ----------------------------------
 examples/nccl/nccl-tests.BUILD |  6 +++--
 examples/nccl/nccl.BUILD       |  9 ++++---
 4 files changed, 32 insertions(+), 52 deletions(-)
 delete mode 100644 examples/nccl/WORKSPACE.bazel

diff --git a/examples/WORKSPACE.bazel b/examples/WORKSPACE.bazel
index f79eaaf7..b35a9c23 100644
--- a/examples/WORKSPACE.bazel
+++ b/examples/WORKSPACE.bazel
@@ -17,3 +17,26 @@ load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains",
 rules_cuda_dependencies()
 
 register_detected_cuda_toolchains()
+
+#################################
+# Dependencies for nccl example #
+#################################
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(
+    name = "nccl",
+    add_prefix = "nccl",
+    build_file = "@rules_cuda_examples//nccl:nccl.BUILD",
+    sha256 = "83b299cfc2dfe63887dadf3590b3ac2b8b2fd68ec5515b6878774eda39a697d2",
+    strip_prefix = "nccl-9814c75eea18fc7374cde884592233b6b7dc055b",
+    urls = ["https://github.com/nvidia/nccl/archive/9814c75eea18fc7374cde884592233b6b7dc055b.tar.gz"],
+)
+
+http_archive(
+    name = "nccl-tests",
+    add_prefix = "nccl-tests",
+    build_file = "@rules_cuda_examples//nccl:nccl-tests.BUILD",
+    sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0",
+    strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa",
+    urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"],
+)
diff --git a/examples/nccl/WORKSPACE.bazel b/examples/nccl/WORKSPACE.bazel
deleted file mode 100644
index bde03daf..00000000
--- a/examples/nccl/WORKSPACE.bazel
+++ /dev/null
@@ -1,46 +0,0 @@
-workspace(name = "rules_cuda_examples_nccl")
-
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
-
-local_repository(
-    name = "rules_cuda",
-    path = "../../",
-)
-
-load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains", "rules_cuda_dependencies")
-
-rules_cuda_dependencies()
-
-register_detected_cuda_toolchains()
-
-#################################
-# Dependencies for this example #
-#################################
-http_archive(
-    name = "bazel_skylib",
-    sha256 = "2e6fa9a61db799266072df115a719a14a9af0e8a630b1f770ef0bd757e68cd71",
-    strip_prefix = "bazel-skylib-de3035d605b4c89a62d6da060188e4ab0c5034b9",
-    urls = ["https://github.com/bazelbuild/bazel-skylib/archive/de3035d605b4c89a62d6da060188e4ab0c5034b9.tar.gz"],
-)
-
-load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace")
-
-bazel_skylib_workspace()
-
-http_archive(
-    name = "nccl",
-    add_prefix = "nccl",
-    build_file = "@rules_cuda_examples_nccl//:nccl.BUILD",
-    sha256 = "83b299cfc2dfe63887dadf3590b3ac2b8b2fd68ec5515b6878774eda39a697d2",
-    strip_prefix = "nccl-9814c75eea18fc7374cde884592233b6b7dc055b",
-    urls = ["https://github.com/nvidia/nccl/archive/9814c75eea18fc7374cde884592233b6b7dc055b.tar.gz"],
-)
-
-http_archive(
-    name = "nccl-tests",
-    add_prefix = "nccl-tests",
-    build_file = "@rules_cuda_examples_nccl//:nccl-tests.BUILD",
-    sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0",
-    strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa",
-    urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"],
-)
diff --git a/examples/nccl/nccl-tests.BUILD b/examples/nccl/nccl-tests.BUILD
index 4098ca0b..f482e6db 100644
--- a/examples/nccl/nccl-tests.BUILD
+++ b/examples/nccl/nccl-tests.BUILD
@@ -1,5 +1,5 @@
 load("@rules_cuda//cuda:defs.bzl", "cuda_library")
-load("@rules_cuda_examples_nccl//:nccl-tests.bzl", "nccl_tests_binary")
+load("@rules_cuda_examples//nccl:nccl-tests.bzl", "nccl_tests_binary")
 
 cc_library(
     name = "nccl_tests_include",
@@ -12,7 +12,9 @@ cuda_library(
     srcs = [
         "nccl-tests/src/common.cu",
         "nccl-tests/verifiable/verifiable.cu",
-    ],
+    ] + glob([
+        "nccl-tests/**/*.h",
+    ]),
     deps = [
         ":nccl_tests_include",
         "@nccl",
diff --git a/examples/nccl/nccl.BUILD b/examples/nccl/nccl.BUILD
index d0cb6bbc..98f36117 100644
--- a/examples/nccl/nccl.BUILD
+++ b/examples/nccl/nccl.BUILD
@@ -1,6 +1,6 @@
 load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 load("@rules_cuda//cuda:defs.bzl", "cuda_library", "cuda_objects")
-load("@rules_cuda_examples_nccl//:nccl.bzl", "if_cuda_clang", "if_cuda_nvcc", "nccl_primitive")
+load("@rules_cuda_examples//nccl:nccl.bzl", "if_cuda_clang", "if_cuda_nvcc", "nccl_primitive")
 
 expand_template(
     name = "nccl_h",
@@ -37,8 +37,9 @@ cuda_objects(
     srcs = [
         "nccl/src/collectives/device/functions.cu",
         "nccl/src/collectives/device/onerank_reduce.cu",
-    ],
-    # hdrs = hdrs,
+    ] + glob([
+        "nccl/src/collectives/device/**/*.h",
+    ]),
     copts = if_cuda_nvcc(["--extended-lambda"]),
     ptxasopts = ["-maxrregcount=96"],
     deps = [":nccl_include"],
@@ -145,12 +146,12 @@ cc_binary(
     copts = if_cuda_clang(["-xcu"]),
     linkshared = 1,
     linkstatic = 1,
+    visibility = ["//visibility:public"],
     deps = [
         ":collectives",
         ":nccl_include",
         "@rules_cuda//cuda:runtime",
     ],
-    visibility = ["//visibility:public"],
 )
 
 # To allow downstream targets to link with the nccl shared library, we need to `cc_import` it again.

From 0216f8c0e3af3106b5f519bf74de9cfa696f42d6 Mon Sep 17 00:00:00 2001
From: Cloud Han <cloudhan@outlook.com>
Date: Mon, 4 Sep 2023 01:20:04 +0800
Subject: [PATCH 3/4] Fix clang

---
 examples/WORKSPACE.bazel             |   2 +
 examples/nccl/nccl-tests-clang.patch | 172 +++++++++++++++++++++++++++
 2 files changed, 174 insertions(+)
 create mode 100644 examples/nccl/nccl-tests-clang.patch

diff --git a/examples/WORKSPACE.bazel b/examples/WORKSPACE.bazel
index b35a9c23..51346df9 100644
--- a/examples/WORKSPACE.bazel
+++ b/examples/WORKSPACE.bazel
@@ -36,6 +36,8 @@ http_archive(
     name = "nccl-tests",
     add_prefix = "nccl-tests",
     build_file = "@rules_cuda_examples//nccl:nccl-tests.BUILD",
+    patches = ["@rules_cuda_examples//nccl:nccl-tests-clang.patch"],
+    patch_args = ["--directory=nccl-tests", "-p1"],
     sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0",
     strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa",
     urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"],
diff --git a/examples/nccl/nccl-tests-clang.patch b/examples/nccl/nccl-tests-clang.patch
new file mode 100644
index 00000000..9d6d60d6
--- /dev/null
+++ b/examples/nccl/nccl-tests-clang.patch
@@ -0,0 +1,172 @@
+diff --git a/src/all_gather.cu b/src/all_gather.cu
+index 0831207..941ec1b 100644
+--- a/src/all_gather.cu
++++ b/src/all_gather.cu
+@@ -85,9 +85,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+ 
+-struct testEngine allGatherEngine = {
++struct testEngine ncclTestEngine = {
+   AllGatherGetBuffSize,
+   AllGatherRunTest
+ };
+-
+-#pragma weak ncclTestEngine=allGatherEngine
+diff --git a/src/all_reduce.cu b/src/all_reduce.cu
+index a38eabe..acb66a8 100644
+--- a/src/all_reduce.cu
++++ b/src/all_reduce.cu
+@@ -93,9 +93,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+ 
+-struct testEngine allReduceEngine = {
++struct testEngine ncclTestEngine = {
+   AllReduceGetBuffSize,
+   AllReduceRunTest
+ };
+-
+-#pragma weak ncclTestEngine=allReduceEngine
+diff --git a/src/alltoall.cu b/src/alltoall.cu
+index 41c7c4a..712e664 100644
+--- a/src/alltoall.cu
++++ b/src/alltoall.cu
+@@ -99,9 +99,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
+   return testSuccess;
+ }
+ 
+-struct testEngine alltoAllEngine = {
++struct testEngine ncclTestEngine = {
+   AlltoAllGetBuffSize,
+   AlltoAllRunTest
+ };
+-
+-#pragma weak ncclTestEngine=alltoAllEngine
+diff --git a/src/broadcast.cu b/src/broadcast.cu
+index 903066a..778c664 100644
+--- a/src/broadcast.cu
++++ b/src/broadcast.cu
+@@ -99,9 +99,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+ 
+-struct testEngine broadcastEngine = {
++struct testEngine ncclTestEngine = {
+   BroadcastGetBuffSize,
+   BroadcastRunTest
+ };
+-
+-#pragma weak ncclTestEngine=broadcastEngine
+diff --git a/src/common.cu b/src/common.cu
+index 48a629c..d888edc 100644
+--- a/src/common.cu
++++ b/src/common.cu
+@@ -330,7 +330,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
+   size_t count = args->nbytes / wordSize(type);
+ 
+   // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
+-  size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
++  size_t totalnbytes = std::max(args->sendBytes, args->expectedBytes);
+   size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+   size_t shift = totalnbytes * (iter % steps);
+ 
+@@ -597,7 +597,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
+       setupArgs(size, type, args);
+       char rootName[100];
+       sprintf(rootName, "%6i", root);
+-      PRINT("%12li  %12li  %8s  %6s  %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
++      PRINT("%12li  %12li  %8s  %6s  %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
+       TESTCHECK(BenchTime(args, type, op, root, 0));
+       TESTCHECK(BenchTime(args, type, op, root, 1));
+       PRINT("\n");
+diff --git a/src/gather.cu b/src/gather.cu
+index 03ef4d9..242a298 100644
+--- a/src/gather.cu
++++ b/src/gather.cu
+@@ -108,9 +108,7 @@ testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t typ
+   return testSuccess;
+ }
+ 
+-struct testEngine gatherEngine = {
++struct testEngine ncclTestEngine = {
+   GatherGetBuffSize,
+   GatherRunTest
+ };
+-
+-#pragma weak ncclTestEngine=gatherEngine
+diff --git a/src/hypercube.cu b/src/hypercube.cu
+index 5c1456f..9aadfc5 100644
+--- a/src/hypercube.cu
++++ b/src/hypercube.cu
+@@ -110,9 +110,7 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+ 
+-struct testEngine hyperCubeEngine = {
++struct testEngine ncclTestEngine = {
+   HyperCubeGetBuffSize,
+   HyperCubeRunTest
+ };
+-
+-#pragma weak ncclTestEngine=hyperCubeEngine
+diff --git a/src/reduce.cu b/src/reduce.cu
+index f2fa80d..80aadc5 100644
+--- a/src/reduce.cu
++++ b/src/reduce.cu
+@@ -102,9 +102,7 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
+   return testSuccess;
+ }
+ 
+-struct testEngine reduceEngine = {
++struct testEngine ncclTestEngine = {
+   ReduceGetBuffSize,
+   ReduceRunTest
+ };
+-
+-#pragma weak ncclTestEngine=reduceEngine
+diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
+index ed372e3..212a6f0 100644
+--- a/src/reduce_scatter.cu
++++ b/src/reduce_scatter.cu
+@@ -97,9 +97,7 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
+   return testSuccess;
+ }
+ 
+-struct testEngine reduceScatterEngine = {
++struct testEngine ncclTestEngine = {
+   ReduceScatterGetBuffSize,
+   ReduceScatterRunTest
+ };
+-
+-#pragma weak ncclTestEngine=reduceScatterEngine
+diff --git a/src/scatter.cu b/src/scatter.cu
+index 49d20e1..56f5ede 100644
+--- a/src/scatter.cu
++++ b/src/scatter.cu
+@@ -104,9 +104,7 @@ testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t ty
+   return testSuccess;
+ }
+ 
+-struct testEngine scatterEngine = {
++struct testEngine ncclTestEngine = {
+   ScatterGetBuffSize,
+   ScatterRunTest
+ };
+-
+-#pragma weak ncclTestEngine=scatterEngine
+diff --git a/src/sendrecv.cu b/src/sendrecv.cu
+index c9eb5bb..316a449 100644
+--- a/src/sendrecv.cu
++++ b/src/sendrecv.cu
+@@ -106,9 +106,7 @@ testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t t
+   return testSuccess;
+ }
+ 
+-struct testEngine sendRecvEngine = {
++struct testEngine ncclTestEngine = {
+   SendRecvGetBuffSize,
+   SendRecvRunTest
+ };
+-
+-#pragma weak ncclTestEngine=sendRecvEngine

From 948eae48c39b06eeb994e6807fe919df278f004d Mon Sep 17 00:00:00 2001
From: Cloud Han <cloudhan@outlook.com>
Date: Mon, 4 Sep 2023 01:31:09 +0800
Subject: [PATCH 4/4] Address formatting

---
 examples/WORKSPACE.bazel | 5 ++++-
 examples/nccl/nccl.bzl   | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/WORKSPACE.bazel b/examples/WORKSPACE.bazel
index 51346df9..7b6b5cb3 100644
--- a/examples/WORKSPACE.bazel
+++ b/examples/WORKSPACE.bazel
@@ -36,8 +36,11 @@ http_archive(
     name = "nccl-tests",
     add_prefix = "nccl-tests",
     build_file = "@rules_cuda_examples//nccl:nccl-tests.BUILD",
+    patch_args = [
+        "--directory=nccl-tests",
+        "-p1",
+    ],
     patches = ["@rules_cuda_examples//nccl:nccl-tests-clang.patch"],
-    patch_args = ["--directory=nccl-tests", "-p1"],
     sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0",
     strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa",
     urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"],
diff --git a/examples/nccl/nccl.bzl b/examples/nccl/nccl.bzl
index 8a8553c8..e2758f27 100644
--- a/examples/nccl/nccl.bzl
+++ b/examples/nccl/nccl.bzl
@@ -13,7 +13,7 @@ def if_cuda_clang(if_true, if_false = []):
         "//conditions:default": if_false,
     })
 
-def nccl_primitive(name, hdrs = [],  deps = [], use_bf16 = True):
+def nccl_primitive(name, hdrs = [], deps = [], use_bf16 = True):
     ops = ["sum", "prod", "min", "max", "premulsum", "sumpostdiv"]
     datatypes = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
     if use_bf16: