fix(gpu): dispatch/gather inputs and outputs to the ks and pbs on all GPUs
zama-ai · Jul 22, 2024 · ae8a5a9 · ae8a5a9
1 parent 95ef13f
commit ae8a5a9
Show file tree

Hide file tree

Showing 34 changed files with 856 additions and 510 deletions.
diff --git a/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml b/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
@@ -46,8 +46,8 @@ jobs:
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
           slab-url: ${{ secrets.SLAB_BASE_URL }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: multi-gpu-test
+          backend: hyperstack
+          profile: multi-h100-nvlink
 
   cuda-integer-multi-bit-multi-gpu-benchmarks:
     name: Execute multi GPU integer multi-bit benchmarks
@@ -62,11 +62,23 @@ jobs:
         include:
           - os: ubuntu-22.04
             cuda: "12.2"
-            gcc: 9
+            gcc: 11
     env:
       CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-
+      CMAKE_VERSION: 3.29.6
     steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
       - name: Checkout tfhe-rs repo with tags
         uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
         with:
@@ -135,7 +147,7 @@ jobs:
         run: |
           python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
           --database tfhe_rs \
-          --hardware "p3.8xlarge" \
+          --hardware "n3-H100x8-nvlink" \
           --backend gpu \
           --project-version "${{ env.COMMIT_HASH }}" \
           --branch ${{ github.ref_name }} \

diff --git a/.github/workflows/integer_multi_gpu_full_benchmark.yml b/.github/workflows/integer_multi_gpu_full_benchmark.yml
@@ -35,8 +35,8 @@ jobs:
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
           slab-url: ${{ secrets.SLAB_BASE_URL }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: multi-gpu-test
+          backend: hyperstack
+          profile: multi-h100-nvlink
 
   cuda-integer-full-multi-gpu-benchmarks:
     name: Execute multi GPU integer benchmarks for all operations flavor
@@ -54,11 +54,23 @@ jobs:
         include:
           - os: ubuntu-22.04
             cuda: "12.2"
-            gcc: 9
+            gcc: 11
     env:
       CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-
+      CMAKE_VERSION: 3.29.6
     steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
       - name: Checkout tfhe-rs repo with tags
         uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
         with:
@@ -117,7 +129,7 @@ jobs:
         run: |
           python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
           --database tfhe_rs \
-          --hardware "p3.8xlarge" \
+          --hardware "n3-H100x8-nvlink" \
           --backend gpu \
           --project-version "${{ env.COMMIT_HASH }}" \
           --branch ${{ github.ref_name }} \

diff --git a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -1,6 +1,8 @@
 #ifndef HELPER_MULTI_GPU_H
 #define HELPER_MULTI_GPU_H
 #include <mutex>
+#include <variant>
+#include <vector>
 
 extern std::mutex m;
 extern bool p2p_enabled;
@@ -9,6 +11,20 @@ extern "C" {
 int cuda_setup_multi_gpu();
 }
 
+// Define a variant type that can be either a vector or a single pointer
+template <typename Torus>
+using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
+
+// Selects element `index` when the variant holds a vector, else the pointer
+#define GET_VARIANT_ELEMENT(variant, index)                                    \
+  [&] {                                                                        \
+    if (std::holds_alternative<std::vector<Torus *>>(variant)) {               \
+      return std::get<std::vector<Torus *>>(variant)[index];                   \
+    } else {                                                                   \
+      return std::get<Torus *>(variant);                                       \
+    }                                                                          \
+  }()
+
 int get_active_gpu_count(int num_inputs, int gpu_count);
 
 int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);