fix(gpu): attempt to fix scalar comparison with 1 block

zama-ai · Jul 22, 2024 · 3f3860c
1 parent 32262e0
commit 3f3860c
Show file tree

Hide file tree

Showing 6 changed files with 7 additions and 8 deletions.
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -1736,7 +1736,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
                                      int_radix_params params,
                                      uint32_t num_radix_blocks,
                                      bool allocate_gpu_memory) {
-    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+    active_gpu_count = get_active_gpu_count(1, gpu_count);
     // In the arithmetic shift, a PBS has to be applied to the last rotated
     // block twice: once to shift it, once to compute the padding block to be
     // copied onto all blocks to the left of the last rotated block

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -76,8 +76,10 @@ __host__ void host_integer_radix_cmux_kb(
                   mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks);
     }
   }
-  for (uint j = 0; j < gpu_count; j++) {
+  for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
     cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
+  }
+  for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) {
     cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
   }
 

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -245,7 +245,6 @@ __host__ void host_compare_with_zero_equality(
     int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
     int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
 
-  cudaSetDevice(gpu_indexes[0]);
   auto params = mem_ptr->params;
   auto big_lwe_dimension = params.big_lwe_dimension;
   auto message_modulus = params.message_modulus;

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -133,7 +133,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
             mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
       }
     }
-    for (uint j = 0; j < gpu_count; j++) {
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
       cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
       cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
     }
@@ -205,7 +205,6 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
     uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
 
-  cudaSetDevice(gpu_indexes[0]);
   auto params = mem_ptr->params;
   auto big_lwe_dimension = params.big_lwe_dimension;
   auto glwe_dimension = params.glwe_dimension;
@@ -737,7 +736,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
     }
   }
 
-  for (uint j = 0; j < gpu_count; j++) {
+  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
     cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
   }

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -245,7 +245,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
         }
       }
     }
-    for (uint j = 0; j < gpu_count; j++) {
+    for (uint j = 0; j < mem->active_gpu_count; j++) {
       cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
       cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
     }

diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
@@ -27,7 +27,6 @@ int cuda_setup_multi_gpu() {
         num_used_gpus += 1;
       }
     } else {
-      int has_peer_access_to_device_0;
       for (int i = 1; i < num_gpus; i++)
         num_used_gpus += 1;
     }