feat(gpu): add abs operation on gpu backend

zama-ai · Nov 15, 2024 · 0aee4c5 · 0aee4c5
1 parent f9e8df4
commit 0aee4c5
Show file tree

Hide file tree

Showing 11 changed files with 470 additions and 0 deletions.
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -417,5 +417,24 @@ void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
                                             uint32_t num_blocks,
                                             uint32_t lwe_size);
 
+// Creates the scratch memory used by
+// cuda_integer_abs_inplace_radix_ciphertext_kb_64 (64-bit torus).
+//
+// The cryptographic parameters (pbs_type, glwe_dimension, ..., carry_modulus)
+// are packed into an int_radix_params by the implementation. When `is_signed`
+// is true, a buffer sized for `num_blocks` radix blocks is created and its
+// address is stored, type-erased, in `*mem_ptr`; when `is_signed` is false,
+// `*mem_ptr` is not written by the implementation shown in abs.cuh.
+// `allocate_gpu_memory` is forwarded to the buffer constructor.
+void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+// Computes the absolute value of the radix ciphertext `ct` in place (64-bit
+// torus).
+//
+// `mem_ptr` is reinterpreted as the buffer produced by
+// scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64, `ct` as an array of
+// uint64_t, and `ksks` as an array of uint64_t pointers. When `is_signed` is
+// false the implementation returns without touching `ct`.
+void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+// Releases the scratch memory of the abs operation.
+//
+// `*mem_ptr_void` is expected to hold the buffer created by
+// scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64; the implementation
+// reinterprets it as an int_abs_buffer<uint64_t> and calls its release()
+// method with the given streams.
+void cleanup_cuda_integer_abs_inplace(void *const *streams,
+                                      uint32_t const *gpu_indexes,
+                                      uint32_t gpu_count,
+                                      int8_t **mem_ptr_void);
+
 } // extern C
 #endif // CUDA_INTEGER_H
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -3012,4 +3012,52 @@ template <typename Torus> struct int_scalar_mul_buffer {
   }
 };
 
+// Scratch memory for the in-place absolute value of a radix ciphertext.
+//
+// Holds one sub-buffer per step of the computation (arithmetic right shift,
+// single-carry propagation, bitwise XOR) plus `mask`, a device array with room
+// for `num_radix_blocks` LWE ciphertexts of size big_lwe_dimension + 1.
+template <typename Torus> struct int_abs_buffer {
+  int_radix_params params;
+
+  // Every pointer starts out null: when the constructor is called with
+  // allocate_gpu_memory == false nothing is allocated, and release() must
+  // still be safe to call.
+  int_arithmetic_scalar_shift_buffer<Torus> *arithmetic_scalar_shift_mem =
+      nullptr;
+  int_sc_prop_memory<Torus> *scp_mem = nullptr;
+  int_bitop_buffer<Torus> *bitxor_mem = nullptr;
+
+  Torus *mask = nullptr;
+
+  // Stores `params` and, if `allocate_gpu_memory` is true, creates the three
+  // sub-buffers and allocates `mask` on streams[0] / gpu_indexes[0].
+  int_abs_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+                 uint32_t gpu_count, int_radix_params params,
+                 uint32_t num_radix_blocks, bool allocate_gpu_memory) {
+    this->params = params;
+
+    if (!allocate_gpu_memory)
+      return;
+
+    arithmetic_scalar_shift_mem = new int_arithmetic_scalar_shift_buffer<Torus>(
+        streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::RIGHT_SHIFT,
+        params, num_radix_blocks, allocate_gpu_memory);
+    scp_mem =
+        new int_sc_prop_memory<Torus>(streams, gpu_indexes, gpu_count, params,
+                                      num_radix_blocks, allocate_gpu_memory);
+    bitxor_mem = new int_bitop_buffer<Torus>(
+        streams, gpu_indexes, gpu_count, BITOP_TYPE::BITXOR, params,
+        num_radix_blocks, allocate_gpu_memory);
+
+    // Size computed in 64 bits so that blocks * lwe_size * sizeof(Torus)
+    // cannot wrap around in 32-bit arithmetic.
+    uint64_t lwe_size = (uint64_t)params.big_lwe_dimension + 1;
+    uint64_t mask_size_bytes =
+        (uint64_t)num_radix_blocks * lwe_size * sizeof(Torus);
+
+    mask = (Torus *)cuda_malloc_async(mask_size_bytes, streams[0],
+                                      gpu_indexes[0]);
+  }
+
+  // Releases and deletes the sub-buffers and drops `mask` on streams[0].
+  // Members that were never allocated are skipped, and every pointer is reset
+  // to null so that a second call is a no-op.
+  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+               uint32_t gpu_count) {
+    if (arithmetic_scalar_shift_mem != nullptr) {
+      arithmetic_scalar_shift_mem->release(streams, gpu_indexes, gpu_count);
+      delete arithmetic_scalar_shift_mem;
+      arithmetic_scalar_shift_mem = nullptr;
+    }
+    if (scp_mem != nullptr) {
+      scp_mem->release(streams, gpu_indexes, gpu_count);
+      delete scp_mem;
+      scp_mem = nullptr;
+    }
+    if (bitxor_mem != nullptr) {
+      bitxor_mem->release(streams, gpu_indexes, gpu_count);
+      delete bitxor_mem;
+      bitxor_mem = nullptr;
+    }
+
+    if (mask != nullptr) {
+      cuda_drop_async(mask, streams[0], gpu_indexes[0]);
+      mask = nullptr;
+    }
+  }
+};
+
 #endif // CUDA_INTEGER_UTILITIES_H
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
@@ -0,0 +1,43 @@
+#include "integer/abs.cuh"
+
+// C entry point (64-bit torus) that prepares the scratch memory for the
+// in-place absolute value.
+//
+// Packs the cryptographic parameters into an int_radix_params and forwards
+// everything to scratch_cuda_integer_abs_kb<uint64_t>, which writes the new
+// buffer through `mem_ptr` (viewed as int_abs_buffer<uint64_t> **).
+void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+
+  // Typed views of the type-erased arguments coming from the C API.
+  auto cuda_streams = (cudaStream_t *)(streams);
+  auto abs_mem_slot = (int_abs_buffer<uint64_t> **)mem_ptr;
+
+  int_radix_params radix_params(
+      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
+      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
+      grouping_factor, message_modulus, carry_modulus);
+
+  scratch_cuda_integer_abs_kb<uint64_t>(cuda_streams, gpu_indexes, gpu_count,
+                                        abs_mem_slot, is_signed, num_blocks,
+                                        radix_params, allocate_gpu_memory);
+}
+
+// C entry point (64-bit torus) for the in-place absolute value.
+//
+// Recovers the typed views of the type-erased arguments (`mem_ptr` as
+// int_abs_buffer<uint64_t> *, `ct` as uint64_t *, `ksks` as uint64_t **) and
+// delegates to host_integer_abs_kb<uint64_t>.
+void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks) {
+
+  auto cuda_streams = (cudaStream_t *)(streams);
+  auto radix_ct = static_cast<uint64_t *>(ct);
+  auto keyswitch_keys = (uint64_t **)(ksks);
+  auto abs_mem = (int_abs_buffer<uint64_t> *)mem_ptr;
+
+  host_integer_abs_kb<uint64_t>(cuda_streams, gpu_indexes, gpu_count, radix_ct,
+                                bsks, keyswitch_keys, abs_mem, is_signed,
+                                num_blocks);
+}
+
+// Frees the scratch memory of the abs operation and clears the caller's
+// handle.
+//
+// `*mem_ptr_void` is reinterpreted as the int_abs_buffer<uint64_t> created by
+// the scratch function. Its release() method frees what the buffer owns; the
+// buffer object itself, which the scratch step allocated with `new`, is then
+// deleted and `*mem_ptr_void` is set to null so the handle cannot be reused.
+// A null handle is accepted and ignored: the scratch step creates no buffer
+// when it is called with is_signed == false.
+void cleanup_cuda_integer_abs_inplace(void *const *streams,
+                                      uint32_t const *gpu_indexes,
+                                      uint32_t gpu_count,
+                                      int8_t **mem_ptr_void) {
+  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
+    return;
+
+  int_abs_buffer<uint64_t> *mem_ptr =
+      (int_abs_buffer<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+
+  // release() does not free the buffer object itself; without this delete the
+  // host allocation made by the scratch step would leak.
+  delete mem_ptr;
+  *mem_ptr_void = nullptr;
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -0,0 +1,69 @@
+#ifndef TFHE_RS_ABS_CUH
+#define TFHE_RS_ABS_CUH
+
+#include "crypto/keyswitch.cuh"
+#include "device.h"
+#include "integer/bitwise_ops.cuh"
+#include "integer/comparison.cuh"
+#include "integer/integer.cuh"
+#include "integer/integer_utilities.h"
+#include "integer/negation.cuh"
+#include "integer/scalar_shifts.cuh"
+#include "linear_algebra.h"
+#include "pbs/programmable_bootstrap.h"
+#include "utils/helper.cuh"
+#include "utils/kernel_dimensions.cuh"
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Creates the scratch buffer of the abs operation and returns it through
+// `mem_ptr`.
+//
+// For a signed input a new int_abs_buffer<Torus> is built for `num_blocks`
+// radix blocks with the given `params`; `allocate_gpu_memory` is forwarded to
+// its constructor. For an unsigned input no buffer is needed (the abs
+// computation returns early), so `*mem_ptr` is set to null instead of being
+// left with whatever value the caller had in it.
+template <typename Torus>
+__host__ void scratch_cuda_integer_abs_kb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
+    uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
+
+  if (!is_signed) {
+    *mem_ptr = nullptr;
+    return;
+  }
+
+  *mem_ptr = new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
+                                       num_blocks, allocate_gpu_memory);
+}
+
+// Replaces the radix ciphertext `ct` (`num_blocks` blocks) by its absolute
+// value. Returns immediately when `is_signed` is false or when there are no
+// blocks to process.
+//
+// Steps:
+//   1. mask <- copy of ct (on streams[0])
+//   2. mask <- arithmetic right shift of mask by (total number of bits - 1)
+//   3. ct   <- host_addition of ct and mask, followed by single-carry
+//              propagation
+//   4. ct   <- bitwise operation of ct and mask using the buffer's bitxor_mem
+// This follows the abs(x) = (x + m) ^ m identity with m = x >> (bits - 1).
+//
+// `mem_ptr` must be the buffer created by scratch_cuda_integer_abs_kb for the
+// same Torus type; it provides `mask` and the per-step sub-buffers.
+template <typename Torus>
+__host__ void
+host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+                    uint32_t gpu_count, Torus *ct, void *const *bsks,
+                    Torus *const *ksks, int_abs_buffer<Torus> *mem_ptr,
+                    bool is_signed, uint32_t num_blocks) {
+  // With zero blocks the shift amount below (number of bits - 1) would wrap
+  // around to UINT32_MAX, and there is nothing to compute anyway.
+  if (!is_signed || num_blocks == 0)
+    return;
+
+  auto radix_params = mem_ptr->params;
+  auto mask = mem_ptr->mask;
+
+  auto big_lwe_size = radix_params.big_lwe_dimension + 1;
+  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+
+  // 31 - clz(m) is floor(log2(m)) for a 32-bit m, i.e. the number of message
+  // bits per block when message_modulus is a power of two.
+  uint32_t bits_per_block = 31 - __builtin_clz(radix_params.message_modulus);
+  uint32_t num_bits_in_ciphertext = bits_per_block * num_blocks;
+
+  cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes,
+                               streams[0], gpu_indexes[0]);
+
+  host_integer_radix_arithmetic_scalar_shift_kb_inplace(
+      streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
+      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
+                       radix_params.big_lwe_dimension, num_blocks);
+
+  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count, ct,
+                                     nullptr, nullptr, mem_ptr->scp_mem, bsks,
+                                     ksks, num_blocks);
+
+  host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct,
+                              mem_ptr->bitxor_mem, bsks, ksks, num_blocks);
+}
+
+#endif // TFHE_RS_ABS_CUH
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -1036,6 +1036,50 @@ extern "C" {
         lwe_size: u32,
     );
 }
+extern "C" {
+    // FFI declaration of the C function of the same name declared in
+    // integer/integer.h: it creates the scratch memory of the abs operation
+    // and stores its address through `mem_ptr`.
+    pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        is_signed: bool,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    // FFI declaration of the C function of the same name declared in
+    // integer/integer.h: it computes the absolute value of `ct` in place,
+    // using the scratch memory passed as `mem_ptr`.
+    pub fn cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        ct: *mut ffi::c_void,
+        mem_ptr: *mut i8,
+        is_signed: bool,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_blocks: u32,
+    );
+}
+extern "C" {
+    // FFI declaration of the C function of the same name declared in
+    // integer/integer.h: it releases the scratch memory whose address is
+    // stored in `*mem_ptr_void`.
+    pub fn cleanup_cuda_integer_abs_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
 extern "C" {
     pub fn cuda_keyswitch_lwe_ciphertext_vector_32(
         stream: *mut ffi::c_void,

diff --git a/tfhe/benches/integer/signed_bench.rs b/tfhe/benches/integer/signed_bench.rs
@@ -1602,6 +1602,10 @@ mod cuda {
         display_name: neg
     );
 
+    // Benchmark of the `unchecked_abs` method, displayed as "abs".
+    // NOTE(review): presumably expands to the `cuda_unchecked_abs` function
+    // listed in the benchmark group below - confirm against the macro.
+    define_cuda_server_key_bench_clean_input_signed_unary_fn!(
+        method_name: unchecked_abs,
+        display_name: abs
+    );
     define_cuda_server_key_bench_clean_input_signed_fn!(
         method_name: unchecked_mul,
         display_name: mul
@@ -1842,6 +1846,11 @@ mod cuda {
         display_name: neg
     );
 
+    // Benchmark of the `abs` method, displayed as "abs".
+    // NOTE(review): presumably expands to the `cuda_abs` function listed in
+    // the benchmark group below - confirm against the macro.
+    define_cuda_server_key_bench_clean_input_signed_unary_fn!(
+        method_name: abs,
+        display_name: abs
+    );
+
     define_cuda_server_key_bench_clean_input_signed_fn!(
         method_name: mul,
         display_name: mul
@@ -2056,6 +2065,7 @@ mod cuda {
         cuda_unchecked_add,
         cuda_unchecked_sub,
         cuda_unchecked_neg,
+        cuda_unchecked_abs,
         cuda_unchecked_mul,
         cuda_unchecked_bitand,
         cuda_unchecked_bitnot,
@@ -2105,6 +2115,7 @@ mod cuda {
         cuda_add,
         cuda_sub,
         cuda_neg,
+        cuda_abs,
         cuda_mul,
         cuda_bitand,
         cuda_bitnot,

diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
@@ -2770,3 +2770,68 @@ pub unsafe fn reverse_blocks_inplace_async<T: UnsignedInteger>(
         );
     }
 }
+
+/// Replaces the radix ciphertext stored in `ct` by its absolute value, using the CUDA backend.
+///
+/// Three backend calls are issued in sequence: the scratch memory is created, the in-place abs
+/// is run on `ct`, and the scratch memory is cleaned up. Both the scratch and the abs call are
+/// made with `is_signed = true`, and the scratch call requests GPU memory allocation.
+///
+/// # Safety
+///
+/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
+///   is required
+#[allow(clippy::too_many_arguments)]
+pub unsafe fn unchecked_signed_abs_radix_kb_assign_async<T: UnsignedInteger, B: Numeric>(
+    streams: &CudaStreams,
+    ct: &mut CudaVec<T>,
+    bootstrapping_key: &CudaVec<B>,
+    keyswitch_key: &CudaVec<T>,
+    message_modulus: MessageModulus,
+    carry_modulus: CarryModulus,
+    glwe_dimension: GlweDimension,
+    polynomial_size: PolynomialSize,
+    big_lwe_dimension: LweDimension,
+    small_lwe_dimension: LweDimension,
+    ks_level: DecompositionLevelCount,
+    ks_base_log: DecompositionBaseLog,
+    pbs_level: DecompositionLevelCount,
+    pbs_base_log: DecompositionBaseLog,
+    num_blocks: u32,
+    pbs_type: PBSType,
+    grouping_factor: LweBskGroupingFactor,
+) {
+    // Stream description shared by the three backend calls.
+    let streams_ptr = streams.ptr.as_ptr();
+    let gpu_indexes_ptr = streams.gpu_indexes.as_ptr();
+    let gpu_count = streams.len() as u32;
+
+    // Type-erased handle to the scratch memory, filled in by the scratch call.
+    let mut scratch: *mut i8 = std::ptr::null_mut();
+
+    scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+        streams_ptr,
+        gpu_indexes_ptr,
+        gpu_count,
+        std::ptr::addr_of_mut!(scratch),
+        true,
+        glwe_dimension.0 as u32,
+        polynomial_size.0 as u32,
+        big_lwe_dimension.0 as u32,
+        small_lwe_dimension.0 as u32,
+        ks_level.0 as u32,
+        ks_base_log.0 as u32,
+        pbs_level.0 as u32,
+        pbs_base_log.0 as u32,
+        grouping_factor.0 as u32,
+        num_blocks,
+        message_modulus.0 as u32,
+        carry_modulus.0 as u32,
+        pbs_type as u32,
+        true,
+    );
+
+    cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+        streams_ptr,
+        gpu_indexes_ptr,
+        gpu_count,
+        ct.as_mut_c_ptr(0),
+        scratch,
+        true,
+        bootstrapping_key.ptr.as_ptr(),
+        keyswitch_key.ptr.as_ptr(),
+        num_blocks,
+    );
+
+    cleanup_cuda_integer_abs_inplace(
+        streams_ptr,
+        gpu_indexes_ptr,
+        gpu_count,
+        std::ptr::addr_of_mut!(scratch),
+    );
+}