diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
index a9990423fe..4de3dad896 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -417,5 +417,24 @@ void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
                                             uint32_t num_blocks,
                                             uint32_t lwe_size);
 
+void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+void cleanup_cuda_integer_abs_inplace(void *const *streams,
+                                      uint32_t const *gpu_indexes,
+                                      uint32_t gpu_count,
+                                      int8_t **mem_ptr_void);
+
 } // extern C
 #endif // CUDA_INTEGER_H
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index 6dc85d4225..f925edadfb 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -3012,4 +3012,71 @@ template <typename Torus> struct int_scalar_mul_buffer {
   }
 };
 
+// Scratch memory for the absolute value of a signed radix ciphertext: the
+// sub-buffers used by the arithmetic right shift, the carry propagation and
+// the bitwise XOR, plus `mask`, an array obtained from cuda_malloc_async and
+// sized for one big-LWE ciphertext per radix block.
+template <typename Torus> struct int_abs_buffer {
+  int_radix_params params;
+
+  // Everything starts out null so that release() stays valid on a buffer
+  // constructed with allocate_gpu_memory == false.
+  int_arithmetic_scalar_shift_buffer<Torus> *arithmetic_scalar_shift_mem =
+      nullptr;
+  int_sc_prop_memory<Torus> *scp_mem = nullptr;
+  int_bitop_buffer<Torus> *bitxor_mem = nullptr;
+
+  Torus *mask = nullptr;
+
+  int_abs_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+                 uint32_t gpu_count, int_radix_params params,
+                 uint32_t num_radix_blocks, bool allocate_gpu_memory) {
+    this->params = params;
+
+    if (allocate_gpu_memory) {
+      arithmetic_scalar_shift_mem =
+          new int_arithmetic_scalar_shift_buffer<Torus>(
+              streams, gpu_indexes, gpu_count,
+              SHIFT_OR_ROTATE_TYPE::RIGHT_SHIFT, params, num_radix_blocks,
+              allocate_gpu_memory);
+      scp_mem =
+          new int_sc_prop_memory<Torus>(streams, gpu_indexes, gpu_count, params,
+                                        num_radix_blocks, allocate_gpu_memory);
+      bitxor_mem = new int_bitop_buffer<Torus>(
+          streams, gpu_indexes, gpu_count, BITOP_TYPE::BITXOR, params,
+          num_radix_blocks, allocate_gpu_memory);
+
+      uint32_t lwe_size = params.big_lwe_dimension + 1;
+      uint32_t lwe_size_bytes = lwe_size * sizeof(Torus);
+
+      mask = (Torus *)cuda_malloc_async(num_radix_blocks * lwe_size_bytes,
+                                        streams[0], gpu_indexes[0]);
+    }
+  }
+
+  // Frees whatever the constructor allocated; a second call is a no-op.
+  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+               uint32_t gpu_count) {
+    if (arithmetic_scalar_shift_mem != nullptr) {
+      arithmetic_scalar_shift_mem->release(streams, gpu_indexes, gpu_count);
+      delete arithmetic_scalar_shift_mem;
+      arithmetic_scalar_shift_mem = nullptr;
+    }
+    if (scp_mem != nullptr) {
+      scp_mem->release(streams, gpu_indexes, gpu_count);
+      delete scp_mem;
+      scp_mem = nullptr;
+    }
+    if (bitxor_mem != nullptr) {
+      bitxor_mem->release(streams, gpu_indexes, gpu_count);
+      delete bitxor_mem;
+      bitxor_mem = nullptr;
+    }
+    if (mask != nullptr) {
+      cuda_drop_async(mask, streams[0], gpu_indexes[0]);
+      mask = nullptr;
+    }
+  }
+};
+
 #endif // CUDA_INTEGER_UTILITIES_H
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
new file mode 100644
index 0000000000..1f462753ee
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
@@ -0,0 +1,58 @@
+#include "integer/abs.cuh"
+
+// C entry point: packs the crypto parameters into an int_radix_params and
+// forwards to the 64-bit scratch_cuda_integer_abs_kb, which stores the scratch
+// buffer handle in *mem_ptr.
+void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          big_lwe_dimension, small_lwe_dimension, ks_level,
+                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus);
+
+  scratch_cuda_integer_abs_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_abs_buffer<uint64_t> **)mem_ptr, is_signed, num_blocks, params,
+      allocate_gpu_memory);
+}
+
+// C entry point: reinterprets mem_ptr as the buffer produced by the scratch
+// function above and runs the in-place absolute value on `ct`.
+void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks) {
+
+  auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;
+
+  host_integer_abs_kb<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
+                                gpu_count, static_cast<uint64_t *>(ct), bsks,
+                                (uint64_t **)(ksks), mem, is_signed,
+                                num_blocks);
+}
+
+// Releases the scratch buffer created by the scratch function above, deletes
+// the host-side object and resets *mem_ptr_void to null. A null handle (for
+// instance after a scratch call with is_signed == false) is ignored.
+void cleanup_cuda_integer_abs_inplace(void *const *streams,
+                                      uint32_t const *gpu_indexes,
+                                      uint32_t gpu_count,
+                                      int8_t **mem_ptr_void) {
+  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
+    return;
+
+  int_abs_buffer<uint64_t> *mem_ptr =
+      (int_abs_buffer<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  // The object was created with `new` in the scratch call; without this
+  // delete it is never reclaimed.
+  delete mem_ptr;
+  *mem_ptr_void = nullptr;
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
new file mode 100644
index 0000000000..ad1a4b9e23
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -0,0 +1,88 @@
+#ifndef TFHE_RS_ABS_CUH
+#define TFHE_RS_ABS_CUH
+
+#include "crypto/keyswitch.cuh"
+#include "device.h"
+#include "integer/bitwise_ops.cuh"
+#include "integer/comparison.cuh"
+#include "integer/integer.cuh"
+#include "integer/integer_utilities.h"
+#include "integer/negation.cuh"
+#include "integer/scalar_shifts.cuh"
+#include "linear_algebra.h"
+#include "pbs/programmable_bootstrap.h"
+#include "utils/helper.cuh"
+#include "utils/kernel_dimensions.cuh"
+#include
+#include
+#include
+#include
+#include
+
+// Allocates the scratch buffer for the absolute value and stores it in
+// *mem_ptr. An unsigned input needs no scratch memory, so *mem_ptr is set to
+// null in that case.
+template <typename Torus>
+__host__ void scratch_cuda_integer_abs_kb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
+    uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
+
+  if (!is_signed) {
+    // Hand back a null handle rather than leaving the caller's pointer
+    // untouched, so that cleanup can recognise "nothing was allocated".
+    *mem_ptr = nullptr;
+    return;
+  }
+
+  *mem_ptr = new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
+                                       num_blocks, allocate_gpu_memory);
+}
+
+// Computes the absolute value of the radix ciphertext `ct` in place; returns
+// immediately when is_signed is false. With `mask` being `ct` arithmetically
+// shifted right by (number of bits - 1), the result is (ct + mask) XOR mask.
+// mem_ptr must be the buffer built by scratch_cuda_integer_abs_kb for a signed
+// input.
+template <typename Torus>
+__host__ void
+host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+                    uint32_t gpu_count, Torus *ct, void *const *bsks,
+                    uint64_t *const *ksks, int_abs_buffer<Torus> *mem_ptr,
+                    bool is_signed, uint32_t num_blocks) {
+  if (!is_signed)
+    return;
+
+  auto radix_params = mem_ptr->params;
+  auto mask = mem_ptr->mask;
+
+  auto big_lwe_dimension = radix_params.big_lwe_dimension;
+  auto big_lwe_size = big_lwe_dimension + 1;
+  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+  // 31 - clz(m) is floor(log2(m)): the message bits carried by one block.
+  uint32_t num_bits_in_ciphertext =
+      (31 - __builtin_clz(radix_params.message_modulus)) * num_blocks;
+
+  // mask <- ct, then shift it right arithmetically by all bits but one so
+  // that only copies of the top (sign) bit remain.
+  cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes,
+                               streams[0], gpu_indexes[0]);
+
+  host_integer_radix_arithmetic_scalar_shift_kb_inplace(
+      streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
+      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);
+  // ct <- ct + mask
+  host_addition(streams[0], gpu_indexes[0], ct, mask, ct,
+                radix_params.big_lwe_dimension, num_blocks);
+
+  // Propagate the carries produced by the addition before the bitwise step.
+  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count, ct,
+                                     nullptr, nullptr, mem_ptr->scp_mem, bsks,
+                                     ksks, num_blocks);
+
+  // ct <- ct XOR mask (bitxor_mem is built with BITOP_TYPE::BITXOR).
+  host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct,
+                              mem_ptr->bitxor_mem, bsks, ksks, num_blocks);
+}
+
+#endif // TFHE_RS_ABS_CUH
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
index 2a665498f3..feadbe676d 100644
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++
b/backends/tfhe-cuda-backend/src/bindings.rs @@ -1036,6 +1036,50 @@ extern "C" { lwe_size: u32, ); } +extern "C" { + pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + is_signed: bool, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + ); +} +extern "C" { + pub fn cuda_integer_abs_inplace_radix_ciphertext_kb_64( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + ct: *mut ffi::c_void, + mem_ptr: *mut i8, + is_signed: bool, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + num_blocks: u32, + ); +} +extern "C" { + pub fn cleanup_cuda_integer_abs_inplace( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr_void: *mut *mut i8, + ); +} extern "C" { pub fn cuda_keyswitch_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, diff --git a/tfhe/benches/integer/signed_bench.rs b/tfhe/benches/integer/signed_bench.rs index ec7bcbdbc8..3638c8c65e 100644 --- a/tfhe/benches/integer/signed_bench.rs +++ b/tfhe/benches/integer/signed_bench.rs @@ -1602,6 +1602,10 @@ mod cuda { display_name: neg ); + define_cuda_server_key_bench_clean_input_signed_unary_fn!( + method_name: unchecked_abs, + display_name: abs + ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_mul, display_name: mul @@ -1842,6 +1846,11 @@ mod cuda { display_name: neg ); + define_cuda_server_key_bench_clean_input_signed_unary_fn!( + method_name: abs, + display_name: abs + ); + define_cuda_server_key_bench_clean_input_signed_fn!( method_name: mul, display_name: mul @@ -2056,6 +2065,7 @@ mod cuda { cuda_unchecked_add, 
cuda_unchecked_sub, cuda_unchecked_neg, + cuda_unchecked_abs, cuda_unchecked_mul, cuda_unchecked_bitand, cuda_unchecked_bitnot, @@ -2105,6 +2115,7 @@ mod cuda { cuda_add, cuda_sub, cuda_neg, + cuda_abs, cuda_mul, cuda_bitand, cuda_bitnot, diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index 0e40674f4c..871332beea 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -2770,3 +2770,68 @@ pub unsafe fn reverse_blocks_inplace_async( ); } } + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub unsafe fn unchecked_signed_abs_radix_kb_assign_async( + streams: &CudaStreams, + ct: &mut CudaVec, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + num_blocks: u32, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, +) { + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + true, + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + ); + cuda_integer_abs_inplace_radix_ciphertext_kb_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + ct.as_mut_c_ptr(0), + 
mem_ptr, + true, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + num_blocks, + ); + cleanup_cuda_integer_abs_inplace( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} diff --git a/tfhe/src/integer/gpu/server_key/radix/abs.rs b/tfhe/src/integer/gpu/server_key/radix/abs.rs new file mode 100644 index 0000000000..26e605fb2e --- /dev/null +++ b/tfhe/src/integer/gpu/server_key/radix/abs.rs @@ -0,0 +1,142 @@ +use crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::LweBskGroupingFactor; +use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext; +use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; +use crate::integer::gpu::{unchecked_signed_abs_radix_kb_assign_async, PBSType}; + +impl CudaServerKey { + /// # Safety + /// + /// - [CudaStreams::synchronize] __must__ be called after this function as soon as + /// synchronization is required + pub unsafe fn unchecked_abs_assign_async(&self, ct: &mut T, streams: &CudaStreams) + where + T: CudaIntegerRadixCiphertext, + { + let num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; + + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + unchecked_signed_abs_radix_kb_assign_async( + streams, + &mut ct.as_mut().d_blocks.0.d_vec, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + num_blocks, + PBSType::Classical, + LweBskGroupingFactor(0), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + unchecked_signed_abs_radix_kb_assign_async( + streams, + &mut 
ct.as_mut().d_blocks.0.d_vec, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + num_blocks, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + ); + } + }; + } + pub fn unchecked_abs(&self, ct: &T, streams: &CudaStreams) -> T + where + T: CudaIntegerRadixCiphertext, + { + let mut res = unsafe { ct.duplicate_async(streams) }; + if T::IS_SIGNED { + unsafe { self.unchecked_abs_assign_async(&mut res, streams) }; + } + streams.synchronize(); + res + } + + /// Computes homomorphically an absolute value of ciphertext encrypting integer + /// values. + /// + /// This function, like all "default" operations (i.e. not smart, checked or unchecked), will + /// check that the input ciphertext block carries are empty and clears them if it's not the + /// case and the operation requires it. It outputs a ciphertext whose block carries are always + /// empty. 
+ /// + /// # Warning + /// + /// - Multithreaded + /// + /// # Example + /// + /// ```rust + /// use tfhe::core_crypto::gpu::CudaStreams; + /// use tfhe::integer::gpu::ciphertext::CudaSignedRadixCiphertext; + /// use tfhe::integer::gpu::gen_keys_radix_gpu; + /// use tfhe::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + /// + /// let gpu_index = 0; + /// let streams = CudaStreams::new_single_gpu(gpu_index); + /// + /// // Generate the client key and the server key: + /// let num_blocks = 4; + /// let (cks, sks) = gen_keys_radix_gpu(PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, num_blocks, &streams); + /// + /// let msg = -14; + /// + /// let ct = cks.encrypt(msg); + /// + /// // Copy to GPU + /// let d_ct = CudaSignedRadixCiphertext::from_radix_ciphertext(&ct, &streams); + /// + /// // Compute homomorphically an absolute value: + /// let d_ct_res = sks.abs(&d_ct, &streams); + /// + /// let ct_res = d_ct_res.to_radix_ciphertext(&streams); + /// + /// // Decrypt: + /// let dec_result: u64 = cks.decrypt(&ct_res); + /// + /// let abs_msg = if msg < 0 { -msg } else { msg }; + /// assert_eq!(dec_result, abs_msg ); + /// ``` + pub fn abs(&self, ct: &T, streams: &CudaStreams) -> T + where + T: CudaIntegerRadixCiphertext, + { + let mut res = unsafe { ct.duplicate_async(streams) }; + if !ct.block_carries_are_empty() { + unsafe { self.full_propagate_assign_async(&mut res, streams) }; + }; + if T::IS_SIGNED { + unsafe { self.unchecked_abs_assign_async(&mut res, streams) }; + } + streams.synchronize(); + res + } +} diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index 32daab51ce..f7571bf7d0 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -24,6 +24,7 @@ use crate::shortint::server_key::{ }; use crate::shortint::PBSOrder; +mod abs; mod add; mod bitwise_op; mod cmux; diff --git 
a/tfhe/src/integer/gpu/server_key/radix/tests_signed/mod.rs b/tfhe/src/integer/gpu/server_key/radix/tests_signed/mod.rs
index 5205a32842..e8fedd2848 100644
--- a/tfhe/src/integer/gpu/server_key/radix/tests_signed/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_signed/mod.rs
@@ -1,3 +1,4 @@
+pub(crate) mod test_abs;
 pub(crate) mod test_add;
 pub(crate) mod test_bitwise_op;
 pub(crate) mod test_cmux;
diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_signed/test_abs.rs b/tfhe/src/integer/gpu/server_key/radix/tests_signed/test_abs.rs
new file mode 100644
index 0000000000..83d3de352f
--- /dev/null
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_signed/test_abs.rs
@@ -0,0 +1,30 @@
+use crate::integer::gpu::server_key::radix::tests_unsigned::{
+    create_gpu_parametrized_test, GpuFunctionExecutor,
+};
+use crate::integer::gpu::CudaServerKey;
+use crate::integer::server_key::radix_parallel::tests_signed::test_abs::{
+    signed_default_absolute_value_test, signed_unchecked_absolute_value_test,
+};
+use crate::integer::server_key::radix_parallel::tests_signed::test_add::signed_unchecked_add_test;
+use crate::shortint::parameters::*;
+
+create_gpu_parametrized_test!(integer_signed_unchecked_abs);
+create_gpu_parametrized_test!(integer_signed_abs);
+
+// Runs the `radix_parallel` unchecked-abs test with `CudaServerKey::unchecked_abs`.
+fn integer_signed_unchecked_abs<P>(param: P)
+where
+    P: Into<PBSParameters>,
+{
+    let executor = GpuFunctionExecutor::new(&CudaServerKey::unchecked_abs);
+    signed_unchecked_absolute_value_test(param, executor);
+}
+
+// Runs the `radix_parallel` default-abs test with `CudaServerKey::abs`.
+fn integer_signed_abs<P>(param: P)
+where
+    P: Into<PBSParameters>,
+{
+    let executor = GpuFunctionExecutor::new(&CudaServerKey::abs);
+    signed_default_absolute_value_test(param, executor);
+}