diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index 670b9f46c0..f7ee72f9db 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -35,8 +35,11 @@ enum COMPARISON_TYPE { MAX = 6, MIN = 7, }; + enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 }; +enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 }; + extern "C" { void scratch_cuda_apply_univariate_lut_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, @@ -355,6 +358,24 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64( void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void); +void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory); + +void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs, + void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr, + void **bsks, void **ksks, uint32_t num_blocks_in_radix); + +void cleanup_signed_overflowing_add_or_sub(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); } // extern C template <typename Torus> @@ -2884,6 +2905,232 @@ template <typename Torus> struct int_div_rem_memory { } }; +template <typename Torus> struct int_last_block_inner_propagate_memory { + + int_radix_lut<Torus> *last_block_inner_propagation_lut; + int_radix_params params; + + int_last_block_inner_propagate_memory( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_radix_params params, SIGNED_OPERATION op, uint32_t num_radix_blocks, + bool allocate_gpu_memory) { + + this->params = params; + auto message_modulus = params.message_modulus; + uint32_t bits_of_message = + static_cast<uint32_t>(std::log2(params.message_modulus)); + Torus message_bit_mask = (1 << bits_of_message) - 1; + + // declare lambda function for last_block_inner_propagation_lut generation + auto f_last_block_inner_propagation_lut = + [op, message_modulus, message_bit_mask, + bits_of_message](Torus lhs_block, Torus rhs_block) -> Torus { + Torus local_rhs_block = 0; + if (op == SIGNED_OPERATION::SUBTRACTION) { + Torus flipped_rhs = ~rhs_block; + local_rhs_block = (flipped_rhs << 1) & message_bit_mask; + } else { + local_rhs_block = (rhs_block << 1) & message_bit_mask; + }; + + Torus local_lhs_block = (lhs_block << 1) & message_bit_mask; + + // whole_result contains the result of addition with + // the carry being in the first bit of carry space + // the message space contains the message, but with one 0 + // on the right (lsb) + Torus whole_result = local_lhs_block + local_rhs_block; + Torus carry = whole_result >> bits_of_message; + Torus result = (whole_result & message_bit_mask) >> 1; + Torus propagation_result = 0; + if (carry == 1) { + // Addition of bits before last one generates a carry + propagation_result = OUTPUT_CARRY::GENERATED; + } else if (result == ((message_modulus - 1) >> 1)) { + // Addition of bits before last one puts the bits + // in a state that makes it so
that an input carry into last block + // gets propagated to last bit. + propagation_result = OUTPUT_CARRY::PROPAGATED; + } else { + propagation_result = OUTPUT_CARRY::NONE; + }; + + // Shift the propagation result in carry part + // to have less noise growth later + return propagation_result << bits_of_message; + }; + + last_block_inner_propagation_lut = new int_radix_lut<Torus>( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + generate_device_accumulator_bivariate<Torus>( + streams[0], gpu_indexes[0], + last_block_inner_propagation_lut->get_lut(gpu_indexes[0], 0), + params.glwe_dimension, params.polynomial_size, message_modulus, + params.carry_modulus, f_last_block_inner_propagation_lut); + last_block_inner_propagation_lut->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + } + + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + last_block_inner_propagation_lut->release(streams, gpu_indexes, gpu_count); + delete last_block_inner_propagation_lut; + } +}; + +template <typename Torus> struct int_resolve_signed_overflow_memory { + + int_radix_lut<Torus> *resolve_overflow_lut; + int_radix_params params; + + Torus *x; + + int_resolve_signed_overflow_memory(cudaStream_t *streams, + uint32_t *gpu_indexes, uint32_t gpu_count, + int_radix_params params, + bool allocate_gpu_memory) { + + this->params = params; + + auto message_modulus = params.message_modulus; + + uint32_t bits_of_message = + static_cast<uint32_t>(std::log2(message_modulus)); + + x = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + + // declare lambda function for resolve_overflow_lut generation + auto f_resolve_overflow_lut = [bits_of_message](Torus x) -> Torus { + Torus carry_propagation = x >> bits_of_message; + Torus output_carry_of_block = (x >> 1) & 1; + Torus input_carry_of_block = x & 1; + + // Resolve the carry that the last bit actually receives as input + Torus input_carry_to_last_bit; + if (carry_propagation == OUTPUT_CARRY::PROPAGATED) { + input_carry_to_last_bit = input_carry_of_block; + } else if (carry_propagation == OUTPUT_CARRY::GENERATED) { + input_carry_to_last_bit = 1; + } else { + input_carry_to_last_bit = 0; + }; + + return input_carry_to_last_bit != output_carry_of_block; + }; + + resolve_overflow_lut = new int_radix_lut<Torus>( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + generate_device_accumulator<Torus>( + streams[0], gpu_indexes[0], + resolve_overflow_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension, + params.polynomial_size, message_modulus, params.carry_modulus, + f_resolve_overflow_lut); + resolve_overflow_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + resolve_overflow_lut->release(streams, gpu_indexes, gpu_count); + delete resolve_overflow_lut; + cuda_drop_async(x, streams[0], gpu_indexes[0]); + } +}; + +template <typename Torus> struct int_signed_overflowing_add_or_sub_memory { + int_radix_params params; + + // memory objects for other operations + int_sc_prop_memory<Torus> *scp_mem; + int_last_block_inner_propagate_memory<Torus> *las_block_prop_mem; + int_resolve_signed_overflow_memory<Torus> *resolve_overflow_mem; + // lookup tables + + // sub streams + cudaStream_t *sub_streams_1; + cudaStream_t *sub_streams_2; + + // temporary device buffers + Torus *result; // num_blocks + Torus *input_carries; // num_blocks + Torus *output_carry; // single block + Torus *last_block_inner_propagation; // single block + + // allocate temporary
arrays used to calculate + // cuda integer signed overflowing add or sub + void allocate_temporary_buffers(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, uint32_t num_blocks) { + uint32_t big_lwe_size = params.big_lwe_dimension + 1; + + result = (Torus *)cuda_malloc_async( + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + + input_carries = (Torus *)cuda_malloc_async( + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + output_carry = (Torus *)cuda_malloc_async(big_lwe_size * sizeof(Torus), + streams[0], gpu_indexes[0]); + last_block_inner_propagation = (Torus *)cuda_malloc_async( + big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); + } + + // constructor without memory reuse + int_signed_overflowing_add_or_sub_memory( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op, + bool allocate_gpu_memory) { + this->params = params; + + allocate_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks); + + // initialize streams + sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < gpu_count; j++) { + sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); + sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); + } + + // initialize memory objects for other operations + scp_mem = + new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, + num_blocks, allocate_gpu_memory); + las_block_prop_mem = new int_last_block_inner_propagate_memory( + streams, gpu_indexes, gpu_count, params, op, num_blocks, + allocate_gpu_memory); + + resolve_overflow_mem = new int_resolve_signed_overflow_memory( + streams, gpu_indexes, gpu_count, params, allocate_gpu_memory); + } + + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + // memory objects for other operations + scp_mem->release(streams, gpu_indexes, gpu_count); + las_block_prop_mem->release(streams, gpu_indexes, gpu_count); + resolve_overflow_mem->release(streams, gpu_indexes, gpu_count); + + delete scp_mem; + delete las_block_prop_mem; + delete resolve_overflow_mem; + + // temporary device buffers + cuda_drop_async(result, streams[0], gpu_indexes[0]); + cuda_drop_async(input_carries, streams[0], gpu_indexes[0]); + cuda_drop_async(output_carry, streams[0], gpu_indexes[0]); + cuda_drop_async(last_block_inner_propagation, streams[0], gpu_indexes[0]); + + // sub streams + for (uint i = 0; i < gpu_count; i++) { + cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); + cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); + } + free(sub_streams_1); + free(sub_streams_2); + } +}; template struct int_bitop_buffer { int_radix_params params; @@ -2981,8 +3228,8 @@ template struct int_scalar_mul_buffer { size_t num_ciphertext_bits = msg_bits * num_radix_blocks; //// Contains all shifted values of lhs for shift in range (0..msg_bits) - //// The idea is that with these we can create all other shift that are in - //// range (0..total_bits) for free (block rotation) + //// The idea is that with these we can create all other shift that are + /// in / range (0..total_bits) for free (block rotation) preshifted_buffer = (Torus *)cuda_malloc_async( num_ciphertext_bits * lwe_size_bytes, streams[0], gpu_indexes[0]); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu new file mode 
100644 index 0000000000..cd78bf7b6d --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu @@ -0,0 +1,93 @@ +#include "integer/addition.cuh" + +void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory) { + + SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION + : SIGNED_OPERATION::SUBTRACTION; + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus); + + scratch_cuda_integer_signed_overflowing_add_or_sub_kb( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_signed_overflowing_add_or_sub_memory **)mem_ptr, + num_blocks, op, params, allocate_gpu_memory); +} + +void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs, + void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr, + void **bsks, void **ksks, uint32_t num_blocks) { + + auto mem = (int_signed_overflowing_add_or_sub_memory *)mem_ptr; + SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION + : SIGNED_OPERATION::SUBTRACTION; + + switch (mem->params.polynomial_size) { + case 512: + host_integer_signed_overflowing_add_or_sub_kb>( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lhs), static_cast(rhs), + static_cast(overflowed), op, bsks, (uint64_t **)(ksks), mem, + num_blocks); + break; + case 1024: + + host_integer_signed_overflowing_add_or_sub_kb>( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lhs), static_cast(rhs), + static_cast(overflowed), op, bsks, (uint64_t **)(ksks), mem, + num_blocks); + break; + case 2048: + host_integer_signed_overflowing_add_or_sub_kb>( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lhs), static_cast(rhs), + static_cast(overflowed), op, bsks, (uint64_t **)(ksks), mem, + num_blocks); + break; + case 4096: + host_integer_signed_overflowing_add_or_sub_kb>( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lhs), static_cast(rhs), + static_cast(overflowed), op, bsks, (uint64_t **)(ksks), mem, + num_blocks); + break; + case 8192: + host_integer_signed_overflowing_add_or_sub_kb>( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lhs), static_cast(rhs), + static_cast(overflowed), op, bsks, (uint64_t **)(ksks), mem, + num_blocks); + break; + case 16384: + host_integer_signed_overflowing_add_or_sub_kb>( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lhs), static_cast(rhs), + static_cast(overflowed), op, bsks, (uint64_t **)(ksks), mem, + num_blocks); + break; + default: + PANIC("Cuda error (integer signed_overflowing_add_or_sub): unsupported " + "polynomial size. 
" + "Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported") + } +} + +void cleanup_signed_overflowing_add_or_sub(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void) { + int_signed_overflowing_add_or_sub_memory *mem_ptr = + (int_signed_overflowing_add_or_sub_memory *)(*mem_ptr_void); + + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh new file mode 100644 index 0000000000..5feef93e6d --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh @@ -0,0 +1,146 @@ +#ifndef TFHE_RS_ADDITION_CUH +#define TFHE_RS_ADDITION_CUH + +#include "crypto/keyswitch.cuh" +#include "device.h" +#include "integer.h" +#include "integer/comparison.cuh" +#include "integer/integer.cuh" +#include "integer/negation.cuh" +#include "integer/scalar_shifts.cuh" +#include "linear_algebra.h" +#include "programmable_bootstrap.h" +#include "utils/helper.cuh" +#include "utils/kernel_dimensions.cuh" +#include +#include +#include +#include +#include +#include + +template +void host_resolve_signed_overflow( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *result, Torus *last_block_inner_propagation, + Torus *last_block_input_carry, Torus *last_block_output_carry, + int_resolve_signed_overflow_memory *mem, void **bsks, Torus **ksks) { + + auto x = mem->x; + + Torus *d_clears = + (Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]); + + cuda_set_value_async(streams[0], gpu_indexes[0], d_clears, 2, 1); + + // replace with host function call + cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( + streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears, + mem->params.big_lwe_dimension, 1); + + host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, + last_block_inner_propagation, x, mem->params.big_lwe_dimension, + 1); + host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, + last_block_inner_propagation, last_block_input_carry, + mem->params.big_lwe_dimension, 1); + + host_apply_univariate_lut_kb(streams, gpu_indexes, gpu_count, result, + last_block_inner_propagation, + mem->resolve_overflow_lut, ksks, bsks, 1); + + cuda_drop_async(d_clears, streams[0], gpu_indexes[0]); +} + +template +__host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_signed_overflowing_add_or_sub_memory **mem_ptr, + uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params, + bool allocate_gpu_memory) { + + *mem_ptr = new int_signed_overflowing_add_or_sub_memory( + streams, gpu_indexes, gpu_count, params, num_blocks, op, + allocate_gpu_memory); +} + +/* + * Addition - signed_operation = 1 + * Subtraction - signed_operation = -1 + */ +template +__host__ void host_integer_signed_overflowing_add_or_sub_kb( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *lhs, Torus *rhs, Torus *overflowed, SIGNED_OPERATION op, void **bsks, + uint64_t **ksks, + int_signed_overflowing_add_or_sub_memory *mem_ptr, + uint32_t num_blocks) { + + auto radix_params = mem_ptr->params; + + uint32_t big_lwe_dimension = radix_params.big_lwe_dimension; + uint32_t big_lwe_size = big_lwe_dimension + 1; + uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + assert(radix_params.message_modulus >= 4 && radix_params.carry_modulus >= 4); + + auto result = mem_ptr->result; + 
auto input_carries = mem_ptr->input_carries; + auto output_carry = mem_ptr->output_carry; + auto last_block_inner_propagation = mem_ptr->last_block_inner_propagation; + + cuda_memcpy_async_gpu_to_gpu(result, lhs, num_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + // phase 1 + if (op == SIGNED_OPERATION::ADDITION) { + host_addition(streams[0], gpu_indexes[0], result, lhs, rhs, + big_lwe_dimension, num_blocks); + } else { + host_subtraction(streams[0], gpu_indexes[0], result, lhs, rhs, + big_lwe_dimension, num_blocks); + } + + // phase 2 + for (uint j = 0; j < gpu_count; j++) { + cuda_synchronize_stream(streams[j], gpu_indexes[j]); + } + +#pragma omp parallel sections + { + // generate input_carries and output_carry +#pragma omp section + { + host_propagate_single_carry( + mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry, + input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks); + } + + // generate generate_last_block_inner_propagation +#pragma omp section + { + host_generate_last_block_inner_propagation( + mem_ptr->sub_streams_2, gpu_indexes, gpu_count, + last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size], + &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, + bsks, ksks); + } + } + + for (uint j = 0; j < gpu_count; j++) { + cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); + cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); + } + + // phase 3 + auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size]; + + host_resolve_signed_overflow( + streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation, + input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks); + + cuda_memcpy_async_gpu_to_gpu(lhs, result, num_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); +} + +#endif // TFHE_RS_ADDITION_CUH diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 77b9a77178..99ae8e0bba 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -496,6 +496,19 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, num_blocks, message_acc); } +template +void host_generate_last_block_inner_propagation( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *last_block_inner_propagation, Torus *lhs, Torus *rhs, + int_last_block_inner_propagate_memory *mem, void **bsks, + Torus **ksks) { + + integer_radix_apply_bivariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, last_block_inner_propagation, lhs, rhs, + bsks, ksks, 1, mem->last_block_inner_propagation_lut, + mem->params.message_modulus); +} + template void host_propagate_single_sub_borrow(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 621b0ba45d..e36e3c94de 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -1192,4 +1192,46 @@ extern "C" { mem_ptr: *mut *mut i8, ); + pub fn scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + 
grouping_factor: u32, + num_blocks: u32, + signed_operation: i8, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lhs: *mut c_void, + rhs: *const c_void, + overflowed: *mut c_void, + signed_operation: i8, + mem_ptr: *mut i8, + bsks: *const *mut c_void, + ksks: *const *mut c_void, + num_blocks: u32, + ); + + pub fn cleanup_signed_overflowing_add_or_sub( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); } // extern "C" diff --git a/tfhe/benches/integer/signed_bench.rs b/tfhe/benches/integer/signed_bench.rs index 306e84a612..1ff428d87c 100644 --- a/tfhe/benches/integer/signed_bench.rs +++ b/tfhe/benches/integer/signed_bench.rs @@ -1683,6 +1683,11 @@ mod cuda { display_name: max ); + define_cuda_server_key_bench_clean_input_signed_fn!( + method_name: unchecked_signed_overflowing_add, + display_name: overflowing_add + ); + define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_add, display_name: add, @@ -1895,6 +1900,11 @@ mod cuda { display_name: max ); + define_cuda_server_key_bench_clean_input_signed_fn!( + method_name: signed_overflowing_add, + display_name: overflowing_add + ); + define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_add, display_name: add, @@ -2025,6 +2035,7 @@ mod cuda { cuda_unchecked_le, cuda_unchecked_min, cuda_unchecked_max, + cuda_unchecked_signed_overflowing_add, ); criterion_group!( @@ -2072,6 +2083,7 @@ mod cuda { cuda_min, cuda_max, cuda_if_then_else, + cuda_signed_overflowing_add, ); criterion_group!( diff --git a/tfhe/src/high_level_api/integers/signed/overflowing_ops.rs b/tfhe/src/high_level_api/integers/signed/overflowing_ops.rs index 3ae8d62492..6656248a5a 100644 --- a/tfhe/src/high_level_api/integers/signed/overflowing_ops.rs +++ b/tfhe/src/high_level_api/integers/signed/overflowing_ops.rs @@ -1,5 +1,7 @@ use crate::core_crypto::prelude::SignedNumeric; use crate::high_level_api::global_state; +#[cfg(feature = "gpu")] +use crate::high_level_api::global_state::with_thread_local_cuda_streams; use crate::high_level_api::integers::FheIntId; use crate::high_level_api::keys::InternalServerKey; use crate::integer::block_decomposition::DecomposableInto; @@ -48,9 +50,14 @@ where (FheInt::new(result), FheBool::new(overflow)) } #[cfg(feature = "gpu")] - InternalServerKey::Cuda(_) => { - todo!("Cuda devices do not support signed integer"); - } + InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| { + let (result, overflow) = cuda_key.key.signed_overflowing_add( + &self.ciphertext.on_gpu(), + &other.ciphertext.on_gpu(), + streams, + ); + (FheInt::new(result), FheBool::new(overflow)) + }), }) } } diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index 1c8e307c2d..654ddce257 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -2339,3 +2339,76 @@ pub unsafe fn unchecked_unsigned_div_rem_integer_radix_kb_assign_async< std::ptr::addr_of_mut!(mem_ptr), ); } + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub unsafe fn unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async< + T: UnsignedInteger, + B: Numeric, +>( + streams: &CudaStreams, + lhs: &mut CudaVec, + rhs: 
&CudaVec, + overflowed: &mut CudaVec, + signed_operation: i8, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + num_blocks: u32, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, +) { + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_blocks, + signed_operation, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + ); + cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + lhs.as_mut_c_ptr(0), + rhs.as_c_ptr(0), + overflowed.as_mut_c_ptr(0), + signed_operation, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + num_blocks, + ); + cleanup_signed_overflowing_add_or_sub( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} diff --git a/tfhe/src/integer/gpu/server_key/radix/add.rs b/tfhe/src/integer/gpu/server_key/radix/add.rs index d59b9d9197..826f29acc1 100644 --- a/tfhe/src/integer/gpu/server_key/radix/add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/add.rs @@ -2,12 +2,16 @@ use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; use crate::core_crypto::gpu::CudaStreams; use crate::core_crypto::prelude::LweBskGroupingFactor; use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; -use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext}; +use crate::integer::gpu::ciphertext::{ + CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext, +}; use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; use crate::integer::gpu::{ unchecked_add_integer_radix_assign_async, + unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async, unchecked_sum_ciphertexts_integer_radix_kb_assign_async, PBSType, }; +use crate::integer::server_key::radix_parallel::sub::SignedOperation; use crate::shortint::ciphertext::NoiseLevel; impl CudaServerKey { @@ -491,4 +495,240 @@ impl CudaServerKey { (ct_res, ct_overflowed) } + + /// ```rust + /// use tfhe::core_crypto::gpu::CudaStreams; + /// use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext}; + /// use tfhe::integer::gpu::gen_keys_radix_gpu; + /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; + /// + /// let gpu_index = 0; + /// let streams = CudaStreams::new_single_gpu(gpu_index); + /// + /// // Generate the client key and the server key: + /// let num_blocks = 4; + /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, num_blocks, &streams); + /// let total_bits = num_blocks * cks.parameters().message_modulus().0.ilog2() as usize; + /// let modulus = 1 << total_bits; + /// + /// 
let msg1: i8 = 120; + /// let msg2: i8 = 8; + /// + /// let ct1 = cks.encrypt_signed(msg1); + /// let ct2 = cks.encrypt_signed(msg2); + /// + /// // Copy to GPU + /// let d_ct1 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct1, &streams); + /// let d_ct2 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct2, &streams); + /// + /// // Compute homomorphically an overflowing addition: + /// let (d_ct_res, d_ct_overflowed) = sks.signed_overflowing_add(&d_ct1, &d_ct2, &streams); + /// + /// let ct_res = d_ct_res.to_signed_radix_ciphertext(&streams); + /// let ct_overflowed = d_ct_overflowed.to_boolean_block(&streams); + /// + /// // Decrypt: + /// let dec_result: i8 = cks.decrypt_signed(&ct_res); + /// let dec_overflowed: bool = cks.decrypt_bool(&ct_overflowed); + /// let (clear_result, clear_overflowed) = msg1.overflowing_add(msg2); + /// assert_eq!(dec_result, clear_result); + /// assert_eq!(dec_overflowed, clear_overflowed); + /// ``` + pub fn signed_overflowing_add( + &self, + ct_left: &CudaSignedRadixCiphertext, + ct_right: &CudaSignedRadixCiphertext, + stream: &CudaStreams, + ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) { + let mut tmp_lhs; + let mut tmp_rhs; + let (lhs, rhs) = match ( + ct_left.block_carries_are_empty(), + ct_right.block_carries_are_empty(), + ) { + (true, true) => (ct_left, ct_right), + (true, false) => { + unsafe { + tmp_rhs = ct_right.duplicate_async(stream); + self.full_propagate_assign_async(&mut tmp_rhs, stream); + } + (ct_left, &tmp_rhs) + } + (false, true) => { + unsafe { + tmp_lhs = ct_left.duplicate_async(stream); + self.full_propagate_assign_async(&mut tmp_lhs, stream); + } + (&tmp_lhs, ct_right) + } + (false, false) => { + unsafe { + tmp_lhs = ct_left.duplicate_async(stream); + tmp_rhs = ct_right.duplicate_async(stream); + + self.full_propagate_assign_async(&mut tmp_lhs, stream); + self.full_propagate_assign_async(&mut tmp_rhs, stream); + } + + (&tmp_lhs, &tmp_rhs) + } + }; + + self.unchecked_signed_overflowing_add(lhs, rhs, stream) + } + + pub fn unchecked_signed_overflowing_add( + &self, + ct_left: &CudaSignedRadixCiphertext, + ct_right: &CudaSignedRadixCiphertext, + stream: &CudaStreams, + ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) { + assert_eq!( + ct_left.as_ref().d_blocks.lwe_ciphertext_count().0, + ct_right.as_ref().d_blocks.lwe_ciphertext_count().0, + "lhs and rhs must have the name number of blocks ({} vs {})", + ct_left.as_ref().d_blocks.lwe_ciphertext_count().0, + ct_right.as_ref().d_blocks.lwe_ciphertext_count().0 + ); + assert!( + ct_left.as_ref().d_blocks.lwe_ciphertext_count().0 > 0, + "inputs cannot be empty" + ); + + self.unchecked_signed_overflowing_add_or_sub( + ct_left, + ct_right, + SignedOperation::Addition, + stream, + ) + } + + pub(crate) fn unchecked_signed_overflowing_add_or_sub( + &self, + lhs: &CudaSignedRadixCiphertext, + rhs: &CudaSignedRadixCiphertext, + signed_operation: SignedOperation, + streams: &CudaStreams, + ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) { + assert!(self.message_modulus.0 >= 4 && self.carry_modulus.0 >= 4); + + let mut result: CudaSignedRadixCiphertext; + unsafe { + result = lhs.duplicate_async(streams); + } + let carry_out: CudaSignedRadixCiphertext = self.create_trivial_zero_radix(1, streams); + let mut overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(carry_out.ciphertext); + + unsafe { + self.unchecked_signed_overflowing_add_or_sub_assign_async( + &mut result, + rhs, + &mut overflowed, + signed_operation, + streams, + ); + streams.synchronize(); + } + + 
(result, overflowed) + } + + /// # Safety + /// + /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until stream is synchronized + pub(crate) unsafe fn unchecked_signed_overflowing_add_or_sub_assign_async( + &self, + lhs: &mut CudaSignedRadixCiphertext, + rhs: &CudaSignedRadixCiphertext, + overflowed: &mut CudaBooleanBlock, + signed_operation: SignedOperation, + streams: &CudaStreams, + ) { + if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + { + overflowed.as_mut().ciphertext.info = overflowed + .as_ref() + .ciphertext + .info + .boolean_info(NoiseLevel::ZERO); + } else { + overflowed.as_mut().ciphertext.info = overflowed + .as_ref() + .ciphertext + .info + .boolean_info(NoiseLevel::NOMINAL); + } + let num_blocks = lhs.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; + let signed_operation_numeric: i8 = + if matches!(signed_operation, SignedOperation::Subtraction) { + -1 + } else { + 1 + }; + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async( + streams, + &mut lhs.as_mut().d_blocks.0.d_vec, + &rhs.as_ref().d_blocks.0.d_vec, + &mut overflowed.as_mut().ciphertext.d_blocks.0.d_vec, + signed_operation_numeric, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + num_blocks, + PBSType::Classical, + LweBskGroupingFactor(0), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async( + streams, + &mut lhs.as_mut().d_blocks.0.d_vec, + &rhs.as_ref().d_blocks.0.d_vec, + &mut overflowed.as_mut().ciphertext.d_blocks.0.d_vec, + signed_operation_numeric, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + num_blocks, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + ); + } + }; + + lhs.as_mut().info = lhs + .as_ref() + .info + .after_overflowing_add(&rhs.ciphertext.info); + } } diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_signed/mod.rs b/tfhe/src/integer/gpu/server_key/radix/tests_signed/mod.rs index 74368a1ffe..1279e6d727 100644 --- a/tfhe/src/integer/gpu/server_key/radix/tests_signed/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/tests_signed/mod.rs @@ -350,3 +350,42 @@ where d_res.to_signed_radix_ciphertext(&context.streams) } } +impl<'a, F> + FunctionExecutor< + (&'a SignedRadixCiphertext, &'a SignedRadixCiphertext), + (SignedRadixCiphertext, BooleanBlock), + > for GpuFunctionExecutor +where + F: Fn( + &CudaServerKey, + &CudaSignedRadixCiphertext, + &CudaSignedRadixCiphertext, + &CudaStreams, + ) -> (CudaSignedRadixCiphertext, 
CudaBooleanBlock), +{ + fn setup(&mut self, cks: &RadixClientKey, sks: Arc<ServerKey>) { + self.setup_from_keys(cks, &sks); + } + + fn execute( + &mut self, + input: (&'a SignedRadixCiphertext, &'a SignedRadixCiphertext), + ) -> (SignedRadixCiphertext, BooleanBlock) { + let context = self + .context + .as_ref() + .expect("setup was not properly called"); + + let d_ctxt_1: CudaSignedRadixCiphertext = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(input.0, &context.streams); + let d_ctxt_2: CudaSignedRadixCiphertext = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(input.1, &context.streams); + + let (d_res, d_res_bool) = (self.func)(&context.sks, &d_ctxt_1, &d_ctxt_2, &context.streams); + + ( + d_res.to_signed_radix_ciphertext(&context.streams), + d_res_bool.to_boolean_block(&context.streams), + ) + } +} diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_signed/test_add.rs b/tfhe/src/integer/gpu/server_key/radix/tests_signed/test_add.rs index 185a1ae45d..d1ac54a8ec 100644 --- a/tfhe/src/integer/gpu/server_key/radix/tests_signed/test_add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/tests_signed/test_add.rs @@ -3,13 +3,16 @@ use crate::integer::gpu::server_key::radix::tests_unsigned::{ }; use crate::integer::gpu::CudaServerKey; use crate::integer::server_key::radix_parallel::tests_signed::test_add::{ - signed_default_add_test, signed_unchecked_add_test, + signed_default_add_test, signed_unchecked_add_test, signed_unchecked_overflowing_add_test, }; use crate::shortint::parameters::*; create_gpu_parametrized_test!(integer_unchecked_add); create_gpu_parametrized_test!(integer_add); +create_gpu_parametrized_test!(integer_unchecked_signed_overflowing_add); +create_gpu_parametrized_test!(integer_signed_overflowing_add); + fn integer_unchecked_add<P>

(param: P) where P: Into<PBSParameters>, @@ -25,3 +28,19 @@ where let executor = GpuFunctionExecutor::new(&CudaServerKey::add); signed_default_add_test(param, executor); } + +fn integer_unchecked_signed_overflowing_add<P>

(param: P) +where + P: Into<PBSParameters>, +{ + let executor = GpuFunctionExecutor::new(&CudaServerKey::unchecked_signed_overflowing_add); + signed_unchecked_overflowing_add_test(param, executor); +} + +fn integer_signed_overflowing_add<P>

(param: P) +where + P: Into<PBSParameters>, +{ + let executor = GpuFunctionExecutor::new(&CudaServerKey::signed_overflowing_add); + signed_unchecked_overflowing_add_test(param, executor); +}
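
Note on `f_last_block_inner_propagation_lut` (integer.h): the LUT looks only at the bits of the last block that sit below the sign bit and classifies the pair of last blocks as GENERATED, PROPAGATED or NONE. The sketch below restates that classification in plain Rust for 2-bit message blocks (message_modulus = 4); the names and the `main` driver are illustrative only and not part of the backend code.

```rust
// Standalone illustration of the last-block inner propagation classification,
// assuming 2-bit message blocks (message_modulus = 4).
#[derive(Debug, PartialEq)]
enum OutputCarry {
    None,
    Generated,
    Propagated,
}

fn last_block_inner_propagation(lhs_block: u64, rhs_block: u64, subtraction: bool) -> OutputCarry {
    let bits_of_message = 2u32;
    let message_bit_mask: u64 = (1 << bits_of_message) - 1;

    // For subtraction the rhs block is bit-flipped; the missing "+1" of the
    // two's complement is supplied later by the carry-propagation step.
    let rhs = if subtraction { !rhs_block } else { rhs_block };

    // Drop the last (sign) bit of each block and add what remains.
    let local_lhs = (lhs_block << 1) & message_bit_mask;
    let local_rhs = (rhs << 1) & message_bit_mask;
    let whole = local_lhs + local_rhs;

    let carry = whole >> bits_of_message;
    let result = (whole & message_bit_mask) >> 1;

    if carry == 1 {
        // The bits below the sign bit already generate a carry into it.
        OutputCarry::Generated
    } else if result == ((4 - 1) >> 1) {
        // The bits below the sign bit are all ones: an incoming carry would
        // ripple through them into the sign bit.
        OutputCarry::Propagated
    } else {
        OutputCarry::None
    }
}

fn main() {
    // Lower bits 0 + 1 give the all-ones pattern: an input carry would propagate.
    assert_eq!(last_block_inner_propagation(0b10, 0b01, false), OutputCarry::Propagated);
    // Lower bits 1 + 1 generate a carry into the sign bit.
    assert_eq!(last_block_inner_propagation(0b01, 0b01, false), OutputCarry::Generated);
    // Lower bits 0 + 0: nothing generated, nothing to propagate.
    assert_eq!(last_block_inner_propagation(0b10, 0b00, false), OutputCarry::None);
}
```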
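
Note on `f_resolve_overflow_lut`: the packed input carries the inner propagation state in the carry space, the block's output carry in bit 1 and the block's input carry in bit 0, and overflow is flagged exactly when the carry that reaches the sign bit differs from the carry leaving the block. A plain-Rust restatement of that rule, with illustrative names, not the actual LUT code:

```rust
// Clear-text form of the overflow criterion used by the resolve-overflow LUT.
enum CarryPropagation {
    None,
    Generated,
    Propagated,
}

fn signed_overflow(
    propagation: CarryPropagation, // inner propagation state of the last block
    input_carry_of_block: u64,     // carry entering the last block (0 or 1)
    output_carry_of_block: u64,    // carry leaving the last block (0 or 1)
) -> bool {
    // Carry that actually reaches the sign bit of the last block.
    let input_carry_to_last_bit = match propagation {
        CarryPropagation::Generated => 1,
        CarryPropagation::Propagated => input_carry_of_block,
        CarryPropagation::None => 0,
    };
    input_carry_to_last_bit != output_carry_of_block
}

fn main() {
    // 7 + 1 on a single 4-bit signed block: the lower bits generate a carry into
    // the sign bit, no carry leaves the block, so the addition overflows.
    assert!(signed_overflow(CarryPropagation::Generated, 0, 0));
    // 7 + (-1): a carry also leaves the block, so there is no overflow.
    assert!(!signed_overflow(CarryPropagation::Generated, 0, 1));
    // All lower bits at one and an incoming carry: it reaches the sign bit.
    assert!(signed_overflow(CarryPropagation::Propagated, 1, 0));
    // Nothing generated, nothing propagated, nothing leaves: no overflow.
    assert!(!signed_overflow(CarryPropagation::None, 0, 0));
}
```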
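
Note on `host_integer_signed_overflowing_add_or_sub_kb`: phase 1 adds (or subtracts) blockwise, phase 2 runs carry propagation and the last-block inner propagation on two sub-streams, and phase 3 compares the carry entering the sign bit with the carry leaving the number. The same criterion applied to clear `i8` values reproduces `overflowing_add`; a small sketch assuming nothing beyond the standard library:

```rust
// Clear-text analogue of the three phases, on i8 instead of radix ciphertexts:
// overflow holds exactly when the carry into the sign bit differs from the
// carry out of the number.
fn overflowing_add_via_carries(lhs: i8, rhs: i8) -> (i8, bool) {
    let a = lhs as u8 as u16;
    let b = rhs as u8 as u16;
    let sum = a + b;
    // Carry entering bit 7 (the sign bit)...
    let carry_into_sign = ((a & 0x7f) + (b & 0x7f)) >> 7;
    // ...and carry leaving bit 7.
    let carry_out = sum >> 8;
    ((sum as u8) as i8, carry_into_sign != carry_out)
}

fn main() {
    for (x, y) in [(120i8, 8i8), (-100, -50), (3, 4), (-128, -1)] {
        assert_eq!(overflowing_add_via_carries(x, y), x.overflowing_add(y));
    }
}
```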
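
Note on the FFI convention: the Rust wrappers pass `signed_operation` as `1` for addition and `-1` for subtraction, and the C entry points map any value other than `1` to `SIGNED_OPERATION::SUBTRACTION`. A minimal standalone sketch of that round trip; the enum here mirrors the crate's `SignedOperation` but is redefined locally for illustration:

```rust
// Hypothetical helpers mirroring the +1 / -1 convention used across the C FFI
// boundary (SIGNED_OPERATION::ADDITION = 1, SUBTRACTION = -1).
#[derive(Clone, Copy, PartialEq, Debug)]
enum SignedOperation {
    Addition,
    Subtraction,
}

fn to_ffi_code(op: SignedOperation) -> i8 {
    if matches!(op, SignedOperation::Subtraction) { -1 } else { 1 }
}

fn from_ffi_code(code: i8) -> SignedOperation {
    // The C side only checks for `== 1`; everything else is subtraction.
    if code == 1 { SignedOperation::Addition } else { SignedOperation::Subtraction }
}

fn main() {
    assert_eq!(from_ffi_code(to_ffi_code(SignedOperation::Addition)), SignedOperation::Addition);
    assert_eq!(from_ffi_code(to_ffi_code(SignedOperation::Subtraction)), SignedOperation::Subtraction);
}
```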