From 51cae3d3ac7d1734fda4d3e4e1ae35c3c3da62af Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Fri, 27 Sep 2024 17:14:15 -0300
Subject: [PATCH] fix(gpu): fix the indexes used in compression

- also general minor fixes to compression
---
 .../cuda/include/ciphertext.h                 |   2 +-
 .../cuda/include/compression.h                |  12 +-
 .../cuda/src/crypto/ciphertext.cu             |  16 +--
 .../src/integer/compression/compression.cu    |  11 +-
 .../src/integer/compression/compression.cuh   | 108 +++++++++++-------
 backends/tfhe-cuda-backend/src/cuda_bind.rs   |   4 +-
 .../ciphertext/compressed_ciphertext_list.rs  |  30 ++---
 .../gpu/list_compression/server_keys.rs       |  43 +++----
 8 files changed, 125 insertions(+), 101 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
index bba21890b7..3b2559eb69 100644
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -18,7 +18,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
 void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
                                  void *lwe_array_out, void *glwe_array_in,
-                                 uint32_t *nth_array, uint32_t num_glwes,
+                                 uint32_t *nth_array, uint32_t num_nths,
                                  uint32_t glwe_dimension, uint32_t polynomial_size);
 };
diff --git a/backends/tfhe-cuda-backend/cuda/include/compression.h b/backends/tfhe-cuda-backend/cuda/include/compression.h
index 537113dcae..1154b0a46d 100644
--- a/backends/tfhe-cuda-backend/cuda/include/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/compression.h
@@ -8,7 +8,7 @@ void scratch_cuda_integer_compress_radix_ciphertext_64(
     void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
     uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
     uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
     PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
     bool allocate_gpu_memory);
@@ -17,7 +17,7 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
     uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
     uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
     uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
     PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
     bool allocate_gpu_memory);
@@ -96,7 +96,7 @@ template struct int_decompression {
   uint32_t storage_log_modulus;
-  uint32_t num_lwes;
+  uint32_t num_radix_blocks;
   uint32_t body_count;
   Torus *tmp_extracted_glwe;
@@ -113,7 +113,7 @@ template struct int_decompression {
     this->encryption_params = encryption_params;
     this->compression_params = compression_params;
     this->storage_log_modulus = storage_log_modulus;
-    this->num_lwes = num_radix_blocks;
+    this->num_radix_blocks = num_radix_blocks;
     this->body_count = body_count;
     if (allocate_gpu_memory) {
@@ -134,7 +134,7 @@ template struct int_decompression {
     tmp_extracted_lwe = (Torus *)cuda_malloc_async(
         num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
         gpu_indexes[0]);
-    // Decompression
+    // Carry extract LUT
     auto carry_extract_f = [encryption_params](Torus x) -> Torus {
       return x / encryption_params.message_modulus;
@@ -157,7 +157,7 @@ template struct
int_decompression { cuda_drop_async(tmp_indexes_array, streams[0], gpu_indexes[0]); carry_extract_lut->release(streams, gpu_indexes, gpu_count); - delete (carry_extract_lut); + delete carry_extract_lut; } }; #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu index 869e482fd4..a898eabd6b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu @@ -23,7 +23,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream, void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index, void *lwe_array_out, void *glwe_array_in, - uint32_t *nth_array, uint32_t num_glwes, + uint32_t *nth_array, uint32_t num_nths, uint32_t glwe_dimension, uint32_t polynomial_size) { @@ -31,43 +31,43 @@ void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index, case 256: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes, + (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, glwe_dimension); break; case 512: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes, + (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, glwe_dimension); break; case 1024: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes, + (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, glwe_dimension); break; case 2048: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes, + (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, glwe_dimension); break; case 4096: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes, + (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, glwe_dimension); break; case 8192: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes, + (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, glwe_dimension); break; case 16384: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes, + (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, glwe_dimension); break; default: diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu index 6087b89746..a40aedd7cb 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu @@ -4,7 +4,7 @@ void scratch_cuda_integer_compress_radix_ciphertext_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus, + uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus, bool allocate_gpu_memory) { @@ -16,15 +16,16 @@ void 
scratch_cuda_integer_compress_radix_ciphertext_64( scratch_cuda_compress_integer_radix_ciphertext( (cudaStream_t *)(streams), gpu_indexes, gpu_count, - (int_compression **)mem_ptr, num_lwes, compression_params, - lwe_per_glwe, storage_log_modulus, allocate_gpu_memory); + (int_compression **)mem_ptr, num_radix_blocks, + compression_params, lwe_per_glwe, storage_log_modulus, + allocate_gpu_memory); } void scratch_cuda_integer_decompress_radix_ciphertext_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus, + uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count, bool allocate_gpu_memory) { @@ -41,7 +42,7 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64( scratch_cuda_integer_decompress_radix_ciphertext( (cudaStream_t *)(streams), gpu_indexes, gpu_count, - (int_decompression **)mem_ptr, num_lwes, body_count, + (int_decompression **)mem_ptr, num_radix_blocks, body_count, encryption_params, compression_params, storage_log_modulus, allocate_gpu_memory); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh index c93f4f11ef..16d4a119fa 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -12,7 +12,7 @@ template __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus, - uint32_t num_glwes, uint32_t in_len, uint32_t out_len) { + uint32_t num_coeffs, uint32_t in_len, uint32_t out_len) { auto nbits = sizeof(Torus) * 8; auto tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -21,7 +21,7 @@ __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus, auto chunk_array_in = array_in + glwe_index * in_len; auto chunk_array_out = array_out + glwe_index * out_len; - if (tid < num_glwes * out_len) { + if (tid < num_coeffs) { auto k = nbits * i / log_modulus; auto j = k; @@ -44,11 +44,15 @@ __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus, template __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in, uint32_t num_glwes, - int_compression *mem_ptr) { + uint32_t num_lwes, int_compression *mem_ptr) { + if (array_in == array_out) + PANIC("Cuda error: Input and output must be different"); + cudaSetDevice(gpu_index); auto compression_params = mem_ptr->compression_params; auto log_modulus = mem_ptr->storage_log_modulus; + // [0..num_glwes-1) GLWEs auto in_len = (compression_params.glwe_dimension + 1) * compression_params.polynomial_size; auto number_bits_to_pack = in_len * log_modulus; @@ -56,20 +60,35 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index, // number_bits_to_pack.div_ceil(Scalar::BITS) auto out_len = (number_bits_to_pack + nbits - 1) / nbits; + // Last GLWE + auto last_body_count = num_lwes % compression_params.polynomial_size; + in_len = + compression_params.glwe_dimension * compression_params.polynomial_size + + last_body_count; + number_bits_to_pack = in_len * log_modulus; + auto last_out_len = (number_bits_to_pack + nbits - 1) / nbits; + + auto 
num_coeffs = (num_glwes - 1) * out_len + last_out_len; + int num_blocks = 0, num_threads = 0; - getNumBlocksAndThreads(num_glwes * out_len, 1024, num_blocks, num_threads); + getNumBlocksAndThreads(num_coeffs, 1024, num_blocks, num_threads); dim3 grid(num_blocks); dim3 threads(num_threads); + cuda_memset_async(array_out, 0, + num_glwes * (compression_params.glwe_dimension + 1) * + compression_params.polynomial_size * sizeof(Torus), + stream, gpu_index); pack<<>>(array_out, array_in, log_modulus, - num_glwes, in_len, out_len); + num_coeffs, in_len, out_len); + check_cuda_error(cudaGetLastError()); } template __host__ void host_integer_compress(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, Torus *glwe_array_out, Torus *lwe_array_in, - Torus **fp_ksk, uint32_t num_lwes, + Torus **fp_ksk, uint32_t num_radix_blocks, int_compression *mem_ptr) { auto compression_params = mem_ptr->compression_params; @@ -80,21 +99,23 @@ __host__ void host_integer_compress(cudaStream_t *streams, host_cleartext_multiplication( streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in, (uint64_t)compression_params.message_modulus, input_lwe_dimension, - num_lwes); + num_radix_blocks); uint32_t lwe_in_size = input_lwe_dimension + 1; uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) * compression_params.polynomial_size; - uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1; + uint32_t num_glwes_for_compression = + num_radix_blocks / mem_ptr->lwe_per_glwe + 1; // Keyswitch LWEs to GLWE auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out; cuda_memset_async(tmp_glwe_array_out, 0, - num_glwes * (compression_params.glwe_dimension + 1) * + num_glwes_for_compression * + (compression_params.glwe_dimension + 1) * compression_params.polynomial_size * sizeof(Torus), streams[0], gpu_indexes[0]); auto fp_ks_buffer = mem_ptr->fp_ks_buffer; - auto rem_lwes = num_lwes; + auto rem_lwes = num_radix_blocks; auto lwe_subset = lwe_shifted; auto glwe_out = tmp_glwe_array_out; @@ -115,13 +136,13 @@ __host__ void host_integer_compress(cudaStream_t *streams, // Modulus switch host_modulus_switch_inplace( streams[0], gpu_indexes[0], tmp_glwe_array_out, - num_glwes * (compression_params.glwe_dimension + 1) * + num_glwes_for_compression * (compression_params.glwe_dimension + 1) * compression_params.polynomial_size, mem_ptr->storage_log_modulus); - check_cuda_error(cudaGetLastError()); host_pack(streams[0], gpu_indexes[0], glwe_array_out, - tmp_glwe_array_out, num_glwes, mem_ptr); + tmp_glwe_array_out, num_glwes_for_compression, + num_radix_blocks, mem_ptr); } template @@ -160,11 +181,15 @@ __global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index, } } +/// Extracts the glwe_index-nth GLWE ciphertext template __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index, Torus *glwe_array_out, Torus *array_in, uint32_t glwe_index, int_decompression *mem_ptr) { + if (array_in == glwe_array_out) + PANIC("Cuda error: Input and output must be different"); + cudaSetDevice(gpu_index); auto compression_params = mem_ptr->compression_params; @@ -221,7 +246,10 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, "be smaller than " "polynomial_size.") - auto num_lwes = h_mem_ptr->num_lwes; + auto num_radix_blocks = h_mem_ptr->num_radix_blocks; + if (num_radix_blocks != indexes_array_size) + PANIC("Cuda error: wrong number of LWEs in decompress: the number of LWEs " + "should be the same as indexes_array_size.") // the first element is the last index in h_indexes_array that 
lies in the // related GLWE @@ -251,23 +279,23 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, } } // Sample extract all LWEs - Torus lwe_accumulator_size = - (compression_params.glwe_dimension * compression_params.polynomial_size + - 1); + Torus lwe_accumulator_size = compression_params.small_lwe_dimension + 1; auto extracted_lwe = h_mem_ptr->tmp_extracted_lwe; uint32_t current_idx = 0; + auto d_indexes_array_chunk = d_indexes_array; for (const auto &max_idx_and_glwe : glwe_vec) { - uint32_t max_idx = max_idx_and_glwe.first; + uint32_t last_idx = max_idx_and_glwe.first; extracted_glwe = max_idx_and_glwe.second; - cuda_glwe_sample_extract_64( - streams[0], gpu_indexes[0], extracted_lwe, extracted_glwe, - d_indexes_array, max_idx + 1 - current_idx, - compression_params.glwe_dimension, compression_params.polynomial_size); - + auto num_lwes = last_idx + 1 - current_idx; + cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe, + extracted_glwe, d_indexes_array_chunk, num_lwes, + compression_params.glwe_dimension, + compression_params.polynomial_size); + d_indexes_array_chunk += num_lwes; extracted_lwe += lwe_accumulator_size; - current_idx = max_idx; + current_idx = last_idx; } // Reset @@ -280,9 +308,8 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, /// dimension to a big LWE dimension auto encryption_params = h_mem_ptr->encryption_params; auto lut = h_mem_ptr->carry_extract_lut; - auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count); + auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); if (active_gpu_count == 1) { - execute_pbs_async( streams, gpu_indexes, active_gpu_count, d_lwe_array_out, lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe, @@ -291,7 +318,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, compression_params.small_lwe_dimension, encryption_params.polynomial_size, encryption_params.pbs_base_log, encryption_params.pbs_level, encryption_params.grouping_factor, - num_lwes, encryption_params.pbs_type, lut_count, lut_stride); + num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride); } else { /// For multi GPU execution we create vectors of pointers for inputs and /// outputs @@ -306,7 +333,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, /// gather data to GPU 0 we can copy back to the original indexing multi_gpu_scatter_lwe_async( streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe, - lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes, + lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks, compression_params.small_lwe_dimension + 1); /// Apply PBS @@ -318,14 +345,14 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, compression_params.small_lwe_dimension, encryption_params.polynomial_size, encryption_params.pbs_base_log, encryption_params.pbs_level, encryption_params.grouping_factor, - num_lwes, encryption_params.pbs_type, lut_count, lut_stride); + num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride); /// Copy data back to GPU 0 and release vecs - multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count, - d_lwe_array_out, lwe_after_pbs_vec, - lut->h_lwe_indexes_out, - lut->using_trivial_lwe_indexes, num_lwes, - encryption_params.big_lwe_dimension + 1); + multi_gpu_gather_lwe_async( + streams, gpu_indexes, active_gpu_count, d_lwe_array_out, + lwe_after_pbs_vec, lut->h_lwe_indexes_out, + 
lut->using_trivial_lwe_indexes, num_radix_blocks, + encryption_params.big_lwe_dimension + 1); /// Synchronize all GPUs for (uint i = 0; i < active_gpu_count; i++) { @@ -337,24 +364,25 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, template __host__ void scratch_cuda_compress_integer_radix_ciphertext( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_compression **mem_ptr, uint32_t num_lwes, + int_compression **mem_ptr, uint32_t num_radix_blocks, int_radix_params compression_params, uint32_t lwe_per_glwe, uint32_t storage_log_modulus, bool allocate_gpu_memory) { *mem_ptr = new int_compression( - streams, gpu_indexes, gpu_count, compression_params, num_lwes, + streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks, lwe_per_glwe, storage_log_modulus, allocate_gpu_memory); } template __host__ void scratch_cuda_integer_decompress_radix_ciphertext( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_decompression **mem_ptr, uint32_t num_lwes, uint32_t body_count, - int_radix_params encryption_params, int_radix_params compression_params, - uint32_t storage_log_modulus, bool allocate_gpu_memory) { + int_decompression **mem_ptr, uint32_t num_radix_blocks, + uint32_t body_count, int_radix_params encryption_params, + int_radix_params compression_params, uint32_t storage_log_modulus, + bool allocate_gpu_memory) { *mem_ptr = new int_decompression( streams, gpu_indexes, gpu_count, encryption_params, compression_params, - num_lwes, body_count, storage_log_modulus, allocate_gpu_memory); + num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory); } #endif diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index d6ca49755a..fa6e82335d 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -96,7 +96,7 @@ extern "C" { lwe_dimension: u32, ks_level: u32, ks_base_log: u32, - num_lwes: u32, + num_radix_blocks: u32, message_modulus: u32, carry_modulus: u32, pbs_type: u32, @@ -117,7 +117,7 @@ extern "C" { lwe_dimension: u32, pbs_level: u32, pbs_base_log: u32, - num_lwes: u32, + num_radix_blocks: u32, message_modulus: u32, carry_modulus: u32, pbs_type: u32, diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs index 518a57b730..eb289218f8 100644 --- a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs +++ b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs @@ -129,19 +129,19 @@ impl CudaCompressedCiphertextList { /// use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder; /// use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext}; /// use tfhe::integer::gpu::gen_keys_radix_gpu; - /// use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64; - /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64; + /// use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; /// - /// let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64); + /// let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64); /// /// let private_compression_key = - /// 
cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64); + /// cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64); /// /// let streams = CudaStreams::new_multi_gpu(); /// /// let num_blocks = 32; /// let (radix_cks, _) = gen_keys_radix_gpu( - /// PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64, + /// PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, /// num_blocks, /// &streams, /// ); @@ -268,19 +268,19 @@ impl CompressedCiphertextList { /// use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext}; /// use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; /// use tfhe::integer::gpu::gen_keys_radix_gpu; - /// use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64; - /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64; + /// use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; /// - /// let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64); + /// let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64); /// /// let private_compression_key = - /// cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64); + /// cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64); /// /// let streams = CudaStreams::new_multi_gpu(); /// /// let num_blocks = 32; /// let (radix_cks, _) = gen_keys_radix_gpu( - /// PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64, + /// PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, /// num_blocks, /// &streams, /// ); @@ -514,8 +514,8 @@ mod tests { use super::*; use crate::integer::gpu::gen_keys_radix_gpu; use crate::integer::ClientKey; - use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64; - use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64; + use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; use rand::Rng; const NB_TESTS: usize = 10; @@ -523,16 +523,16 @@ mod tests { #[test] fn test_gpu_ciphertext_compression() { - let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64); + let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64); let private_compression_key = - cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64); + cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64); let streams = CudaStreams::new_multi_gpu(); let num_blocks = 32; let (radix_cks, _) = gen_keys_radix_gpu( - PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64, + PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, num_blocks, &streams, ); diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs index 683369e293..ebfb0f6496 100644 --- a/tfhe/src/integer/gpu/list_compression/server_keys.rs +++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs @@ -92,32 +92,27 @@ impl CudaCompressionKey { let lwe_ciphertext_count = LweCiphertextCount(total_num_blocks); let gpu_index = streams.gpu_indexes[0]; - let d_vec = unsafe { - let mut d_vec = CudaVec::new_async( - lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0, - streams, - gpu_index, + let mut d_vec = CudaVec::new_async( + 
lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0, + streams, + gpu_index, + ); + let mut offset: usize = 0; + for ciphertext in vec_ciphertexts { + let dest_ptr = d_vec + .as_mut_c_ptr(gpu_index) + .add(offset * std::mem::size_of::()); + let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::(); + cuda_memcpy_async_gpu_to_gpu( + dest_ptr, + ciphertext.d_blocks.0.d_vec.as_c_ptr(gpu_index), + size as u64, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], ); - let mut offset: usize = 0; - for ciphertext in vec_ciphertexts { - let dest_ptr = d_vec - .as_mut_c_ptr(gpu_index) - .add(offset * std::mem::size_of::()); - let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::(); - cuda_memcpy_async_gpu_to_gpu( - dest_ptr, - ciphertext.d_blocks.0.d_vec.as_c_ptr(gpu_index), - size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], - ); - offset += ciphertext.d_blocks.0.d_vec.len; - } - - streams.synchronize(); - d_vec - }; + offset += ciphertext.d_blocks.0.d_vec.len; + } CudaLweCiphertextList::from_cuda_vec(d_vec, lwe_ciphertext_count, ciphertext_modulus) }
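One of the index fixes above is easiest to follow in host_pack: instead of bounding the pack kernel by num_glwes * out_len, the packed length is now computed per GLWE, with the last GLWE only contributing the bodies it actually holds. Below is a minimal standalone sketch of that arithmetic, assuming a 64-bit Torus; the helper names div_ceil_u64 and packed_word_count are illustrative only and not part of the backend.

#include <cstdint>

// Hypothetical helper (not part of the patch): integer ceiling division.
static uint64_t div_ceil_u64(uint64_t a, uint64_t b) { return (a + b - 1) / b; }

// Mirrors the length arithmetic host_pack now uses: every full GLWE packs
// (glwe_dimension + 1) * polynomial_size coefficients of log_modulus bits
// each, while the last GLWE packs only its mask plus the bodies actually
// used (num_lwes % polynomial_size of them).
uint64_t packed_word_count(uint64_t glwe_dimension, uint64_t polynomial_size,
                           uint64_t log_modulus, uint64_t num_glwes,
                           uint64_t num_lwes) {
  const uint64_t nbits = 64; // sizeof(Torus) * 8 for a 64-bit Torus

  // Full GLWEs.
  uint64_t full_in_len = (glwe_dimension + 1) * polynomial_size;
  uint64_t full_out_len = div_ceil_u64(full_in_len * log_modulus, nbits);

  // Last GLWE, with a partial body count.
  uint64_t last_body_count = num_lwes % polynomial_size;
  uint64_t last_in_len = glwe_dimension * polynomial_size + last_body_count;
  uint64_t last_out_len = div_ceil_u64(last_in_len * log_modulus, nbits);

  // Corresponds to num_coeffs in the patch: the bound for the pack kernel.
  return (num_glwes - 1) * full_out_len + last_out_len;
}

Because the last GLWE generally packs fewer than (glwe_dimension + 1) * polynomial_size coefficients, using this exact count as the thread bound, together with the added cuda_memset_async on the output buffer, keeps the pack kernel from indexing past the shorter final chunk.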