From 3ba61c0694b22b98da17f8936110c354f2077c49 Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Thu, 25 Jul 2024 16:54:21 -0300
Subject: [PATCH] refactor(gpu): fix sample extraction when nth > 0 and keep input unchanged

---
 .../cuda/include/ciphertext.h                 |   7 +
 .../cuda/src/crypto/ciphertext.cu             |  56 ++++++++
 .../cuda/src/crypto/ciphertext.cuh            |  36 +++++
 .../cuda/src/polynomial/functions.cuh         |  58 ++++----
 backends/tfhe-cuda-backend/src/cuda_bind.rs   |  10 ++
 .../gpu/algorithms/glwe_sample_extraction.rs  |  60 +++++++++
 tfhe/src/core_crypto/gpu/algorithms/mod.rs    |   1 +
 .../algorithms/test/glwe_sample_extraction.rs | 125 ++++++++++++++++++
 .../core_crypto/gpu/algorithms/test/mod.rs    |   1 +
 tfhe/src/core_crypto/gpu/mod.rs               |  26 ++++
 10 files changed, 346 insertions(+), 34 deletions(-)
 create mode 100644 tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
 create mode 100644 tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs

diff --git a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
index 2fc1d52c1d..bba21890b7 100644
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -1,6 +1,7 @@
 #ifndef CUDA_CIPHERTEXT_H
 #define CUDA_CIPHERTEXT_H
 
+#include "device.h"
 #include <cstdint>
 
 extern "C" {
@@ -14,5 +15,11 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
                                                   void *dest, void *src,
                                                   uint32_t number_of_cts,
                                                   uint32_t lwe_dimension);
+
+void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
+                                 void *lwe_array_out, void *glwe_array_in,
+                                 uint32_t *nth_array, uint32_t num_nths,
+                                 uint32_t glwe_dimension,
+                                 uint32_t polynomial_size);
 };
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
index ac7c591b00..869e482fd4 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -1,4 +1,5 @@
 #include "ciphertext.cuh"
+#include "polynomial/parameters.cuh"
 
 void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                   uint32_t gpu_index,
@@ -19,3 +20,58 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
       static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
       (uint64_t *)src, number_of_cts, lwe_dimension);
 }
+
+void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
+                                 void *lwe_array_out, void *glwe_array_in,
+                                 uint32_t *nth_array, uint32_t num_nths,
+                                 uint32_t glwe_dimension,
+                                 uint32_t polynomial_size) {
+
+  switch (polynomial_size) {
+  case 256:
+    host_sample_extract<uint64_t, AmortizedDegree<256>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  case 512:
+    host_sample_extract<uint64_t, AmortizedDegree<512>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  case 1024:
+    host_sample_extract<uint64_t, AmortizedDegree<1024>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  case 2048:
+    host_sample_extract<uint64_t, AmortizedDegree<2048>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  case 4096:
+    host_sample_extract<uint64_t, AmortizedDegree<4096>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  case 8192:
+    host_sample_extract<uint64_t, AmortizedDegree<8192>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  case 16384:
+    host_sample_extract<uint64_t, AmortizedDegree<16384>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  default:
+    PANIC("Cuda error: unsupported polynomial size. Supported "
+          "N's are powers of two in the interval [256..16384].")
+  }
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
index 6d5baf6191..b45353a616 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -3,6 +3,7 @@
 
 #include "ciphertext.h"
 #include "device.h"
+#include "polynomial/functions.cuh"
 #include <cstdint>
 
 template <typename Torus>
@@ -25,4 +26,39 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
   cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
 }
 
+template <typename Torus, class params>
+__global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
+                               uint32_t *nth_array, uint32_t glwe_dimension) {
+
+  const int input_id = blockIdx.x;
+
+  const int glwe_input_size = (glwe_dimension + 1) * params::degree;
+  const int lwe_output_size = glwe_dimension * params::degree + 1;
+
+  auto lwe_out = lwe_array_out + input_id * lwe_output_size;
+
+  // Consecutive blocks of polynomial_size extractions read from the same GLWE
+  uint32_t nth_per_glwe = params::degree;
+  auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size;
+
+  auto nth = nth_array[input_id];
+
+  sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
+  sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
+}
+
+template <typename Torus, class params>
+__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
+                                  Torus *lwe_array_out, Torus *glwe_array_in,
+                                  uint32_t *nth_array, uint32_t num_nths,
+                                  uint32_t glwe_dimension) {
+  cudaSetDevice(gpu_index);
+
+  dim3 grid(num_nths);
+  dim3 thds(params::degree / params::opt);
+  sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
+      lwe_array_out, glwe_array_in, nth_array, glwe_dimension);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
index b2384769ef..f94c8e5848 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -191,65 +191,55 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
 
 // Extracts the body of the nth-LWE in a GLWE.
 template <typename Torus, class params>
-__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
+__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
                                     uint32_t glwe_dimension, uint32_t nth = 0) {
-  // Set first coefficient of the accumulator as the body of the LWE sample
+  // Set the nth coefficient of the GLWE body as the body of the LWE sample
   lwe_array_out[glwe_dimension * params::degree] =
-      accumulator[glwe_dimension * params::degree + nth];
+      glwe[glwe_dimension * params::degree + nth];
 }
 
 // Extracts the mask from the nth-LWE in a GLWE.
 template <typename Torus, class params>
-__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
-                                    uint32_t num_poly = 1, uint32_t nth = 0) {
-  for (int z = 0; z < num_poly; z++) {
+__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
+                                    uint32_t glwe_dimension = 1,
+                                    uint32_t nth = 0) {
+  for (int z = 0; z < glwe_dimension; z++) {
     Torus *lwe_array_out_slice =
         (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
-    Torus *accumulator_slice =
-        (Torus *)accumulator + (ptrdiff_t)(z * params::degree);
+    Torus *glwe_slice = (Torus *)glwe + (ptrdiff_t)(z * params::degree);
 
     synchronize_threads_in_block();
 
-    // Reverse the accumulator
+    // Reverse the glwe
+    // Set ACC = -ACC
     int tid = threadIdx.x;
     Torus result[params::opt];
#pragma unroll
     for (int i = 0; i < params::opt; i++) {
-      result[i] = accumulator_slice[params::degree - tid - 1];
-      tid = tid + params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-
-    // Set ACC = -ACC
-    tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      accumulator_slice[tid] =
-          SEL(-result[i], result[i], tid >= params::degree - nth);
+      auto x = glwe_slice[params::degree - tid - 1];
+      result[i] = SEL(-x, x, tid >= params::degree - nth);
       tid = tid + params::degree / params::opt;
     }
     synchronize_threads_in_block();
 
     // Perform ACC * X
     // (equivalent to multiply_by_monomial_negacyclic_inplace(1))
+    // Copy to the mask of the LWE sample
     tid = threadIdx.x;
-    result[params::opt];
     for (int i = 0; i < params::opt; i++) {
       // if (tid < 1)
-      //   result[i] = -accumulator_slice[tid - 1 + params::degree];
+      //   result[i] = -glwe_slice[tid - 1 + params::degree];
       // else
-      //   result[i] = accumulator_slice[tid - 1];
-      int x = tid - 1 + SEL(0, params::degree - nth, tid < 1);
-      result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
+      //   result[i] = glwe_slice[tid - 1];
+      uint32_t dst_idx = tid + 1 + nth;
+      if (dst_idx == params::degree)
+        lwe_array_out_slice[0] = -result[i];
+      else {
+        dst_idx =
+            SEL(dst_idx, dst_idx - params::degree, dst_idx >= params::degree);
+        lwe_array_out_slice[dst_idx] = result[i];
+      }
 
-    // Copy to the mask of the LWE sample
-    tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      lwe_array_out_slice[tid] = result[i];
-      tid = tid + params::degree / params::opt;
+      tid += params::degree / params::opt;
     }
   }
 }
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index 10865097cd..9c1fe878cb 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -596,6 +596,16 @@ extern "C" {
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
     );
+    pub fn cuda_glwe_sample_extract_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut c_void,
+        glwe_array_in: *const c_void,
+        nth_array: *const u32,
+        num_nths: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+    );
 
     pub fn scratch_cuda_integer_radix_comparison_kb_64(
         streams: *const *mut c_void,
diff --git a/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs b/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
new file mode 100644
index 0000000000..8f6e44a815
--- /dev/null
+++ b/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
@@ -0,0 +1,60 @@
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
+use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
+use crate::core_crypto::gpu::vec::CudaVec;
+use crate::core_crypto::gpu::{extract_lwe_samples_from_glwe_ciphertext_list_async, CudaStreams};
+use crate::core_crypto::prelude::{MonomialDegree, UnsignedTorus};
+use itertools::Itertools;
+
+/// For each [`GLWE Ciphertext`](`CudaGlweCiphertextList`) given as input and each requested
+/// `MonomialDegree`, extract the corresponding coefficient as an
+/// [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is GPU-accelerated.
+pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
+    input_glwe_list: &CudaGlweCiphertextList<Scalar>,
+    output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
+    vec_nth: &[MonomialDegree],
+    streams: &CudaStreams,
+) where
+    // CastInto required for PBS modulus switch which returns a usize
+    Scalar: UnsignedTorus,
+{
+    let in_lwe_dim = input_glwe_list
+        .glwe_dimension()
+        .to_equivalent_lwe_dimension(input_glwe_list.polynomial_size());
+
+    let out_lwe_dim = output_lwe_list.lwe_dimension();
+
+    assert_eq!(
+        in_lwe_dim, out_lwe_dim,
+        "Mismatch between equivalent LweDimension of input ciphertext and output ciphertext. \
+        Got {in_lwe_dim:?} for input and {out_lwe_dim:?} for output.",
+    );
+
+    assert_eq!(
+        vec_nth.len(),
+        input_glwe_list.glwe_ciphertext_count().0 * input_glwe_list.polynomial_size().0,
+        "Mismatch between number of nths and total number of coefficients in the GLWE list.",
+    );
+
+    assert_eq!(
+        input_glwe_list.ciphertext_modulus(),
+        output_lwe_list.ciphertext_modulus(),
+        "Mismatched moduli between input_glwe ({:?}) and output_lwe ({:?})",
+        input_glwe_list.ciphertext_modulus(),
+        output_lwe_list.ciphertext_modulus()
+    );
+
+    let nth_array: Vec<u32> = vec_nth.iter().map(|x| x.0 as u32).collect_vec();
+    let gpu_indexes = &streams.gpu_indexes;
+    unsafe {
+        let d_nth_array = CudaVec::from_cpu_async(&nth_array, streams, gpu_indexes[0]);
+        extract_lwe_samples_from_glwe_ciphertext_list_async(
+            streams,
+            &mut output_lwe_list.0.d_vec,
+            &input_glwe_list.0.d_vec,
+            &d_nth_array,
+            vec_nth.len() as u32,
+            input_glwe_list.glwe_dimension(),
+            input_glwe_list.polynomial_size(),
+        );
+    }
+}
diff --git a/tfhe/src/core_crypto/gpu/algorithms/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/mod.rs
index 9dafafcbfe..ee6ddd2b1d 100644
--- a/tfhe/src/core_crypto/gpu/algorithms/mod.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/mod.rs
@@ -2,6 +2,7 @@ pub mod lwe_linear_algebra;
 pub mod lwe_multi_bit_programmable_bootstrapping;
 pub mod lwe_programmable_bootstrapping;
 
+pub mod glwe_sample_extraction;
 mod lwe_keyswitch;
 #[cfg(test)]
 mod test;
diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs b/tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs
new file mode 100644
index 0000000000..51018d352f
--- /dev/null
+++ b/tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs
@@ -0,0 +1,125 @@
+use super::*;
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
+use crate::core_crypto::gpu::glwe_sample_extraction::cuda_extract_lwe_samples_from_glwe_ciphertext_list;
+use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
+use crate::core_crypto::gpu::CudaStreams;
+use itertools::Itertools;
+
+#[cfg(not(tarpaulin))]
+const NB_TESTS: usize = 10;
+#[cfg(tarpaulin)]
+const NB_TESTS: usize = 1;
+
+fn glwe_encrypt_sample_extract_decrypt_custom_mod<Scalar: UnsignedTorus>(
+    params: ClassicTestParams<Scalar>,
+) {
+    let glwe_dimension = params.glwe_dimension;
+    let polynomial_size = params.polynomial_size;
+    let glwe_noise_distribution = params.glwe_noise_distribution;
+    let ciphertext_modulus = params.ciphertext_modulus;
+    let message_modulus_log = params.message_modulus_log;
+    let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
+
+    let mut rsc = TestResources::new();
+
+    let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
+    let delta: Scalar = encoding_with_padding / msg_modulus;
+
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(gpu_index);
+
+    let mut msgs = vec![];
+
+    // Build msg
+    // TODO: Can't we collect from (0..msg_modulus) if msg_modulus is Scalar?
+    let mut msg = msg_modulus;
+    msg = msg.wrapping_sub(Scalar::ONE);
+    while msg != Scalar::ZERO {
+        msgs.push(msg);
+        msg = msg.wrapping_sub(Scalar::ONE);
+    }
+
+    // Run tests
+    for _ in 0..NB_TESTS {
+        let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
+            glwe_dimension,
+            polynomial_size,
+            &mut rsc.secret_random_generator,
+        );
+
+        let equivalent_lwe_sk = glwe_sk.clone().into_lwe_secret_key();
+
+        let mut glwe_list = GlweCiphertextList::new(
+            Scalar::ZERO,
+            glwe_dimension.to_glwe_size(),
+            polynomial_size,
+            GlweCiphertextCount(msgs.len()),
+            ciphertext_modulus,
+        );
+
+        let cleartext_list = msgs
+            .iter()
+            .flat_map(|&x| vec![x * delta; glwe_list.polynomial_size().0])
+            .collect_vec();
+
+        let plaintext_list = PlaintextList::from_container(cleartext_list);
+        encrypt_glwe_ciphertext_list(
+            &glwe_sk,
+            &mut glwe_list,
+            &plaintext_list,
+            glwe_noise_distribution,
+            &mut rsc.encryption_random_generator,
+        );
+
+        let input_cuda_glwe_list =
+            CudaGlweCiphertextList::from_glwe_ciphertext_list(&glwe_list, &streams);
+
+        let mut output_cuda_lwe_ciphertext_list = CudaLweCiphertextList::new(
+            equivalent_lwe_sk.lwe_dimension(),
+            LweCiphertextCount(msgs.len() * glwe_list.polynomial_size().0),
+            ciphertext_modulus,
+            &streams,
+        );
+
+        let nths = (0..(msgs.len() * glwe_list.polynomial_size().0))
+            .map(|x| MonomialDegree(x % glwe_list.polynomial_size().0))
+            .collect_vec();
+
+        cuda_extract_lwe_samples_from_glwe_ciphertext_list(
+            &input_cuda_glwe_list,
+            &mut output_cuda_lwe_ciphertext_list,
+            nths.as_slice(),
+            &streams,
+        );
+
+        let gpu_output_lwe_ciphertext_list =
+            output_cuda_lwe_ciphertext_list.to_lwe_ciphertext_list(&streams);
+
+        let mut output_plaintext_list = PlaintextList::new(
+            Scalar::ZERO,
+            PlaintextCount(gpu_output_lwe_ciphertext_list.lwe_ciphertext_count().0),
+        );
+
+        decrypt_lwe_ciphertext_list(
+            &equivalent_lwe_sk,
+            &gpu_output_lwe_ciphertext_list,
+            &mut output_plaintext_list,
+        );
+
+        let mut decoded = vec![Scalar::ZERO; plaintext_list.plaintext_count().0];
+
+        decoded
+            .iter_mut()
+            .zip(output_plaintext_list.iter())
+            .for_each(|(dst, src)| *dst = round_decode(*src.0, delta) % msg_modulus);
+
+        let mut count = msg_modulus;
+        count = count.wrapping_sub(Scalar::ONE);
+        for result in decoded.chunks_exact(glwe_list.polynomial_size().0) {
+            assert!(result.iter().all(|&x| x == count));
+            count = count.wrapping_sub(Scalar::ONE);
+        }
+    }
+}
+
+create_gpu_parametrized_test!(glwe_encrypt_sample_extract_decrypt_custom_mod);
diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs
index 3b30f96b1c..59ba20af18 100644
--- a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs
@@ -1,5 +1,6 @@
 use crate::core_crypto::algorithms::test::*;
 
+mod glwe_sample_extraction;
 mod lwe_keyswitch;
 mod lwe_linear_algebra;
 mod lwe_multi_bit_programmable_bootstrapping;
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index a52ff80d09..3341ba6e5e 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -326,6 +326,32 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async(
+
+/// # Safety
+///
+/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
+/// required
+pub unsafe fn extract_lwe_samples_from_glwe_ciphertext_list_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    lwe_array_out: &mut CudaVec<T>,
+    glwe_array_in: &CudaVec<T>,
+    nth_array: &CudaVec<u32>,
+    num_nths: u32,
+    glwe_dimension: GlweDimension,
+    polynomial_size: PolynomialSize,
+) {
+    cuda_glwe_sample_extract_64(
+        streams.ptr[0],
+        streams.gpu_indexes[0],
+        lwe_array_out.as_mut_c_ptr(0),
+        glwe_array_in.as_c_ptr(0),
+        nth_array.as_c_ptr(0).cast::<u32>(),
+        num_nths,
+        glwe_dimension.0 as u32,
+        polynomial_size.0 as u32,
+    );
+}
+
 /// Discarding addition of a vector of LWE ciphertexts
 ///
 /// # Safety
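
For reference, the coefficient mapping implemented by sample_extract_mask and
sample_extract_body in this patch can be written out on the CPU as the sketch
below. This is illustrative only and not part of the patch: the function name
and the flat [A_0 | ... | A_{k-1} | B] container layout are assumptions, with
u64 wrapping arithmetic standing in for torus arithmetic.

    /// Extract, from one GLWE (k mask polynomials of size n followed by the body),
    /// the LWE encrypting the nth coefficient of the message polynomial.
    fn sample_extract_reference(glwe: &[u64], k: usize, n: usize, nth: usize) -> Vec<u64> {
        let mut lwe = vec![0u64; k * n + 1];
        for z in 0..k {
            let a = &glwe[z * n..(z + 1) * n];
            for i in 0..n {
                // a'[z*n + i] = A_z[nth - i] for i <= nth, -A_z[n + nth - i] otherwise
                lwe[z * n + i] = if i <= nth {
                    a[nth - i]
                } else {
                    a[n + nth - i].wrapping_neg()
                };
            }
        }
        // b' = B[nth]: the nth coefficient of the GLWE body
        lwe[k * n] = glwe[k * n + nth];
        lwe
    }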