diff --git a/backends/tfhe-cuda-backend/cuda/include/functions.h b/backends/tfhe-cuda-backend/cuda/include/functions.h
new file mode 100644
index 0000000000..3ee0a11430
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/include/functions.h
@@ -0,0 +1,16 @@
+#ifndef CUDA_FUNCTIONS_H_
+#define CUDA_FUNCTIONS_H_
+
+#include "polynomial/functions.cuh"
+#include "polynomial/parameters.cuh"
+#include <stdint.h>
+
+extern "C" {
+void cuda_glwe_sample_extract_64(void **streams, uint32_t *gpu_indexes,
+                                 uint32_t gpu_count, void *lwe_array_out,
+                                 void *glwe_array_in, uint32_t *nth_array,
+                                 uint32_t num_samples, uint32_t glwe_dimension,
+                                 uint32_t polynomial_size);
+}
+
+#endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
index 177892e5a9..9fc4ad1d8a 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -1,6 +1,7 @@
 #ifndef CNCRT_TORUS_CUH
 #define CNCRT_TORUS_CUH
 
+#include "device.h"
 #include "types/int128.cuh"
 #include <limits>
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cu b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cu
new file mode 100644
index 0000000000..38aedf8f67
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cu
@@ -0,0 +1,56 @@
+#include "functions.h"
+
+void cuda_glwe_sample_extract_64(void **streams, uint32_t *gpu_indexes,
+                                 uint32_t gpu_count, void *lwe_array_out,
+                                 void *glwe_in, uint32_t *nth_array,
+                                 uint32_t num_samples, uint32_t glwe_dimension,
+                                 uint32_t polynomial_size) {
+
+  switch (polynomial_size) {
+  case 256:
+    host_sample_extract<uint64_t, AmortizedDegree<256>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  case 512:
+    host_sample_extract<uint64_t, AmortizedDegree<512>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  case 1024:
+    host_sample_extract<uint64_t, AmortizedDegree<1024>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  case 2048:
+    host_sample_extract<uint64_t, AmortizedDegree<2048>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  case 4096:
+    host_sample_extract<uint64_t, AmortizedDegree<4096>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  case 8192:
+    host_sample_extract<uint64_t, AmortizedDegree<8192>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  case 16384:
+    host_sample_extract<uint64_t, AmortizedDegree<16384>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  default:
+    PANIC("Cuda error: unsupported polynomial size. Supported "
+          "N's are powers of two in the interval [256..16384].")
+  }
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
index b2384769ef..1fa5a61b39 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -191,67 +191,86 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
 
 // Extracts the body of the nth-LWE in a GLWE.
 template <typename Torus, class params>
-__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
+__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
                                     uint32_t glwe_dimension, uint32_t nth = 0) {
-  // Set first coefficient of the accumulator as the body of the LWE sample
+  // Set first coefficient of the glwe as the body of the LWE sample
   lwe_array_out[glwe_dimension * params::degree] =
-      accumulator[glwe_dimension * params::degree + nth];
+      glwe[glwe_dimension * params::degree + nth];
 }
 
 // Extracts the mask from the nth-LWE in a GLWE.
 template <typename Torus, class params>
-__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
-                                    uint32_t num_poly = 1, uint32_t nth = 0) {
-  for (int z = 0; z < num_poly; z++) {
+__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
+                                    uint32_t glwe_dimension = 1,
+                                    uint32_t nth = 0) {
+  for (int z = 0; z < glwe_dimension; z++) {
     Torus *lwe_array_out_slice =
         (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
-    Torus *accumulator_slice =
-        (Torus *)accumulator + (ptrdiff_t)(z * params::degree);
+    Torus *glwe_slice = (Torus *)glwe + (ptrdiff_t)(z * params::degree);
 
     synchronize_threads_in_block();
 
-    // Reverse the accumulator
+    // Reverse the glwe
+    // Set ACC = -ACC
     int tid = threadIdx.x;
     Torus result[params::opt];
#pragma unroll
     for (int i = 0; i < params::opt; i++) {
-      result[i] = accumulator_slice[params::degree - tid - 1];
-      tid = tid + params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-
-    // Set ACC = -ACC
-    tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      accumulator_slice[tid] =
-          SEL(-result[i], result[i], tid >= params::degree - nth);
+      auto x = glwe_slice[params::degree - tid - 1];
+      result[i] = SEL(-x, x, tid >= params::degree - nth);
       tid = tid + params::degree / params::opt;
     }
     synchronize_threads_in_block();
 
     // Perform ACC * X
     // (equivalent to multiply_by_monomial_negacyclic_inplace(1))
+    // Copy to the mask of the LWE sample
     tid = threadIdx.x;
-    result[params::opt];
     for (int i = 0; i < params::opt; i++) {
       // if (tid < 1)
-      //   result[i] = -accumulator_slice[tid - 1 + params::degree];
+      //   result[i] = -glwe_slice[tid - 1 + params::degree];
       // else
-      //   result[i] = accumulator_slice[tid - 1];
-      int x = tid - 1 + SEL(0, params::degree - nth, tid < 1);
-      result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
+      //   result[i] = glwe_slice[tid - 1];
+      uint32_t dst_idx = tid + 1 + nth;
+      if (dst_idx == params::degree)
+        lwe_array_out_slice[0] = -result[i];
+      else {
+        dst_idx =
+            SEL(dst_idx, dst_idx - params::degree, dst_idx >= params::degree);
+        lwe_array_out_slice[dst_idx] = result[i];
+      }
 
-    // Copy to the mask of the LWE sample
-    tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      lwe_array_out_slice[tid] = result[i];
-      tid = tid + params::degree / params::opt;
+      tid += params::degree / params::opt;
     }
   }
 }
 
+template <typename Torus, class params>
+__global__ void apply_sample_extract(Torus *lwe_array_out, Torus *glwe_in,
+                                     uint32_t *nth_array,
+                                     uint32_t glwe_dimension) {
+
+  const int input_id = blockIdx.x;
+
+  const int lwe_output_size = glwe_dimension * params::degree + 1;
+
+  auto lwe_out = lwe_array_out + input_id * lwe_output_size;
+  auto nth = nth_array[input_id];
+
+  sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
+  sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
+}
+
+template <typename Torus, class params>
+__host__ void host_sample_extract(cudaStream_t *streams, Torus *lwe_array_out,
+                                  Torus *glwe_in, uint32_t *nth_array,
+                                  uint32_t num_samples,
+                                  uint32_t glwe_dimension) {
+
+  dim3 grid(num_samples);
+  dim3 thds(params::degree / params::opt);
+  apply_sample_extract<Torus, params><<<grid, thds, 0, streams[0]>>>(
+      lwe_array_out, glwe_in, nth_array, glwe_dimension);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif
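Note (not part of the diff): the index arithmetic in the rewritten `sample_extract_mask` — reversal of each slice, selective negation, then the `dst_idx = tid + 1 + nth` rotation with a sign flip at the wrap-around — realizes the standard negacyclic sample-extraction mapping. Writing the input GLWE over $\mathbb{Z}_q[X]/(X^N + 1)$ as mask polynomials $A_z$ and body $B$, extraction at monomial degree $n$ produces the LWE ciphertext

$$
b = B_n, \qquad
a_{zN + j} =
\begin{cases}
  A_{z,\,n-j} & \text{if } 0 \le j \le n,\\
  -A_{z,\,N+n-j} & \text{if } n < j < N,
\end{cases}
\qquad z = 0, \dots, k-1.
$$

Compared to the old version, each output coefficient is now written directly to `lwe_array_out_slice` instead of being staged in the accumulator, which drops two block synchronizations per slice and, importantly, no longer mutates the input GLWE in place.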
diff --git a/backends/tfhe-cuda-backend/cuda/src/types/int128.cuh b/backends/tfhe-cuda-backend/cuda/src/types/int128.cuh
index ae925fadb2..fd7e4921ab 100644
--- a/backends/tfhe-cuda-backend/cuda/src/types/int128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/types/int128.cuh
@@ -1,6 +1,7 @@
 #ifndef CNCRT_INT128_CUH
 #define CNCRT_INT128_CUH
 
+#include <cstdint>
 
 // abseil's int128 type
 // licensed under Apache license
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index 322be12f2a..a0f0d317d5 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -628,6 +628,17 @@ extern "C" {
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
     );
+    pub fn cuda_glwe_sample_extract_64(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut c_void,
+        glwe_in: *const c_void,
+        nth_array: *const u32,
+        num_samples: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+    );
 
     pub fn scratch_cuda_integer_radix_comparison_kb_64(
         streams: *const *mut c_void,
diff --git a/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs b/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
new file mode 100644
index 0000000000..d498888ba5
--- /dev/null
+++ b/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
@@ -0,0 +1,52 @@
+use crate::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext;
+use crate::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext;
+use crate::core_crypto::gpu::vec::CudaVec;
+use crate::core_crypto::gpu::{extract_lwe_sample_from_glwe_ciphertext_async, CudaStreams};
+use crate::core_crypto::prelude::{LweCiphertextCount, MonomialDegree, UnsignedTorus};
+
+/// Extract the nth coefficient from the body of a [`GLWE Ciphertext`](`CudaGlweCiphertext`) as an
+/// [`LWE ciphertext`](`CudaLweCiphertext`). This variant is GPU-accelerated.
+pub fn cuda_extract_lwe_sample_from_glwe_ciphertext<Scalar>(
+    input_glwe: &CudaGlweCiphertext<Scalar>,
+    output_lwe: &mut CudaLweCiphertext<Scalar>,
+    nth: MonomialDegree,
+    streams: &CudaStreams,
+) where
+    // CastInto required for PBS modulus switch which returns a usize
+    Scalar: UnsignedTorus,
+{
+    let in_lwe_dim = input_glwe
+        .glwe_dimension()
+        .to_equivalent_lwe_dimension(input_glwe.polynomial_size());
+
+    let out_lwe_dim = output_lwe.lwe_dimension();
+
+    assert_eq!(
+        in_lwe_dim, out_lwe_dim,
+        "Mismatch between equivalent LweDimension of input ciphertext and output ciphertext. \
+        Got {in_lwe_dim:?} for input and {out_lwe_dim:?} for output.",
+    );
+
+    assert_eq!(
+        input_glwe.ciphertext_modulus(),
+        output_lwe.ciphertext_modulus(),
+        "Mismatched moduli between input_glwe ({:?}) and output_lwe ({:?})",
+        input_glwe.ciphertext_modulus(),
+        output_lwe.ciphertext_modulus()
+    );
+
+    let nth_array: Vec<u32> = vec![nth.0 as u32];
+    let gpu_indexes = &streams.gpu_indexes;
+    unsafe {
+        let d_nth_array = CudaVec::from_cpu_async(&nth_array, streams, gpu_indexes[0]);
+        extract_lwe_sample_from_glwe_ciphertext_async(
+            streams,
+            &mut output_lwe.0.d_vec,
+            &input_glwe.0.d_vec,
+            &d_nth_array,
+            LweCiphertextCount(nth_array.len()),
+            input_glwe.glwe_dimension(),
+            input_glwe.polynomial_size(),
+        );
+    }
+}
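A minimal usage sketch for the new entry point (not part of the diff; `extract_constant_term` is a hypothetical helper name, and the import paths follow the ones used in the test further down):

```rust
use tfhe::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext;
use tfhe::core_crypto::gpu::glwe_sample_extraction::cuda_extract_lwe_sample_from_glwe_ciphertext;
use tfhe::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext;
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::core_crypto::prelude::*;

fn extract_constant_term(
    glwe: &GlweCiphertext<Vec<u64>>,
    streams: &CudaStreams,
) -> LweCiphertext<Vec<u64>> {
    // Move the GLWE ciphertext to the GPU.
    let d_glwe = CudaGlweCiphertext::from_glwe_ciphertext(glwe, streams);

    // The output LWE dimension is the equivalent dimension k * N of the input.
    let mut d_lwe = CudaLweCiphertext::new(
        glwe.glwe_size()
            .to_glwe_dimension()
            .to_equivalent_lwe_dimension(glwe.polynomial_size()),
        glwe.ciphertext_modulus(),
        streams,
    );

    // Extract the coefficient of degree 0 and bring the result back to the CPU.
    cuda_extract_lwe_sample_from_glwe_ciphertext(&d_glwe, &mut d_lwe, MonomialDegree(0), streams);
    d_lwe.into_lwe_ciphertext(streams)
}
```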
diff --git a/tfhe/src/core_crypto/gpu/algorithms/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/mod.rs
index 9dafafcbfe..ee6ddd2b1d 100644
--- a/tfhe/src/core_crypto/gpu/algorithms/mod.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/mod.rs
@@ -2,6 +2,7 @@ pub mod lwe_linear_algebra;
 pub mod lwe_multi_bit_programmable_bootstrapping;
 pub mod lwe_programmable_bootstrapping;
 
+pub mod glwe_sample_extraction;
 mod lwe_keyswitch;
 #[cfg(test)]
 mod test;
diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs b/tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs
new file mode 100644
index 0000000000..2590fae4f8
--- /dev/null
+++ b/tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs
@@ -0,0 +1,143 @@
+use super::*;
+use crate::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext;
+use crate::core_crypto::gpu::glwe_sample_extraction::cuda_extract_lwe_sample_from_glwe_ciphertext;
+use crate::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext;
+use crate::core_crypto::gpu::CudaStreams;
+
+#[cfg(not(tarpaulin))]
+const NB_TESTS: usize = 10;
+#[cfg(tarpaulin)]
+const NB_TESTS: usize = 1;
+
+fn glwe_encrypt_sample_extract_decrypt_custom_mod<Scalar: UnsignedTorus>(
+    params: ClassicTestParams<Scalar>,
+) {
+    let glwe_dimension = params.glwe_dimension;
+    let polynomial_size = params.polynomial_size;
+    let glwe_noise_distribution = params.glwe_noise_distribution;
+    let ciphertext_modulus = params.ciphertext_modulus;
+    let message_modulus_log = params.message_modulus_log;
+    let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
+
+    let mut rsc = TestResources::new();
+
+    let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
+    let mut msg = msg_modulus;
+    let delta: Scalar = encoding_with_padding / msg_modulus;
+
+    let gpu_index = 0;
+    let stream = CudaStreams::new_single_gpu(gpu_index);
+
+    while msg != Scalar::ZERO {
+        msg = msg.wrapping_sub(Scalar::ONE);
+        for _ in 0..NB_TESTS {
+            let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
+                glwe_dimension,
+                polynomial_size,
+                &mut rsc.secret_random_generator,
+            );
+
+            let equivalent_lwe_sk = glwe_sk.clone().into_lwe_secret_key();
+
+            let mut glwe = GlweCiphertext::new(
+                Scalar::ZERO,
+                glwe_dimension.to_glwe_size(),
+                polynomial_size,
+                ciphertext_modulus,
+            );
+
+            let plaintext_list =
+                PlaintextList::new(msg * delta, PlaintextCount(glwe.polynomial_size().0));
+
+            encrypt_glwe_ciphertext(
+                &glwe_sk,
+                &mut glwe,
+                &plaintext_list,
+                glwe_noise_distribution,
+                &mut rsc.encryption_random_generator,
+            );
+
+            let d_glwe = CudaGlweCiphertext::from_glwe_ciphertext(&glwe, &stream);
+
+            assert!(check_encrypted_content_respects_mod(
+                &glwe,
+                ciphertext_modulus
+            ));
+
+            let mut vec_output_lwe_ciphertext_list = vec![];
+
+            for idx in 0..glwe.polynomial_size().0 {
+                let mut d_output_lwe_ciphertext = CudaLweCiphertext::new(
+                    equivalent_lwe_sk.lwe_dimension(),
+                    ciphertext_modulus,
+                    &stream,
+                );
+                cuda_extract_lwe_sample_from_glwe_ciphertext(
+                    &d_glwe,
+                    &mut d_output_lwe_ciphertext,
+                    MonomialDegree(idx),
+                    &stream,
+                );
+                vec_output_lwe_ciphertext_list.extend(
+                    d_output_lwe_ciphertext
+                        .into_lwe_ciphertext(&stream)
+                        .into_container(),
+                );
+            }
+
+            let gpu_output_lwe_ciphertext_list = LweCiphertextList::from_container(
+                vec_output_lwe_ciphertext_list,
+                equivalent_lwe_sk.lwe_dimension().to_lwe_size(),
+                ciphertext_modulus,
+            );
+
+            assert!(check_encrypted_content_respects_mod(
+                &gpu_output_lwe_ciphertext_list,
+                ciphertext_modulus
+            ));
+
+            let mut cpu_output_lwe_ciphertext_list = LweCiphertextList::new(
+                Scalar::ZERO,
+                equivalent_lwe_sk.lwe_dimension().to_lwe_size(),
+                LweCiphertextCount(glwe.polynomial_size().0),
+                ciphertext_modulus,
+            );
+
+            for (idx, mut output_lwe_ciphertext) in
+                cpu_output_lwe_ciphertext_list.iter_mut().enumerate()
+            {
+                extract_lwe_sample_from_glwe_ciphertext(
+                    &glwe,
+                    &mut output_lwe_ciphertext,
+                    MonomialDegree(idx),
+                );
+            }
+
+            let mut plaintext_list = PlaintextList::new(
+                Scalar::ZERO,
+                PlaintextCount(gpu_output_lwe_ciphertext_list.lwe_ciphertext_count().0),
+            );
+
+            decrypt_lwe_ciphertext_list(
+                &equivalent_lwe_sk,
+                &gpu_output_lwe_ciphertext_list,
+                &mut plaintext_list,
+            );
+
+            let mut decoded = vec![Scalar::ZERO; plaintext_list.plaintext_count().0];
+
+            decoded
+                .iter_mut()
+                .zip(plaintext_list.iter())
+                .for_each(|(dst, src)| *dst = round_decode(*src.0, delta) % msg_modulus);
+
+            assert!(decoded.iter().all(|&x| x == msg));
+        }
+
+        // In coverage, we break after one while loop iteration, changing message values does not
+        // yield higher coverage
+        #[cfg(tarpaulin)]
+        break;
+    }
+}
+
+create_gpu_parametrized_test!(glwe_encrypt_sample_extract_decrypt_custom_mod);
diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs
index 3b30f96b1c..59ba20af18 100644
--- a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs
@@ -1,5 +1,6 @@
 use crate::core_crypto::algorithms::test::*;
 
+mod glwe_sample_extraction;
 mod lwe_keyswitch;
 mod lwe_linear_algebra;
 mod lwe_multi_bit_programmable_bootstrapping;
diff --git a/tfhe/src/core_crypto/gpu/entities/glwe_ciphertext.rs b/tfhe/src/core_crypto/gpu/entities/glwe_ciphertext.rs
new file mode 100644
index 0000000000..fd2a9143c6
--- /dev/null
+++ b/tfhe/src/core_crypto/gpu/entities/glwe_ciphertext.rs
@@ -0,0 +1,97 @@
+use crate::core_crypto::gpu::vec::CudaVec;
+use crate::core_crypto::gpu::{CudaGlweList, CudaStreams};
+use crate::core_crypto::prelude::{
+    glwe_ciphertext_size, CiphertextModulus, Container, GlweCiphertext, GlweCiphertextCount,
+    GlweDimension, PolynomialSize, UnsignedInteger,
+};
+
+/// A structure representing a GLWE ciphertext with 64 bits of precision on the GPU.
+#[derive(Debug)]
+pub struct CudaGlweCiphertext<T: UnsignedInteger>(pub(crate) CudaGlweList<T>);
+
+#[allow(dead_code)]
+impl<T: UnsignedInteger> CudaGlweCiphertext<T> {
+    pub fn new(
+        glwe_dimension: GlweDimension,
+        polynomial_size: PolynomialSize,
+        ciphertext_modulus: CiphertextModulus<T>,
+        streams: &CudaStreams,
+    ) -> Self {
+        // Allocate memory on the device
+        let d_vec = CudaVec::new(
+            glwe_ciphertext_size(glwe_dimension.to_glwe_size(), polynomial_size),
+            streams,
+            0,
+        );
+        let cuda_glwe_list = CudaGlweList {
+            d_vec,
+            glwe_ciphertext_count: GlweCiphertextCount(1),
+            glwe_dimension,
+            polynomial_size,
+            ciphertext_modulus,
+        };
+
+        Self(cuda_glwe_list)
+    }
+
+    pub(crate) fn to_glwe_ciphertext(&self, streams: &CudaStreams) -> GlweCiphertext<Vec<T>> {
+        let glwe_ct_size =
+            glwe_ciphertext_size(self.0.glwe_dimension.to_glwe_size(), self.0.polynomial_size);
+        let mut container: Vec<T> = vec![T::ZERO; glwe_ct_size];
+
+        unsafe {
+            self.0
+                .d_vec
+                .copy_to_cpu_async(container.as_mut_slice(), streams, 0);
+            streams.synchronize();
+        }
+
+        GlweCiphertext::from_container(
+            container,
+            self.polynomial_size(),
+            self.ciphertext_modulus(),
+        )
+    }
+
+    pub fn from_glwe_ciphertext<C: Container<Element = T>>(
+        h_ct: &GlweCiphertext<C>,
+        streams: &CudaStreams,
+    ) -> Self {
+        let glwe_dimension = h_ct.glwe_size().to_glwe_dimension();
+        let glwe_ciphertext_count = GlweCiphertextCount(1);
+        let polynomial_size = h_ct.polynomial_size();
+        let ciphertext_modulus = h_ct.ciphertext_modulus();
+
+        let mut d_vec = CudaVec::new(
+            glwe_ciphertext_size(glwe_dimension.to_glwe_size(), polynomial_size)
+                * glwe_ciphertext_count.0,
+            streams,
+            0,
+        );
+
+        // Copy to the GPU
+        let h_input = h_ct.as_view().into_container();
+        unsafe {
+            d_vec.copy_from_cpu_async(h_input.as_ref(), streams, 0);
+        }
+        streams.synchronize();
+
+        let cuda_glwe_list = CudaGlweList {
+            d_vec,
+            glwe_ciphertext_count,
+            glwe_dimension,
+            polynomial_size,
+            ciphertext_modulus,
+        };
+
+        Self(cuda_glwe_list)
+    }
+
+    pub(crate) fn glwe_dimension(&self) -> GlweDimension {
+        self.0.glwe_dimension
+    }
+
+    pub(crate) fn polynomial_size(&self) -> PolynomialSize {
+        self.0.polynomial_size
+    }
+
+    pub(crate) fn ciphertext_modulus(&self) -> CiphertextModulus<T> {
+        self.0.ciphertext_modulus
+    }
+}
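A round-trip sketch for the new entity (not part of the diff; since `to_glwe_ciphertext` is `pub(crate)`, this only compiles inside the crate, e.g. as a unit test, and the parameter values are illustrative):

```rust
use crate::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext;
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::{CiphertextModulus, GlweCiphertext, GlweSize, PolynomialSize};

#[test]
fn glwe_gpu_round_trip() {
    let streams = CudaStreams::new_single_gpu(0);

    // A trivial (all-zero) GLWE ciphertext with k = 2, N = 1024.
    let glwe = GlweCiphertext::new(
        0u64,
        GlweSize(3),
        PolynomialSize(1024),
        CiphertextModulus::new_native(),
    );

    // CPU -> GPU -> CPU must be lossless.
    let d_glwe = CudaGlweCiphertext::from_glwe_ciphertext(&glwe, &streams);
    let round_tripped = d_glwe.to_glwe_ciphertext(&streams);
    assert_eq!(glwe, round_tripped);
}
```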
diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext.rs b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext.rs
new file mode 100644
index 0000000000..969062a687
--- /dev/null
+++ b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext.rs
@@ -0,0 +1,147 @@
+use crate::core_crypto::gpu::vec::CudaVec;
+use crate::core_crypto::gpu::{CudaLweList, CudaStreams};
+use crate::core_crypto::prelude::{
+    CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList,
+    LweDimension, LweSize, UnsignedInteger,
+};
+
+/// A structure representing an LWE ciphertext with 64 bits of precision on the GPU.
+#[derive(Debug)]
+pub struct CudaLweCiphertext<T: UnsignedInteger>(pub(crate) CudaLweList<T>);
+
+#[allow(dead_code)]
+impl<T: UnsignedInteger> CudaLweCiphertext<T> {
+    pub fn new(
+        lwe_dimension: LweDimension,
+        ciphertext_modulus: CiphertextModulus<T>,
+        streams: &CudaStreams,
+    ) -> Self {
+        // Allocate memory on the GPU
+        let d_vec = unsafe { CudaVec::new_async(lwe_dimension.to_lwe_size().0, streams, 0) };
+        streams.synchronize();
+
+        let cuda_lwe_list = CudaLweList {
+            d_vec,
+            lwe_ciphertext_count: LweCiphertextCount(1),
+            lwe_dimension,
+            ciphertext_modulus,
+        };
+
+        Self(cuda_lwe_list)
+    }
+
+    pub fn from_cuda_vec(d_vec: CudaVec<T>, ciphertext_modulus: CiphertextModulus<T>) -> Self {
+        let lwe_dimension = LweSize(d_vec.len()).to_lwe_dimension();
+        let cuda_lwe_list = CudaLweList {
+            d_vec,
+            lwe_ciphertext_count: LweCiphertextCount(1),
+            lwe_dimension,
+            ciphertext_modulus,
+        };
+        Self(cuda_lwe_list)
+    }
+
+    pub fn to_lwe_ciphertext_list(&self, streams: &CudaStreams) -> LweCiphertextList<Vec<T>> {
+        let lwe_ct_size = self.0.lwe_dimension.to_lwe_size().0;
+        let mut container: Vec<T> = vec![T::ZERO; lwe_ct_size];
+
+        unsafe {
+            self.0
+                .d_vec
+                .copy_to_cpu_async(container.as_mut_slice(), streams, 0);
+        }
+        streams.synchronize();
+
+        LweCiphertextList::from_container(
+            container,
+            self.lwe_dimension().to_lwe_size(),
+            self.ciphertext_modulus(),
+        )
+    }
+
+    pub fn from_lwe_ciphertext<C: Container<Element = T>>(
+        h_ct: &LweCiphertext<C>,
+        streams: &CudaStreams,
+    ) -> Self {
+        let lwe_dimension = h_ct.lwe_size().to_lwe_dimension();
+        let lwe_ciphertext_count = LweCiphertextCount(1);
+        let ciphertext_modulus = h_ct.ciphertext_modulus();
+
+        // Copy to the GPU
+        let mut d_vec = CudaVec::new(lwe_dimension.to_lwe_size().0, streams, 0);
+        unsafe {
+            d_vec.copy_from_cpu_async(h_ct.as_ref(), streams, 0);
+        }
+        streams.synchronize();
+
+        let cuda_lwe_list = CudaLweList {
+            d_vec,
+            lwe_ciphertext_count,
+            lwe_dimension,
+            ciphertext_modulus,
+        };
+        Self(cuda_lwe_list)
+    }
+
+    pub fn into_lwe_ciphertext(&self, streams: &CudaStreams) -> LweCiphertext<Vec<T>> {
+        let lwe_ct_size = self.0.lwe_dimension.to_lwe_size().0;
+        let mut container: Vec<T> = vec![T::ZERO; lwe_ct_size];
+
+        unsafe {
+            self.0
+                .d_vec
+                .copy_to_cpu_async(container.as_mut_slice(), streams, 0);
+        }
+        streams.synchronize();
+
+        LweCiphertext::from_container(container, self.ciphertext_modulus())
+    }
+
+    /// ```rust
+    /// use tfhe::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext;
+    /// use tfhe::core_crypto::gpu::CudaStreams;
+    /// use tfhe::core_crypto::prelude::{CiphertextModulus, LweCiphertext, LweSize};
+    ///
+    /// let streams = CudaStreams::new_single_gpu(0);
+    ///
+    /// let lwe_size = LweSize(743);
+    /// let ciphertext_modulus = CiphertextModulus::new_native();
+    ///
+    /// // Create a new LweCiphertext
+    /// let lwe = LweCiphertext::new(0u64, lwe_size, ciphertext_modulus);
+    ///
+    /// // Copy to the GPU, duplicate on the GPU, then copy back
+    /// let d_lwe = CudaLweCiphertext::from_lwe_ciphertext(&lwe, &streams);
+    /// let d_lwe_copied = d_lwe.duplicate(&streams);
+    ///
+    /// let lwe_copied = d_lwe_copied.into_lwe_ciphertext(&streams);
+    ///
+    /// assert_eq!(lwe, lwe_copied);
+    /// ```
+    pub fn duplicate(&self, streams: &CudaStreams) -> Self {
+        let lwe_dimension = self.lwe_dimension();
+        let ciphertext_modulus = self.ciphertext_modulus();
+
+        // Copy to the GPU
+        let d_vec = unsafe { self.0.d_vec.duplicate(streams, 0) };
+
+        let cuda_lwe_list = CudaLweList {
+            d_vec,
+            lwe_ciphertext_count: LweCiphertextCount(1),
+            lwe_dimension,
+            ciphertext_modulus,
+        };
+        Self(cuda_lwe_list)
+    }
+
+    pub(crate) fn lwe_dimension(&self) -> LweDimension {
+        self.0.lwe_dimension
+    }
+
+    pub(crate) fn ciphertext_modulus(&self) -> CiphertextModulus<T> {
+        self.0.ciphertext_modulus
+    }
+}
diff --git a/tfhe/src/core_crypto/gpu/entities/mod.rs b/tfhe/src/core_crypto/gpu/entities/mod.rs
index 34dcb5f03e..38a3e27bab 100644
--- a/tfhe/src/core_crypto/gpu/entities/mod.rs
+++ b/tfhe/src/core_crypto/gpu/entities/mod.rs
@@ -1,5 +1,7 @@
+pub mod glwe_ciphertext;
 pub mod glwe_ciphertext_list;
 pub mod lwe_bootstrap_key;
+pub mod lwe_ciphertext;
 pub mod lwe_ciphertext_list;
 pub mod lwe_keyswitch_key;
 pub mod lwe_multi_bit_bootstrap_key;
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 5a54d2a569..22edcfba73 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -341,6 +341,33 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async(
     );
 }
 
+/// Extract an LWE sample from a GLWE ciphertext
+///
+/// # Safety
+///
+/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is required
+pub unsafe fn extract_lwe_sample_from_glwe_ciphertext_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    lwe_array_out: &mut CudaVec<T>,
+    glwe_array_in: &CudaVec<T>,
+    nth_array: &CudaVec<u32>,
+    num_samples: LweCiphertextCount,
+    glwe_dimension: GlweDimension,
+    polynomial_size: PolynomialSize,
+) {
+    cuda_glwe_sample_extract_64(
+        streams.ptr.as_ptr(),
+        streams.gpu_indexes.as_ptr(),
+        streams.len() as u32,
+        lwe_array_out.as_mut_c_ptr(0),
+        glwe_array_in.as_c_ptr(0),
+        nth_array.as_c_ptr(0) as *const u32,
+        num_samples.0 as u32,
+        glwe_dimension.0 as u32,
+        polynomial_size.0 as u32,
+    );
+}
+
 /// Discarding addition of a vector of LWE ciphertexts
 ///
 /// # Safety
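As with the other `_async` bindings in this module, the call above only enqueues work on the stream, so every device buffer must stay alive until the stream is synchronized. A sketch of the raw calling pattern (not part of the diff; it is essentially what the safe wrapper in `glwe_sample_extraction.rs` does, with `d_lwe_vec`, `d_glwe_vec`, `glwe_dimension`, and `polynomial_size` as assumed context):

```rust
// Assumed context: `d_lwe_vec` and `d_glwe_vec` are CudaVec<u64> buffers sized
// for one LWE / one GLWE ciphertext, and `streams` is a CudaStreams.
let nth_array: Vec<u32> = vec![0];
unsafe {
    let d_nth_array = CudaVec::from_cpu_async(&nth_array, &streams, streams.gpu_indexes[0]);
    extract_lwe_sample_from_glwe_ciphertext_async(
        &streams,
        &mut d_lwe_vec,
        &d_glwe_vec,
        &d_nth_array,
        LweCiphertextCount(1),
        glwe_dimension,
        polynomial_size,
    );
    // The call only enqueues the kernel: synchronize before reading
    // `d_lwe_vec` and before `d_nth_array` goes out of scope.
    streams.synchronize();
}
```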
diff --git a/tfhe/src/core_crypto/gpu/vec.rs b/tfhe/src/core_crypto/gpu/vec.rs
index e86195a9c3..bc9cd24be3 100644
--- a/tfhe/src/core_crypto/gpu/vec.rs
+++ b/tfhe/src/core_crypto/gpu/vec.rs
@@ -345,6 +345,15 @@ impl<T: Numeric> CudaVec<T> {
         }
     }
 
+    pub unsafe fn duplicate(&self, streams: &CudaStreams, gpu_index: u32) -> Self {
+        // Copy to the GPU
+        let mut d_vec = CudaVec::new_async(self.len, streams, gpu_index);
+        d_vec.copy_from_gpu_async(self, streams, gpu_index);
+        streams.synchronize();
+
+        d_vec
+    }
+
     #[allow(clippy::needless_pass_by_ref_mut)]
     pub(crate) fn as_mut_c_ptr(&mut self, gpu_index: u32) -> *mut c_void {
         self.ptr[gpu_index as usize]