refactor(gpu): fix sample extraction when nth > 0 and keep input unchanged
pdroalves committed Jul 25, 2024
1 parent 2004333 commit d3ea654
Showing 15 changed files with 617 additions and 34 deletions.
16 changes: 16 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/functions.h
@@ -0,0 +1,16 @@
#ifndef CUDA_FUNCTIONS_H_
#define CUDA_FUNCTIONS_H_

#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include <cstdint>

extern "C" {
void cuda_glwe_sample_extract_64(void **streams, uint32_t *gpu_indexes,
                                 uint32_t gpu_count, void *lwe_array_out,
                                 void *glwe_array_in, uint32_t *nth_array,
                                 uint32_t num_samples, uint32_t glwe_dimension,
                                 uint32_t polynomial_size);
}

#endif
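For illustration only, a minimal host-side driver for this entry point might look like the sketch below. It assumes a single GPU, a raw cudaStream_t passed through the void ** parameter, the header above being on the include path, and the buffer layout used by the kernels in this commit: (glwe_dimension + 1) * polynomial_size input coefficients, glwe_dimension * polynomial_size + 1 output coefficients per extracted sample, and an nth_array resident in device memory. All names in the sketch are hypothetical and not part of the diff.

#include <cuda_runtime.h>
#include <cstdint>
#include "functions.h"

// Hypothetical single-GPU driver: extract one sample (nth = 3) from one GLWE.
void extract_one_sample(const uint64_t *h_glwe, uint64_t *h_lwe_out,
                        uint32_t glwe_dimension, uint32_t polynomial_size) {
  uint32_t gpu_index = 0;
  uint32_t num_samples = 1;
  uint32_t h_nth[1] = {3};

  size_t glwe_len = (size_t)(glwe_dimension + 1) * polynomial_size;
  size_t lwe_len = (size_t)glwe_dimension * polynomial_size + 1;

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  uint64_t *d_glwe = nullptr, *d_lwe_out = nullptr;
  uint32_t *d_nth = nullptr;
  cudaMalloc(&d_glwe, glwe_len * sizeof(uint64_t));
  cudaMalloc(&d_lwe_out, lwe_len * sizeof(uint64_t));
  cudaMalloc(&d_nth, sizeof(uint32_t));
  cudaMemcpyAsync(d_glwe, h_glwe, glwe_len * sizeof(uint64_t),
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_nth, h_nth, sizeof(uint32_t), cudaMemcpyHostToDevice,
                  stream);

  // The backend casts the void ** back to cudaStream_t *, so an array holding
  // one raw stream is enough for the single-GPU case.
  void *streams[1] = {(void *)stream};
  cuda_glwe_sample_extract_64(streams, &gpu_index, /*gpu_count=*/1, d_lwe_out,
                              d_glwe, d_nth, num_samples, glwe_dimension,
                              polynomial_size);

  cudaMemcpyAsync(h_lwe_out, d_lwe_out, lwe_len * sizeof(uint64_t),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  cudaFree(d_glwe);
  cudaFree(d_lwe_out);
  cudaFree(d_nth);
  cudaStreamDestroy(stream);
}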
1 change: 1 addition & 0 deletions backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -1,6 +1,7 @@
#ifndef CNCRT_TORUS_CUH
#define CNCRT_TORUS_CUH

#include "device.h"
#include "types/int128.cuh"
#include <limits>

56 changes: 56 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cu
@@ -0,0 +1,56 @@
#include "functions.h"

void cuda_glwe_sample_extract_64(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, void *lwe_array_out,
void *glwe_in, uint32_t *nth_array,
uint32_t num_samples, uint32_t glwe_dimension,
uint32_t polynomial_size) {

switch (polynomial_size) {
case 256:
host_sample_extract<uint64_t, AmortizedDegree<256>>(
(cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
(uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
glwe_dimension);
break;
case 512:
host_sample_extract<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
(uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
glwe_dimension);
break;
case 1024:
host_sample_extract<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
(uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
glwe_dimension);
break;
case 2048:
host_sample_extract<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
(uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
glwe_dimension);
break;
case 4096:
host_sample_extract<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
(uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
glwe_dimension);
break;
case 8192:
host_sample_extract<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
(uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
glwe_dimension);
break;
case 16384:
host_sample_extract<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
(uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
glwe_dimension);
break;
default:
PANIC("Cuda error: unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..16384].")
}
}
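The switch above exists because the device code needs the polynomial size as a compile-time constant: params::degree and params::opt size the per-thread register array and fix the block dimension params::degree / params::opt. A stripped-down sketch of this runtime-to-compile-time dispatch pattern, with hypothetical names, is:

#include <cstdint>

// Hypothetical illustration of the dispatch pattern: a runtime size selects a
// template instantiation whose constants can be used in device code.
template <uint32_t N> struct DegreeParams {
  static constexpr uint32_t degree = N; // polynomial size
  static constexpr uint32_t opt = 8;    // coefficients handled per thread
};

template <class params> void launch_for_degree() {
  // The real backend calls host_sample_extract<uint64_t, params> at this
  // point, launching params::degree / params::opt threads per block.
}

void dispatch_on_polynomial_size(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 256:
    launch_for_degree<DegreeParams<256>>();
    break;
  case 512:
    launch_for_degree<DegreeParams<512>>();
    break;
  default:
    break; // unsupported size
  }
}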
87 changes: 53 additions & 34 deletions backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -191,67 +191,86 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
 
 // Extracts the body of the nth-LWE in a GLWE.
 template <typename Torus, class params>
-__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
+__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
                                     uint32_t glwe_dimension, uint32_t nth = 0) {
-  // Set first coefficient of the accumulator as the body of the LWE sample
+  // Set first coefficient of the glwe as the body of the LWE sample
   lwe_array_out[glwe_dimension * params::degree] =
-      accumulator[glwe_dimension * params::degree + nth];
+      glwe[glwe_dimension * params::degree + nth];
 }
 
 // Extracts the mask from the nth-LWE in a GLWE.
 template <typename Torus, class params>
-__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
-                                    uint32_t num_poly = 1, uint32_t nth = 0) {
-  for (int z = 0; z < num_poly; z++) {
+__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
+                                    uint32_t glwe_dimension = 1,
+                                    uint32_t nth = 0) {
+  for (int z = 0; z < glwe_dimension; z++) {
     Torus *lwe_array_out_slice =
         (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
-    Torus *accumulator_slice =
-        (Torus *)accumulator + (ptrdiff_t)(z * params::degree);
+    Torus *glwe_slice = (Torus *)glwe + (ptrdiff_t)(z * params::degree);
 
     synchronize_threads_in_block();
-    // Reverse the accumulator
+    // Reverse the glwe
+    // Set ACC = -ACC
     int tid = threadIdx.x;
     Torus result[params::opt];
 #pragma unroll
     for (int i = 0; i < params::opt; i++) {
-      result[i] = accumulator_slice[params::degree - tid - 1];
-      tid = tid + params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-
-    // Set ACC = -ACC
-    tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      accumulator_slice[tid] =
-          SEL(-result[i], result[i], tid >= params::degree - nth);
+      auto x = glwe_slice[params::degree - tid - 1];
+      result[i] = SEL(-x, x, tid >= params::degree - nth);
       tid = tid + params::degree / params::opt;
     }
     synchronize_threads_in_block();
 
     // Perform ACC * X
     // (equivalent to multiply_by_monomial_negacyclic_inplace(1))
+    // Copy to the mask of the LWE sample
     tid = threadIdx.x;
-    result[params::opt];
     for (int i = 0; i < params::opt; i++) {
       // if (tid < 1)
-      //   result[i] = -accumulator_slice[tid - 1 + params::degree];
+      //   result[i] = -glwe_slice[tid - 1 + params::degree];
       // else
-      //   result[i] = accumulator_slice[tid - 1];
-      int x = tid - 1 + SEL(0, params::degree - nth, tid < 1);
-      result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
+      //   result[i] = glwe_slice[tid - 1];
+      uint32_t dst_idx = tid + 1 + nth;
+      if (dst_idx == params::degree)
+        lwe_array_out_slice[0] = -result[i];
+      else {
+        dst_idx =
+            SEL(dst_idx, dst_idx - params::degree, dst_idx >= params::degree);
+        lwe_array_out_slice[dst_idx] = result[i];
+      }
 
-    // Copy to the mask of the LWE sample
-    tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      lwe_array_out_slice[tid] = result[i];
-      tid = tid + params::degree / params::opt;
+      tid += params::degree / params::opt;
     }
   }
 }
 
+template <typename Torus, class params>
+__global__ void apply_sample_extract(Torus *lwe_array_out, Torus *glwe_in,
+                                     uint32_t *nth_array,
+                                     uint32_t glwe_dimension) {
+
+  const int input_id = blockIdx.x;
+
+  const int lwe_output_size = glwe_dimension * params::degree + 1;
+
+  auto lwe_out = lwe_array_out + input_id * lwe_output_size;
+  auto nth = nth_array[input_id];
+
+  sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
+  sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
+}
+
+template <typename Torus, class params>
+__host__ void host_sample_extract(cudaStream_t *streams, Torus *lwe_array_out,
+                                  Torus *glwe_in, uint32_t *nth_array,
+                                  uint32_t num_samples,
+                                  uint32_t glwe_dimension) {
+
+  dim3 grid(num_samples);
+  dim3 thds(params::degree / params::opt);
+  apply_sample_extract<Torus, params><<<grid, thds, 0, streams[0]>>>(
+      lwe_array_out, glwe_in, nth_array, glwe_dimension);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif
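The dst_idx arithmetic in the rewritten mask loop corresponds to the standard sample-extraction index map: to extract the nth coefficient, each GLWE mask polynomial a of size N contributes a'[i] = a[nth - i] for i <= nth and a'[i] = -a[N + nth - i] for i > nth, and the LWE body is the nth coefficient of the GLWE body. A scalar CPU reference of that formula, a hypothetical helper not part of the commit and useful only as a cross-check against the kernel output, might look like:

#include <cstdint>
#include <vector>

// Scalar CPU reference for GLWE sample extraction (hypothetical, for
// cross-checking only). The GLWE is laid out as glwe_dimension mask
// polynomials followed by the body polynomial, each of length N; the output
// LWE is glwe_dimension * N mask coefficients followed by one body
// coefficient, matching the GPU layout.
std::vector<uint64_t> sample_extract_reference(const std::vector<uint64_t> &glwe,
                                               uint32_t glwe_dimension,
                                               uint32_t N, uint32_t nth) {
  std::vector<uint64_t> lwe(glwe_dimension * N + 1);
  for (uint32_t z = 0; z < glwe_dimension; z++) {
    const uint64_t *a = &glwe[z * N];
    uint64_t *out = &lwe[z * N];
    for (uint32_t i = 0; i < N; i++) {
      // Coefficient of X^nth in the negacyclic product a(X) * s(X): s_i is
      // multiplied by a_{nth-i} when i <= nth and by -a_{N+nth-i} otherwise
      // (the wrap-around past X^N picks up a sign flip).
      out[i] = (i <= nth) ? a[nth - i] : -a[N + nth - i];
    }
  }
  // The body of the extracted LWE is the nth coefficient of the GLWE body.
  lwe[glwe_dimension * N] = glwe[glwe_dimension * N + nth];
  return lwe;
}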
1 change: 1 addition & 0 deletions backends/tfhe-cuda-backend/cuda/src/types/int128.cuh
@@ -1,6 +1,7 @@
#ifndef CNCRT_INT128_CUH
#define CNCRT_INT128_CUH

#include <cstdint>
// abseil's int128 type
// licensed under Apache license

11 changes: 11 additions & 0 deletions backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -628,6 +628,17 @@ extern "C" {
        gpu_count: u32,
        mem_ptr: *mut *mut i8,
    );
    pub fn cuda_glwe_sample_extract_64(
        streams: *const *mut c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        lwe_array_out: *mut c_void,
        glwe_in: *const c_void,
        nth_array: *const u32,
        num_samples: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
    );

    pub fn scratch_cuda_integer_radix_comparison_kb_64(
        streams: *const *mut c_void,
52 changes: 52 additions & 0 deletions tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
@@ -0,0 +1,52 @@
use crate::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext;
use crate::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::{extract_lwe_sample_from_glwe_ciphertext_async, CudaStreams};
use crate::core_crypto::prelude::{LweCiphertextCount, MonomialDegree, UnsignedTorus};

/// Extract the nth coefficient from the body of a [`GLWE Ciphertext`](`CudaGlweCiphertext`) as an
/// [`LWE ciphertext`](`CudaLweCiphertext`). This variant is GPU-accelerated.
pub fn cuda_extract_lwe_sample_from_glwe_ciphertext<Scalar>(
    input_glwe: &CudaGlweCiphertext<Scalar>,
    output_lwe: &mut CudaLweCiphertext<Scalar>,
    nth: MonomialDegree,
    streams: &CudaStreams,
) where
    // CastInto required for PBS modulus switch which returns a usize
    Scalar: UnsignedTorus,
{
    let in_lwe_dim = input_glwe
        .glwe_dimension()
        .to_equivalent_lwe_dimension(input_glwe.polynomial_size());

    let out_lwe_dim = output_lwe.lwe_dimension();

    assert_eq!(
        in_lwe_dim, out_lwe_dim,
        "Mismatch between equivalent LweDimension of input ciphertext and output ciphertext. \
        Got {in_lwe_dim:?} for input and {out_lwe_dim:?} for output.",
    );

    assert_eq!(
        input_glwe.ciphertext_modulus(),
        output_lwe.ciphertext_modulus(),
        "Mismatched moduli between input_glwe ({:?}) and output_lwe ({:?})",
        input_glwe.ciphertext_modulus(),
        output_lwe.ciphertext_modulus()
    );

    let nth_array: Vec<u32> = vec![nth.0 as u32];
    let gpu_indexes = &streams.gpu_indexes;
    unsafe {
        let d_nth_array = CudaVec::from_cpu_async(&nth_array, streams, gpu_indexes[0]);
        extract_lwe_sample_from_glwe_ciphertext_async(
            streams,
            &mut output_lwe.0.d_vec,
            &input_glwe.0.d_vec,
            &d_nth_array,
            LweCiphertextCount(nth_array.len()),
            input_glwe.glwe_dimension(),
            input_glwe.polynomial_size(),
        );
    }
}
1 change: 1 addition & 0 deletions tfhe/src/core_crypto/gpu/algorithms/mod.rs
@@ -2,6 +2,7 @@ pub mod lwe_linear_algebra;
pub mod lwe_multi_bit_programmable_bootstrapping;
pub mod lwe_programmable_bootstrapping;

pub mod glwe_sample_extraction;
mod lwe_keyswitch;
#[cfg(test)]
mod test;