From 3ba61c0694b22b98da17f8936110c354f2077c49 Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Thu, 25 Jul 2024 16:54:21 -0300
Subject: [PATCH] refactor(gpu): fix sample extraction when nth > 0 and keep input unchanged

---
 .../cuda/include/ciphertext.h                 |   7 +
 .../cuda/src/crypto/ciphertext.cu             |  56 ++++++++
 .../cuda/src/crypto/ciphertext.cuh            |  36 +++++
 .../cuda/src/polynomial/functions.cuh         |  58 ++++----
 backends/tfhe-cuda-backend/src/cuda_bind.rs   |  10 ++
 .../gpu/algorithms/glwe_sample_extraction.rs  |  60 +++++++++
 tfhe/src/core_crypto/gpu/algorithms/mod.rs    |   1 +
 .../algorithms/test/glwe_sample_extraction.rs | 125 ++++++++++++++++++
 .../core_crypto/gpu/algorithms/test/mod.rs    |   1 +
 tfhe/src/core_crypto/gpu/mod.rs               |  26 ++++
 10 files changed, 346 insertions(+), 34 deletions(-)
 create mode 100644 tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
 create mode 100644 tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs

diff --git a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
index 2fc1d52c1d..bba21890b7 100644
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -1,6 +1,7 @@
 #ifndef CUDA_CIPHERTEXT_H
 #define CUDA_CIPHERTEXT_H
 
+#include "device.h"
 #include <cstdint>
 
 extern "C" {
@@ -14,5 +15,11 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
                                                   void *dest, void *src,
                                                   uint32_t number_of_cts,
                                                   uint32_t lwe_dimension);
+
+void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
+                                 void *lwe_array_out, void *glwe_array_in,
+                                 uint32_t *nth_array, uint32_t num_nths,
+                                 uint32_t glwe_dimension,
+                                 uint32_t polynomial_size);
 };
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
index ac7c591b00..869e482fd4 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -1,4 +1,5 @@
 #include "ciphertext.cuh"
+#include "polynomial/parameters.cuh"
 
 void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                   uint32_t gpu_index,
@@ -19,3 +20,58 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
       static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
       (uint64_t *)src, number_of_cts, lwe_dimension);
 }
+
+void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
+                                 void *lwe_array_out, void *glwe_array_in,
+                                 uint32_t *nth_array, uint32_t num_nths,
+                                 uint32_t glwe_dimension,
+                                 uint32_t polynomial_size) {
+
+  switch (polynomial_size) {
+  case 256:
+    host_sample_extract<uint64_t, AmortizedDegree<256>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  case 512:
+    host_sample_extract<uint64_t, AmortizedDegree<512>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  case 1024:
+    host_sample_extract<uint64_t, AmortizedDegree<1024>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  case 2048:
+    host_sample_extract<uint64_t, AmortizedDegree<2048>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  case 4096:
+    host_sample_extract<uint64_t, AmortizedDegree<4096>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  case 8192:
+    host_sample_extract<uint64_t, AmortizedDegree<8192>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  case 16384:
+    host_sample_extract<uint64_t, AmortizedDegree<16384>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
+        glwe_dimension);
+    break;
+  default:
+    PANIC("Cuda error: unsupported polynomial size. Supported "
+          "N's are powers of two in the interval [256..16384].")
+  }
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
index 6d5baf6191..b45353a616 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -3,6 +3,7 @@
 
 #include "ciphertext.h"
 #include "device.h"
+#include "polynomial/functions.cuh"
 #include <cstdint>
 
 template <typename Torus>
@@ -25,4 +26,39 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
   cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
 }
 
+template <typename Torus, class params>
+__global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
+                               uint32_t *nth_array, uint32_t glwe_dimension) {
+
+  const int input_id = blockIdx.x;
+
+  const int glwe_input_size = (glwe_dimension + 1) * params::degree;
+  const int lwe_output_size = glwe_dimension * params::degree + 1;
+
+  auto lwe_out = lwe_array_out + input_id * lwe_output_size;
+
+  // Consecutive blocks of polynomial_size extractions read from the same GLWE
+  uint32_t nth_per_glwe = params::degree;
+  auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size;
+
+  auto nth = nth_array[input_id];
+
+  sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
+  sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
+}
+
+template <typename Torus, class params>
+__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
+                                  Torus *lwe_array_out, Torus *glwe_array_in,
+                                  uint32_t *nth_array, uint32_t num_nths,
+                                  uint32_t glwe_dimension) {
+  cudaSetDevice(gpu_index);
+
+  dim3 grid(num_nths);
+  dim3 thds(params::degree / params::opt);
+  sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
+      lwe_array_out, glwe_array_in, nth_array, glwe_dimension);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
index b2384769ef..f94c8e5848 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -191,65 +191,55 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
 
 // Extracts the body of the nth-LWE in a GLWE.
 template <typename Torus, class params>
-__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
+__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
                                     uint32_t glwe_dimension, uint32_t nth = 0) {
-  // Set first coefficient of the accumulator as the body of the LWE sample
+  // Set the nth coefficient of the GLWE body as the body of the LWE sample
   lwe_array_out[glwe_dimension * params::degree] =
-      accumulator[glwe_dimension * params::degree + nth];
+      glwe[glwe_dimension * params::degree + nth];
 }
 
 // Extracts the mask from the nth-LWE in a GLWE.
 template <typename Torus, class params>
-__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
-                                    uint32_t num_poly = 1, uint32_t nth = 0) {
-  for (int z = 0; z < num_poly; z++) {
+__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
+                                    uint32_t glwe_dimension = 1,
+                                    uint32_t nth = 0) {
+  for (int z = 0; z < glwe_dimension; z++) {
     Torus *lwe_array_out_slice =
         (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
-    Torus *accumulator_slice =
-        (Torus *)accumulator + (ptrdiff_t)(z * params::degree);
+    Torus *glwe_slice = (Torus *)glwe + (ptrdiff_t)(z * params::degree);
 
     synchronize_threads_in_block();
 
-    // Reverse the accumulator
+    // Reverse the glwe
+    // Set ACC = -ACC
     int tid = threadIdx.x;
     Torus result[params::opt];
#pragma unroll
     for (int i = 0; i < params::opt; i++) {
-      result[i] = accumulator_slice[params::degree - tid - 1];
-      tid = tid + params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-
-    // Set ACC = -ACC
-    tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      accumulator_slice[tid] =
-          SEL(-result[i], result[i], tid >= params::degree - nth);
+      auto x = glwe_slice[params::degree - tid - 1];
+      result[i] = SEL(-x, x, tid >= params::degree - nth);
       tid = tid + params::degree / params::opt;
     }
     synchronize_threads_in_block();
 
     // Perform ACC * X
     // (equivalent to multiply_by_monomial_negacyclic_inplace(1))
+    // Copy to the mask of the LWE sample
     tid = threadIdx.x;
-    result[params::opt];
     for (int i = 0; i < params::opt; i++) {
       // if (tid < 1)
-      //   result[i] = -accumulator_slice[tid - 1 + params::degree];
+      //   result[i] = -glwe_slice[tid - 1 + params::degree];
       // else
-      //   result[i] = accumulator_slice[tid - 1];
-      int x = tid - 1 + SEL(0, params::degree - nth, tid < 1);
-      result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
+      //   result[i] = glwe_slice[tid - 1];
+      uint32_t dst_idx = tid + 1 + nth;
+      if (dst_idx == params::degree)
+        lwe_array_out_slice[0] = -result[i];
+      else {
+        dst_idx =
+            SEL(dst_idx, dst_idx - params::degree, dst_idx >= params::degree);
+        lwe_array_out_slice[dst_idx] = result[i];
+      }
 
-    // Copy to the mask of the LWE sample
-    tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      lwe_array_out_slice[tid] = result[i];
-      tid = tid + params::degree / params::opt;
+      tid += params::degree / params::opt;
     }
   }
 }
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index 10865097cd..9c1fe878cb 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -596,6 +596,16 @@ extern "C" {
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
     );
+    pub fn cuda_glwe_sample_extract_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut c_void,
+        glwe_array_in: *const c_void,
+        nth_array: *const u32,
+        num_nths: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+    );
 
     pub fn scratch_cuda_integer_radix_comparison_kb_64(
         streams: *const *mut c_void,
diff --git a/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs b/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
new file mode 100644
index 0000000000..8f6e44a815
--- /dev/null
+++ b/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
@@ -0,0 +1,60 @@
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
+use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
+use crate::core_crypto::gpu::vec::CudaVec;
+use crate::core_crypto::gpu::{extract_lwe_samples_from_glwe_ciphertext_list_async, CudaStreams};
+use crate::core_crypto::prelude::{MonomialDegree, UnsignedTorus};
+use itertools::Itertools;
+
+/// For each [`GLWE Ciphertext`](`CudaGlweCiphertextList`) given as input and each requested
+/// `MonomialDegree`, extract the corresponding coefficient as an
+/// [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is GPU-accelerated.
+pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
+    input_glwe_list: &CudaGlweCiphertextList<Scalar>,
+    output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
+    vec_nth: &[MonomialDegree],
+    streams: &CudaStreams,
+) where
+    // CastInto required for PBS modulus switch which returns a usize
+    Scalar: UnsignedTorus,
+{
+    let in_lwe_dim = input_glwe_list
+        .glwe_dimension()
+        .to_equivalent_lwe_dimension(input_glwe_list.polynomial_size());
+
+    let out_lwe_dim = output_lwe_list.lwe_dimension();
+
+    assert_eq!(
+        in_lwe_dim, out_lwe_dim,
+        "Mismatch between equivalent LweDimension of input ciphertext and output ciphertext. \
+        Got {in_lwe_dim:?} for input and {out_lwe_dim:?} for output.",
+    );
+
+    assert_eq!(
+        vec_nth.len(),
+        input_glwe_list.glwe_ciphertext_count().0 * input_glwe_list.polynomial_size().0,
+        "Mismatch between number of nths and total number of coefficients in the GLWE list.",
+    );
+
+    assert_eq!(
+        input_glwe_list.ciphertext_modulus(),
+        output_lwe_list.ciphertext_modulus(),
+        "Mismatched moduli between input_glwe ({:?}) and output_lwe ({:?})",
+        input_glwe_list.ciphertext_modulus(),
+        output_lwe_list.ciphertext_modulus()
+    );
+
+    let nth_array: Vec<u32> = vec_nth.iter().map(|x| x.0 as u32).collect_vec();
+    let gpu_indexes = &streams.gpu_indexes;
+    unsafe {
+        let d_nth_array = CudaVec::from_cpu_async(&nth_array, streams, gpu_indexes[0]);
+        extract_lwe_samples_from_glwe_ciphertext_list_async(
+            streams,
+            &mut output_lwe_list.0.d_vec,
+            &input_glwe_list.0.d_vec,
+            &d_nth_array,
+            vec_nth.len() as u32,
+            input_glwe_list.glwe_dimension(),
+            input_glwe_list.polynomial_size(),
+        );
+    }
+}
diff --git a/tfhe/src/core_crypto/gpu/algorithms/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/mod.rs
index 9dafafcbfe..ee6ddd2b1d 100644
--- a/tfhe/src/core_crypto/gpu/algorithms/mod.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/mod.rs
@@ -2,6 +2,7 @@ pub mod lwe_linear_algebra;
 pub mod lwe_multi_bit_programmable_bootstrapping;
 pub mod lwe_programmable_bootstrapping;
 
+pub mod glwe_sample_extraction;
 mod lwe_keyswitch;
 #[cfg(test)]
 mod test;
diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs b/tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs
new file mode 100644
index 0000000000..51018d352f
--- /dev/null
+++ b/tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs
@@ -0,0 +1,125 @@
+use super::*;
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
+use crate::core_crypto::gpu::glwe_sample_extraction::cuda_extract_lwe_samples_from_glwe_ciphertext_list;
+use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
+use crate::core_crypto::gpu::CudaStreams;
+use itertools::Itertools;
+
+#[cfg(not(tarpaulin))]
+const NB_TESTS: usize = 10;
+#[cfg(tarpaulin)]
+const NB_TESTS: usize = 1;
+
+fn glwe_encrypt_sample_extract_decrypt_custom_mod<Scalar: UnsignedTorus>(
+    params: ClassicTestParams<Scalar>,
+) {
+    let glwe_dimension = params.glwe_dimension;
+    let polynomial_size = params.polynomial_size;
+    let glwe_noise_distribution = params.glwe_noise_distribution;
+    let ciphertext_modulus = params.ciphertext_modulus;
+    let message_modulus_log = params.message_modulus_log;
+    let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
+
+    let mut rsc = TestResources::new();
+
+    let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
+    let delta: Scalar = encoding_with_padding / msg_modulus;
+
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(gpu_index);
+
+    let mut msgs = vec![];
+
+    // Build msg
+    // TODO: Can't we collect from (0..msg_modulus) if msg_modulus is Scalar?
+    let mut msg = msg_modulus;
+    msg = msg.wrapping_sub(Scalar::ONE);
+    while msg != Scalar::ZERO {
+        msgs.push(msg);
+        msg = msg.wrapping_sub(Scalar::ONE);
+    }
+
+    // Run tests
+    for _ in 0..NB_TESTS {
+        let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
+            glwe_dimension,
+            polynomial_size,
+            &mut rsc.secret_random_generator,
+        );
+
+        let equivalent_lwe_sk = glwe_sk.clone().into_lwe_secret_key();
+
+        let mut glwe_list = GlweCiphertextList::new(
+            Scalar::ZERO,
+            glwe_dimension.to_glwe_size(),
+            polynomial_size,
+            GlweCiphertextCount(msgs.len()),
+            ciphertext_modulus,
+        );
+
+        let cleartext_list = msgs
+            .iter()
+            .flat_map(|&x| vec![x * delta; glwe_list.polynomial_size().0])
+            .collect_vec();
+
+        let plaintext_list = PlaintextList::from_container(cleartext_list);
+        encrypt_glwe_ciphertext_list(
+            &glwe_sk,
+            &mut glwe_list,
+            &plaintext_list,
+            glwe_noise_distribution,
+            &mut rsc.encryption_random_generator,
+        );
+
+        let input_cuda_glwe_list =
+            CudaGlweCiphertextList::from_glwe_ciphertext_list(&glwe_list, &streams);
+
+        let mut output_cuda_lwe_ciphertext_list = CudaLweCiphertextList::new(
+            equivalent_lwe_sk.lwe_dimension(),
+            LweCiphertextCount(msgs.len() * glwe_list.polynomial_size().0),
+            ciphertext_modulus,
+            &streams,
+        );
+
+        let nths = (0..(msgs.len() * glwe_list.polynomial_size().0))
+            .map(|x| MonomialDegree(x % glwe_list.polynomial_size().0))
+            .collect_vec();
+
+        cuda_extract_lwe_samples_from_glwe_ciphertext_list(
+            &input_cuda_glwe_list,
+            &mut output_cuda_lwe_ciphertext_list,
+            nths.as_slice(),
+            &streams,
+        );
+
+        let gpu_output_lwe_ciphertext_list =
+            output_cuda_lwe_ciphertext_list.to_lwe_ciphertext_list(&streams);
+
+        let mut output_plaintext_list = PlaintextList::new(
+            Scalar::ZERO,
+            PlaintextCount(gpu_output_lwe_ciphertext_list.lwe_ciphertext_count().0),
+        );
+
+        decrypt_lwe_ciphertext_list(
+            &equivalent_lwe_sk,
+            &gpu_output_lwe_ciphertext_list,
+            &mut output_plaintext_list,
+        );
+
+        let mut decoded = vec![Scalar::ZERO; plaintext_list.plaintext_count().0];
+
+        decoded
+            .iter_mut()
+            .zip(output_plaintext_list.iter())
+            .for_each(|(dst, src)| *dst = round_decode(*src.0, delta) % msg_modulus);
+
+        let mut count = msg_modulus;
+        count = count.wrapping_sub(Scalar::ONE);
+        for result in decoded.chunks_exact(glwe_list.polynomial_size().0) {
+            assert!(result.iter().all(|&x| x == count));
+            count = count.wrapping_sub(Scalar::ONE);
+        }
+    }
+}
+
+create_gpu_parametrized_test!(glwe_encrypt_sample_extract_decrypt_custom_mod);
diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs
index 3b30f96b1c..59ba20af18 100644
--- a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs
@@ -1,5 +1,6 @@
 use crate::core_crypto::algorithms::test::*;
 
+mod glwe_sample_extraction;
 mod lwe_keyswitch;
 mod lwe_linear_algebra;
 mod lwe_multi_bit_programmable_bootstrapping;
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index a52ff80d09..3341ba6e5e 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -326,6 +326,32 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async(
+
+/// # Safety
+///
+/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
+/// required
+pub unsafe fn extract_lwe_samples_from_glwe_ciphertext_list_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    lwe_array_out: &mut CudaVec<T>,
+    glwe_array_in: &CudaVec<T>,
+    nth_array: &CudaVec<u32>,
+    num_nths: u32,
+    glwe_dimension: GlweDimension,
+    polynomial_size: PolynomialSize,
+) {
+    cuda_glwe_sample_extract_64(
+        streams.ptr[0],
+        streams.gpu_indexes[0],
+        lwe_array_out.as_mut_c_ptr(0),
+        glwe_array_in.as_c_ptr(0),
+        nth_array.as_c_ptr(0).cast::<u32>(),
+        num_nths,
+        glwe_dimension.0 as u32,
+        polynomial_size.0 as u32,
+    );
+}
+
 /// Discarding addition of a vector of LWE ciphertexts
 ///
 /// # Safety
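
For reference, the coefficient mapping implemented by sample_extract_mask and
sample_extract_body in this patch can be written out on the CPU as the sketch
below. This is illustrative only and not part of the patch: the function name
and the flat [A_0 | ... | A_{k-1} | B] container layout are assumptions, with
u64 wrapping arithmetic standing in for torus arithmetic.

    /// Extract, from one GLWE (k mask polynomials of size n followed by the body),
    /// the LWE encrypting the nth coefficient of the message polynomial.
    fn sample_extract_reference(glwe: &[u64], k: usize, n: usize, nth: usize) -> Vec<u64> {
        let mut lwe = vec![0u64; k * n + 1];
        for z in 0..k {
            let a = &glwe[z * n..(z + 1) * n];
            for i in 0..n {
                // a'[z*n + i] = A_z[nth - i] for i <= nth, -A_z[n + nth - i] otherwise
                lwe[z * n + i] = if i <= nth {
                    a[nth - i]
                } else {
                    a[n + nth - i].wrapping_neg()
                };
            }
        }
        // b' = B[nth]: the nth coefficient of the GLWE body
        lwe[k * n] = glwe[k * n + nth];
        lwe
    }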