diff --git a/backends/tfhe-cuda-backend/cuda/include/functions.h b/backends/tfhe-cuda-backend/cuda/include/functions.h
new file mode 100644
index 0000000000..3ee0a11430
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/include/functions.h
@@ -0,0 +1,16 @@
+#ifndef CUDA_FUNCTIONS_H_
+#define CUDA_FUNCTIONS_H_
+
+#include "polynomial/functions.cuh"
+#include "polynomial/parameters.cuh"
+#include <stdint.h>
+
+extern "C" {
+void cuda_glwe_sample_extract_64(void **streams, uint32_t *gpu_indexes,
+                                 uint32_t gpu_count, void *lwe_array_out,
+                                 void *glwe_array_in, uint32_t *nth_array,
+                                 uint32_t num_samples, uint32_t glwe_dimension,
+                                 uint32_t polynomial_size);
+}
+
+#endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
index 177892e5a9..9fc4ad1d8a 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -1,6 +1,7 @@
 #ifndef CNCRT_TORUS_CUH
 #define CNCRT_TORUS_CUH
 
+#include "device.h"
 #include "types/int128.cuh"
 #include <limits>
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cu b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cu
new file mode 100644
index 0000000000..38aedf8f67
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cu
@@ -0,0 +1,56 @@
+#include "functions.h"
+
+void cuda_glwe_sample_extract_64(void **streams, uint32_t *gpu_indexes,
+                                 uint32_t gpu_count, void *lwe_array_out,
+                                 void *glwe_in, uint32_t *nth_array,
+                                 uint32_t num_samples, uint32_t glwe_dimension,
+                                 uint32_t polynomial_size) {
+
+  switch (polynomial_size) {
+  case 256:
+    host_sample_extract<uint64_t, AmortizedDegree<256>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  case 512:
+    host_sample_extract<uint64_t, AmortizedDegree<512>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  case 1024:
+    host_sample_extract<uint64_t, AmortizedDegree<1024>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  case 2048:
+    host_sample_extract<uint64_t, AmortizedDegree<2048>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  case 4096:
+    host_sample_extract<uint64_t, AmortizedDegree<4096>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  case 8192:
+    host_sample_extract<uint64_t, AmortizedDegree<8192>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  case 16384:
+    host_sample_extract<uint64_t, AmortizedDegree<16384>>(
+        (cudaStream_t *)(streams), (uint64_t *)lwe_array_out,
+        (uint64_t *)glwe_in, (uint32_t *)nth_array, num_samples,
+        glwe_dimension);
+    break;
+  default:
+    PANIC("Cuda error: unsupported polynomial size. Supported "
+          "N's are powers of two in the interval [256..16384].")
+  }
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
index b2384769ef..1fa5a61b39 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -191,67 +191,86 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
 
 // Extracts the body of the nth-LWE in a GLWE.
 template <typename Torus, class params>
-__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
+__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
                                     uint32_t glwe_dimension, uint32_t nth = 0) {
-  // Set first coefficient of the accumulator as the body of the LWE sample
+  // Set first coefficient of the glwe as the body of the LWE sample
   lwe_array_out[glwe_dimension * params::degree] =
-      accumulator[glwe_dimension * params::degree + nth];
+      glwe[glwe_dimension * params::degree + nth];
 }
 
 // Extracts the mask from the nth-LWE in a GLWE.
 template <typename Torus, class params>
-__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
-                                    uint32_t num_poly = 1, uint32_t nth = 0) {
-  for (int z = 0; z < num_poly; z++) {
+__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
+                                    uint32_t glwe_dimension = 1,
+                                    uint32_t nth = 0) {
+  for (int z = 0; z < glwe_dimension; z++) {
     Torus *lwe_array_out_slice =
         (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
-    Torus *accumulator_slice =
-        (Torus *)accumulator + (ptrdiff_t)(z * params::degree);
+    Torus *glwe_slice = (Torus *)glwe + (ptrdiff_t)(z * params::degree);
 
     synchronize_threads_in_block();
 
-    // Reverse the accumulator
+    // Reverse the glwe
+    // Set ACC = -ACC
     int tid = threadIdx.x;
     Torus result[params::opt];
#pragma unroll
     for (int i = 0; i < params::opt; i++) {
-      result[i] = accumulator_slice[params::degree - tid - 1];
-      tid = tid + params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-
-    // Set ACC = -ACC
-    tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      accumulator_slice[tid] =
-          SEL(-result[i], result[i], tid >= params::degree - nth);
+      auto x = glwe_slice[params::degree - tid - 1];
+      result[i] = SEL(-x, x, tid >= params::degree - nth);
       tid = tid + params::degree / params::opt;
     }
     synchronize_threads_in_block();
 
     // Perform ACC * X
     // (equivalent to multiply_by_monomial_negacyclic_inplace(1))
+    // Copy to the mask of the LWE sample
     tid = threadIdx.x;
-    result[params::opt];
     for (int i = 0; i < params::opt; i++) {
       // if (tid < 1)
-      //   result[i] = -accumulator_slice[tid - 1 + params::degree];
+      //   result[i] = -glwe_slice[tid - 1 + params::degree];
       // else
-      //   result[i] = accumulator_slice[tid - 1];
-      int x = tid - 1 + SEL(0, params::degree - nth, tid < 1);
-      result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
+      //   result[i] = glwe_slice[tid - 1];
+      uint32_t dst_idx = tid + 1 + nth;
+      if (dst_idx == params::degree)
+        lwe_array_out_slice[0] = -result[i];
+      else {
+        dst_idx =
+            SEL(dst_idx, dst_idx - params::degree, dst_idx >= params::degree);
+        lwe_array_out_slice[dst_idx] = result[i];
+      }
 
-    // Copy to the mask of the LWE sample
-    tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      lwe_array_out_slice[tid] = result[i];
-      tid = tid + params::degree / params::opt;
+      tid += params::degree / params::opt;
     }
   }
 }
 
+template <typename Torus, class params>
+__global__ void apply_sample_extract(Torus *lwe_array_out, Torus *glwe_in,
+                                     uint32_t *nth_array,
+                                     uint32_t glwe_dimension) {
+
+  const int input_id = blockIdx.x;
+
+  const int lwe_output_size = glwe_dimension * params::degree + 1;
+
+  auto lwe_out = lwe_array_out + input_id * lwe_output_size;
+  auto nth = nth_array[input_id];
+
+  sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
+  sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
+}
+
+template <typename Torus, class params>
+__host__ void host_sample_extract(cudaStream_t *streams, Torus *lwe_array_out,
+                                  Torus *glwe_in, uint32_t *nth_array,
+                                  uint32_t num_samples,
+                                  uint32_t glwe_dimension) {
+
+  dim3 grid(num_samples);
+  dim3 thds(params::degree / params::opt);
+  apply_sample_extract<Torus, params><<<grid, thds, 0, streams[0]>>>(
+      lwe_array_out, glwe_in, nth_array, glwe_dimension);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif
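Note (not part of the diff): the index arithmetic in the rewritten `sample_extract_mask` — reversal of each slice, selective negation, then the `dst_idx = tid + 1 + nth` rotation with a sign flip at the wrap-around — realizes the standard negacyclic sample-extraction mapping. Writing the input GLWE over $\mathbb{Z}_q[X]/(X^N + 1)$ as mask polynomials $A_z$ and body $B$, extraction at monomial degree $n$ produces the LWE ciphertext

$$
b = B_n, \qquad
a_{zN + j} =
\begin{cases}
  A_{z,\,n-j} & \text{if } 0 \le j \le n,\\
  -A_{z,\,N+n-j} & \text{if } n < j < N,
\end{cases}
\qquad z = 0, \dots, k-1.
$$

Compared to the old version, each output coefficient is now written directly to `lwe_array_out_slice` instead of being staged in the accumulator, which drops two block synchronizations per slice and, importantly, no longer mutates the input GLWE in place.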
diff --git a/backends/tfhe-cuda-backend/cuda/src/types/int128.cuh b/backends/tfhe-cuda-backend/cuda/src/types/int128.cuh
index ae925fadb2..fd7e4921ab 100644
--- a/backends/tfhe-cuda-backend/cuda/src/types/int128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/types/int128.cuh
@@ -1,6 +1,7 @@
 #ifndef CNCRT_INT128_CUH
 #define CNCRT_INT128_CUH
 
+#include <cstdint>
 
 // abseil's int128 type
 // licensed under Apache license
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index 322be12f2a..a0f0d317d5 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -628,6 +628,17 @@ extern "C" {
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
     );
+    pub fn cuda_glwe_sample_extract_64(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut c_void,
+        glwe_in: *const c_void,
+        nth_array: *const u32,
+        num_samples: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+    );
 
     pub fn scratch_cuda_integer_radix_comparison_kb_64(
         streams: *const *mut c_void,
diff --git a/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs b/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
new file mode 100644
index 0000000000..d498888ba5
--- /dev/null
+++ b/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
@@ -0,0 +1,52 @@
+use crate::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext;
+use crate::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext;
+use crate::core_crypto::gpu::vec::CudaVec;
+use crate::core_crypto::gpu::{extract_lwe_sample_from_glwe_ciphertext_async, CudaStreams};
+use crate::core_crypto::prelude::{LweCiphertextCount, MonomialDegree, UnsignedTorus};
+
+/// Extract the nth coefficient from the body of a [`GLWE Ciphertext`](`CudaGlweCiphertext`) as an
+/// [`LWE ciphertext`](`CudaLweCiphertext`). This variant is GPU-accelerated.
+pub fn cuda_extract_lwe_sample_from_glwe_ciphertext<Scalar>(
+    input_glwe: &CudaGlweCiphertext<Scalar>,
+    output_lwe: &mut CudaLweCiphertext<Scalar>,
+    nth: MonomialDegree,
+    streams: &CudaStreams,
+) where
+    // CastInto required for PBS modulus switch which returns a usize
+    Scalar: UnsignedTorus,
+{
+    let in_lwe_dim = input_glwe
+        .glwe_dimension()
+        .to_equivalent_lwe_dimension(input_glwe.polynomial_size());
+
+    let out_lwe_dim = output_lwe.lwe_dimension();
+
+    assert_eq!(
+        in_lwe_dim, out_lwe_dim,
+        "Mismatch between equivalent LweDimension of input ciphertext and output ciphertext. \
+        Got {in_lwe_dim:?} for input and {out_lwe_dim:?} for output.",
+    );
+
+    assert_eq!(
+        input_glwe.ciphertext_modulus(),
+        output_lwe.ciphertext_modulus(),
+        "Mismatched moduli between input_glwe ({:?}) and output_lwe ({:?})",
+        input_glwe.ciphertext_modulus(),
+        output_lwe.ciphertext_modulus()
+    );
+
+    let nth_array: Vec<u32> = vec![nth.0 as u32];
+    let gpu_indexes = &streams.gpu_indexes;
+    unsafe {
+        let d_nth_array = CudaVec::from_cpu_async(&nth_array, streams, gpu_indexes[0]);
+        extract_lwe_sample_from_glwe_ciphertext_async(
+            streams,
+            &mut output_lwe.0.d_vec,
+            &input_glwe.0.d_vec,
+            &d_nth_array,
+            LweCiphertextCount(nth_array.len()),
+            input_glwe.glwe_dimension(),
+            input_glwe.polynomial_size(),
+        );
+    }
+}
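A minimal usage sketch for the new entry point (not part of the diff; `extract_constant_term` is a hypothetical helper name, and the import paths follow the ones used in the test further down):

```rust
use tfhe::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext;
use tfhe::core_crypto::gpu::glwe_sample_extraction::cuda_extract_lwe_sample_from_glwe_ciphertext;
use tfhe::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext;
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::core_crypto::prelude::*;

fn extract_constant_term(
    glwe: &GlweCiphertext<Vec<u64>>,
    streams: &CudaStreams,
) -> LweCiphertext<Vec<u64>> {
    // Move the GLWE ciphertext to the GPU.
    let d_glwe = CudaGlweCiphertext::from_glwe_ciphertext(glwe, streams);

    // The output LWE dimension is the equivalent dimension k * N of the input.
    let mut d_lwe = CudaLweCiphertext::new(
        glwe.glwe_size()
            .to_glwe_dimension()
            .to_equivalent_lwe_dimension(glwe.polynomial_size()),
        glwe.ciphertext_modulus(),
        streams,
    );

    // Extract the coefficient of degree 0 and bring the result back to the CPU.
    cuda_extract_lwe_sample_from_glwe_ciphertext(&d_glwe, &mut d_lwe, MonomialDegree(0), streams);
    d_lwe.into_lwe_ciphertext(streams)
}
```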
diff --git a/tfhe/src/core_crypto/gpu/algorithms/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/mod.rs
index 9dafafcbfe..ee6ddd2b1d 100644
--- a/tfhe/src/core_crypto/gpu/algorithms/mod.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/mod.rs
@@ -2,6 +2,7 @@ pub mod lwe_linear_algebra;
 pub mod lwe_multi_bit_programmable_bootstrapping;
 pub mod lwe_programmable_bootstrapping;
 
+pub mod glwe_sample_extraction;
 mod lwe_keyswitch;
 #[cfg(test)]
 mod test;
diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs b/tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs
new file mode 100644
index 0000000000..2590fae4f8
--- /dev/null
+++ b/tfhe/src/core_crypto/gpu/algorithms/test/glwe_sample_extraction.rs
@@ -0,0 +1,143 @@
+use super::*;
+use crate::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext;
+use crate::core_crypto::gpu::glwe_sample_extraction::cuda_extract_lwe_sample_from_glwe_ciphertext;
+use crate::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext;
+use crate::core_crypto::gpu::CudaStreams;
+
+#[cfg(not(tarpaulin))]
+const NB_TESTS: usize = 10;
+#[cfg(tarpaulin)]
+const NB_TESTS: usize = 1;
+
+fn glwe_encrypt_sample_extract_decrypt_custom_mod<Scalar: UnsignedTorus>(
+    params: ClassicTestParams<Scalar>,
+) {
+    let glwe_dimension = params.glwe_dimension;
+    let polynomial_size = params.polynomial_size;
+    let glwe_noise_distribution = params.glwe_noise_distribution;
+    let ciphertext_modulus = params.ciphertext_modulus;
+    let message_modulus_log = params.message_modulus_log;
+    let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
+
+    let mut rsc = TestResources::new();
+
+    let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
+    let mut msg = msg_modulus;
+    let delta: Scalar = encoding_with_padding / msg_modulus;
+
+    let gpu_index = 0;
+    let stream = CudaStreams::new_single_gpu(gpu_index);
+
+    while msg != Scalar::ZERO {
+        msg = msg.wrapping_sub(Scalar::ONE);
+        for _ in 0..NB_TESTS {
+            let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
+                glwe_dimension,
+                polynomial_size,
+                &mut rsc.secret_random_generator,
+            );
+
+            let equivalent_lwe_sk = glwe_sk.clone().into_lwe_secret_key();
+
+            let mut glwe = GlweCiphertext::new(
+                Scalar::ZERO,
+                glwe_dimension.to_glwe_size(),
+                polynomial_size,
+                ciphertext_modulus,
+            );
+
+            let plaintext_list =
+                PlaintextList::new(msg * delta, PlaintextCount(glwe.polynomial_size().0));
+
+            encrypt_glwe_ciphertext(
+                &glwe_sk,
+                &mut glwe,
+                &plaintext_list,
+                glwe_noise_distribution,
+                &mut rsc.encryption_random_generator,
+            );
+
+            let d_glwe = CudaGlweCiphertext::from_glwe_ciphertext(&glwe, &stream);
+
+            assert!(check_encrypted_content_respects_mod(
+                &glwe,
+                ciphertext_modulus
+            ));
+
+            let mut vec_output_lwe_ciphertext_list = vec![];
+
+            for idx in 0..glwe.polynomial_size().0 {
+                let mut d_output_lwe_ciphertext = CudaLweCiphertext::new(
+                    equivalent_lwe_sk.lwe_dimension(),
+                    ciphertext_modulus,
+                    &stream,
+                );
+                cuda_extract_lwe_sample_from_glwe_ciphertext(
+                    &d_glwe,
+                    &mut d_output_lwe_ciphertext,
+                    MonomialDegree(idx),
+                    &stream,
+                );
+                vec_output_lwe_ciphertext_list.extend(
+                    d_output_lwe_ciphertext
+                        .into_lwe_ciphertext(&stream)
+                        .into_container(),
+                );
+            }
+
+            let gpu_output_lwe_ciphertext_list = LweCiphertextList::from_container(
+                vec_output_lwe_ciphertext_list,
+                equivalent_lwe_sk.lwe_dimension().to_lwe_size(),
+                ciphertext_modulus,
+            );
+
+            assert!(check_encrypted_content_respects_mod(
+                &gpu_output_lwe_ciphertext_list,
+                ciphertext_modulus
+            ));
+
+            let mut cpu_output_lwe_ciphertext_list = LweCiphertextList::new(
+                Scalar::ZERO,
+                equivalent_lwe_sk.lwe_dimension().to_lwe_size(),
+                LweCiphertextCount(glwe.polynomial_size().0),
+                ciphertext_modulus,
+            );
+
+            for (idx, mut output_lwe_ciphertext) in
+                cpu_output_lwe_ciphertext_list.iter_mut().enumerate()
+            {
+                extract_lwe_sample_from_glwe_ciphertext(
+                    &glwe,
+                    &mut output_lwe_ciphertext,
+                    MonomialDegree(idx),
+                );
+            }
+
+            let mut plaintext_list = PlaintextList::new(
+                Scalar::ZERO,
+                PlaintextCount(gpu_output_lwe_ciphertext_list.lwe_ciphertext_count().0),
+            );
+
+            decrypt_lwe_ciphertext_list(
+                &equivalent_lwe_sk,
+                &gpu_output_lwe_ciphertext_list,
+                &mut plaintext_list,
+            );
+
+            let mut decoded = vec![Scalar::ZERO; plaintext_list.plaintext_count().0];
+
+            decoded
+                .iter_mut()
+                .zip(plaintext_list.iter())
+                .for_each(|(dst, src)| *dst = round_decode(*src.0, delta) % msg_modulus);
+
+            assert!(decoded.iter().all(|&x| x == msg));
+        }
+
+        // In coverage, we break after one while loop iteration, changing message values does not
+        // yield higher coverage
+        #[cfg(tarpaulin)]
+        break;
+    }
+}
+
+create_gpu_parametrized_test!(glwe_encrypt_sample_extract_decrypt_custom_mod);
diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs
index 3b30f96b1c..59ba20af18 100644
--- a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs
@@ -1,5 +1,6 @@
 use crate::core_crypto::algorithms::test::*;
 
+mod glwe_sample_extraction;
 mod lwe_keyswitch;
 mod lwe_linear_algebra;
 mod lwe_multi_bit_programmable_bootstrapping;
diff --git a/tfhe/src/core_crypto/gpu/entities/glwe_ciphertext.rs b/tfhe/src/core_crypto/gpu/entities/glwe_ciphertext.rs
new file mode 100644
index 0000000000..fd2a9143c6
--- /dev/null
+++ b/tfhe/src/core_crypto/gpu/entities/glwe_ciphertext.rs
@@ -0,0 +1,97 @@
+use crate::core_crypto::gpu::vec::CudaVec;
+use crate::core_crypto::gpu::{CudaGlweList, CudaStreams};
+use crate::core_crypto::prelude::{
+    glwe_ciphertext_size, CiphertextModulus, Container, GlweCiphertext, GlweCiphertextCount,
+    GlweDimension, PolynomialSize, UnsignedInteger,
+};
+
+/// A structure representing a GLWE ciphertext with 64 bits of precision on the GPU.
+#[derive(Debug)]
+pub struct CudaGlweCiphertext<T: UnsignedInteger>(pub(crate) CudaGlweList<T>);
+
+#[allow(dead_code)]
+impl<T: UnsignedInteger> CudaGlweCiphertext<T> {
+    pub fn new(
+        glwe_dimension: GlweDimension,
+        polynomial_size: PolynomialSize,
+        ciphertext_modulus: CiphertextModulus<T>,
+        streams: &CudaStreams,
+    ) -> Self {
+        // Allocate memory on the device
+        let d_vec = CudaVec::new(
+            glwe_ciphertext_size(glwe_dimension.to_glwe_size(), polynomial_size),
+            streams,
+            0,
+        );
+        let cuda_glwe_list = CudaGlweList {
+            d_vec,
+            glwe_ciphertext_count: GlweCiphertextCount(1),
+            glwe_dimension,
+            polynomial_size,
+            ciphertext_modulus,
+        };
+
+        Self(cuda_glwe_list)
+    }
+
+    pub(crate) fn to_glwe_ciphertext(&self, streams: &CudaStreams) -> GlweCiphertext<Vec<T>> {
+        let glwe_ct_size =
+            glwe_ciphertext_size(self.0.glwe_dimension.to_glwe_size(), self.0.polynomial_size);
+        let mut container: Vec<T> = vec![T::ZERO; glwe_ct_size];
+
+        unsafe {
+            self.0
+                .d_vec
+                .copy_to_cpu_async(container.as_mut_slice(), streams, 0);
+            streams.synchronize();
+        }
+
+        GlweCiphertext::from_container(
+            container,
+            self.polynomial_size(),
+            self.ciphertext_modulus(),
+        )
+    }
+
+    pub fn from_glwe_ciphertext<C: Container<Element = T>>(
+        h_ct: &GlweCiphertext<C>,
+        streams: &CudaStreams,
+    ) -> Self {
+        let glwe_dimension = h_ct.glwe_size().to_glwe_dimension();
+        let glwe_ciphertext_count = GlweCiphertextCount(1);
+        let polynomial_size = h_ct.polynomial_size();
+        let ciphertext_modulus = h_ct.ciphertext_modulus();
+
+        let mut d_vec = CudaVec::new(
+            glwe_ciphertext_size(glwe_dimension.to_glwe_size(), polynomial_size)
+                * glwe_ciphertext_count.0,
+            streams,
+            0,
+        );
+
+        // Copy to the GPU
+        let h_input = h_ct.as_view().into_container();
+        unsafe {
+            d_vec.copy_from_cpu_async(h_input.as_ref(), streams, 0);
+        }
+        streams.synchronize();
+
+        let cuda_glwe_list = CudaGlweList {
+            d_vec,
+            glwe_ciphertext_count,
+            glwe_dimension,
+            polynomial_size,
+            ciphertext_modulus,
+        };
+
+        Self(cuda_glwe_list)
+    }
+
+    pub(crate) fn glwe_dimension(&self) -> GlweDimension {
+        self.0.glwe_dimension
+    }
+
+    pub(crate) fn polynomial_size(&self) -> PolynomialSize {
+        self.0.polynomial_size
+    }
+
+    pub(crate) fn ciphertext_modulus(&self) -> CiphertextModulus<T> {
+        self.0.ciphertext_modulus
+    }
+}
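A round-trip sketch for the new entity (not part of the diff; since `to_glwe_ciphertext` is `pub(crate)`, this only compiles inside the crate, e.g. as a unit test, and the parameter values are illustrative):

```rust
use crate::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext;
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::{CiphertextModulus, GlweCiphertext, GlweSize, PolynomialSize};

#[test]
fn glwe_gpu_round_trip() {
    let streams = CudaStreams::new_single_gpu(0);

    // A trivial (all-zero) GLWE ciphertext with k = 2, N = 1024.
    let glwe = GlweCiphertext::new(
        0u64,
        GlweSize(3),
        PolynomialSize(1024),
        CiphertextModulus::new_native(),
    );

    // CPU -> GPU -> CPU must be lossless.
    let d_glwe = CudaGlweCiphertext::from_glwe_ciphertext(&glwe, &streams);
    let round_tripped = d_glwe.to_glwe_ciphertext(&streams);
    assert_eq!(glwe, round_tripped);
}
```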
diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext.rs b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext.rs
new file mode 100644
index 0000000000..969062a687
--- /dev/null
+++ b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext.rs
@@ -0,0 +1,147 @@
+use crate::core_crypto::gpu::vec::CudaVec;
+use crate::core_crypto::gpu::{CudaLweList, CudaStreams};
+use crate::core_crypto::prelude::{
+    CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList,
+    LweDimension, LweSize, UnsignedInteger,
+};
+
+/// A structure representing an LWE ciphertext with 64 bits of precision on the GPU.
+#[derive(Debug)]
+pub struct CudaLweCiphertext<T: UnsignedInteger>(pub(crate) CudaLweList<T>);
+
+#[allow(dead_code)]
+impl<T: UnsignedInteger> CudaLweCiphertext<T> {
+    pub fn new(
+        lwe_dimension: LweDimension,
+        ciphertext_modulus: CiphertextModulus<T>,
+        streams: &CudaStreams,
+    ) -> Self {
+        // Allocate memory on the GPU
+        let d_vec = unsafe { CudaVec::new_async(lwe_dimension.to_lwe_size().0, streams, 0) };
+        streams.synchronize();
+
+        let cuda_lwe_list = CudaLweList {
+            d_vec,
+            lwe_ciphertext_count: LweCiphertextCount(1),
+            lwe_dimension,
+            ciphertext_modulus,
+        };
+
+        Self(cuda_lwe_list)
+    }
+
+    pub fn from_cuda_vec(d_vec: CudaVec<T>, ciphertext_modulus: CiphertextModulus<T>) -> Self {
+        let lwe_dimension = LweSize(d_vec.len()).to_lwe_dimension();
+        let cuda_lwe_list = CudaLweList {
+            d_vec,
+            lwe_ciphertext_count: LweCiphertextCount(1),
+            lwe_dimension,
+            ciphertext_modulus,
+        };
+        Self(cuda_lwe_list)
+    }
+
+    pub fn to_lwe_ciphertext_list(&self, streams: &CudaStreams) -> LweCiphertextList<Vec<T>> {
+        let lwe_ct_size = self.0.lwe_dimension.to_lwe_size().0;
+        let mut container: Vec<T> = vec![T::ZERO; lwe_ct_size];
+
+        unsafe {
+            self.0
+                .d_vec
+                .copy_to_cpu_async(container.as_mut_slice(), streams, 0);
+        }
+        streams.synchronize();
+
+        LweCiphertextList::from_container(
+            container,
+            self.lwe_dimension().to_lwe_size(),
+            self.ciphertext_modulus(),
+        )
+    }
+
+    pub fn from_lwe_ciphertext<C: Container<Element = T>>(
+        h_ct: &LweCiphertext<C>,
+        streams: &CudaStreams,
+    ) -> Self {
+        let lwe_dimension = h_ct.lwe_size().to_lwe_dimension();
+        let lwe_ciphertext_count = LweCiphertextCount(1);
+        let ciphertext_modulus = h_ct.ciphertext_modulus();
+
+        // Copy to the GPU
+        let mut d_vec = CudaVec::new(lwe_dimension.to_lwe_size().0, streams, 0);
+        unsafe {
+            d_vec.copy_from_cpu_async(h_ct.as_ref(), streams, 0);
+        }
+        streams.synchronize();
+
+        let cuda_lwe_list = CudaLweList {
+            d_vec,
+            lwe_ciphertext_count,
+            lwe_dimension,
+            ciphertext_modulus,
+        };
+        Self(cuda_lwe_list)
+    }
+
+    pub fn into_lwe_ciphertext(&self, streams: &CudaStreams) -> LweCiphertext<Vec<T>> {
+        let lwe_ct_size = self.0.lwe_dimension.to_lwe_size().0;
+        let mut container: Vec<T> = vec![T::ZERO; lwe_ct_size];
+
+        unsafe {
+            self.0
+                .d_vec
+                .copy_to_cpu_async(container.as_mut_slice(), streams, 0);
+        }
+        streams.synchronize();
+
+        LweCiphertext::from_container(container, self.ciphertext_modulus())
+    }
+
+    /// ```rust
+    /// use tfhe::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext;
+    /// use tfhe::core_crypto::gpu::CudaStreams;
+    /// use tfhe::core_crypto::prelude::{CiphertextModulus, LweCiphertext, LweSize};
+    ///
+    /// let streams = CudaStreams::new_single_gpu(0);
+    ///
+    /// let lwe_size = LweSize(743);
+    /// let ciphertext_modulus = CiphertextModulus::new_native();
+    ///
+    /// // Create a new LweCiphertext
+    /// let lwe = LweCiphertext::new(0u64, lwe_size, ciphertext_modulus);
+    ///
+    /// // Copy to the GPU, duplicate on the GPU, then copy back
+    /// let d_lwe = CudaLweCiphertext::from_lwe_ciphertext(&lwe, &streams);
+    /// let d_lwe_copied = d_lwe.duplicate(&streams);
+    ///
+    /// let lwe_copied = d_lwe_copied.into_lwe_ciphertext(&streams);
+    ///
+    /// assert_eq!(lwe, lwe_copied);
+    /// ```
+    pub fn duplicate(&self, streams: &CudaStreams) -> Self {
+        let lwe_dimension = self.lwe_dimension();
+        let ciphertext_modulus = self.ciphertext_modulus();
+
+        // Copy to the GPU
+        let d_vec = unsafe { self.0.d_vec.duplicate(streams, 0) };
+
+        let cuda_lwe_list = CudaLweList {
+            d_vec,
+            lwe_ciphertext_count: LweCiphertextCount(1),
+            lwe_dimension,
+            ciphertext_modulus,
+        };
+        Self(cuda_lwe_list)
+    }
+
+    pub(crate) fn lwe_dimension(&self) -> LweDimension {
+        self.0.lwe_dimension
+    }
+
+    pub(crate) fn ciphertext_modulus(&self) -> CiphertextModulus<T> {
+        self.0.ciphertext_modulus
+    }
+}
diff --git a/tfhe/src/core_crypto/gpu/entities/mod.rs b/tfhe/src/core_crypto/gpu/entities/mod.rs
index 34dcb5f03e..38a3e27bab 100644
--- a/tfhe/src/core_crypto/gpu/entities/mod.rs
+++ b/tfhe/src/core_crypto/gpu/entities/mod.rs
@@ -1,5 +1,7 @@
+pub mod glwe_ciphertext;
 pub mod glwe_ciphertext_list;
 pub mod lwe_bootstrap_key;
+pub mod lwe_ciphertext;
 pub mod lwe_ciphertext_list;
 pub mod lwe_keyswitch_key;
 pub mod lwe_multi_bit_bootstrap_key;
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 5a54d2a569..22edcfba73 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -341,6 +341,33 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async(
     );
 }
 
+/// Extract an LWE sample from a GLWE ciphertext
+///
+/// # Safety
+///
+/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is required
+pub unsafe fn extract_lwe_sample_from_glwe_ciphertext_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    lwe_array_out: &mut CudaVec<T>,
+    glwe_array_in: &CudaVec<T>,
+    nth_array: &CudaVec<u32>,
+    num_samples: LweCiphertextCount,
+    glwe_dimension: GlweDimension,
+    polynomial_size: PolynomialSize,
+) {
+    cuda_glwe_sample_extract_64(
+        streams.ptr.as_ptr(),
+        streams.gpu_indexes.as_ptr(),
+        streams.len() as u32,
+        lwe_array_out.as_mut_c_ptr(0),
+        glwe_array_in.as_c_ptr(0),
+        nth_array.as_c_ptr(0) as *const u32,
+        num_samples.0 as u32,
+        glwe_dimension.0 as u32,
+        polynomial_size.0 as u32,
+    );
+}
+
 /// Discarding addition of a vector of LWE ciphertexts
 ///
 /// # Safety
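As with the other `_async` bindings in this module, the call above only enqueues work on the stream, so every device buffer must stay alive until the stream is synchronized. A sketch of the raw calling pattern (not part of the diff; it is essentially what the safe wrapper in `glwe_sample_extraction.rs` does, with `d_lwe_vec`, `d_glwe_vec`, `glwe_dimension`, and `polynomial_size` as assumed context):

```rust
// Assumed context: `d_lwe_vec` and `d_glwe_vec` are CudaVec<u64> buffers sized
// for one LWE / one GLWE ciphertext, and `streams` is a CudaStreams.
let nth_array: Vec<u32> = vec![0];
unsafe {
    let d_nth_array = CudaVec::from_cpu_async(&nth_array, &streams, streams.gpu_indexes[0]);
    extract_lwe_sample_from_glwe_ciphertext_async(
        &streams,
        &mut d_lwe_vec,
        &d_glwe_vec,
        &d_nth_array,
        LweCiphertextCount(1),
        glwe_dimension,
        polynomial_size,
    );
    // The call only enqueues the kernel: synchronize before reading
    // `d_lwe_vec` and before `d_nth_array` goes out of scope.
    streams.synchronize();
}
```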
diff --git a/tfhe/src/core_crypto/gpu/vec.rs b/tfhe/src/core_crypto/gpu/vec.rs
index e86195a9c3..bc9cd24be3 100644
--- a/tfhe/src/core_crypto/gpu/vec.rs
+++ b/tfhe/src/core_crypto/gpu/vec.rs
@@ -345,6 +345,15 @@ impl<T: Numeric> CudaVec<T> {
         }
     }
 
+    pub unsafe fn duplicate(&self, streams: &CudaStreams, gpu_index: u32) -> Self {
+        // Copy to the GPU
+        let mut d_vec = CudaVec::new_async(self.len, streams, gpu_index);
+        d_vec.copy_from_gpu_async(self, streams, gpu_index);
+        streams.synchronize();
+
+        d_vec
+    }
+
     #[allow(clippy::needless_pass_by_ref_mut)]
     pub(crate) fn as_mut_c_ptr(&mut self, gpu_index: u32) -> *mut c_void {
         self.ptr[gpu_index as usize]