diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index 670b9f46c0..f7ee72f9db 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -35,8 +35,11 @@ enum COMPARISON_TYPE { MAX = 6, MIN = 7, }; + enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 }; +enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 }; + extern "C" { void scratch_cuda_apply_univariate_lut_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, @@ -355,6 +358,24 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64( void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void); +void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory); + +void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs, + void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr, + void **bsks, void **ksks, uint32_t num_blocks_in_radix); + +void cleanup_signed_overflowing_add_or_sub(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); } // extern C template <typename Torus> @@ -2884,6 +2905,232 @@ template <typename Torus> struct int_div_rem_memory { } }; +template <typename Torus> struct int_last_block_inner_propagate_memory { + + int_radix_lut<Torus> *last_block_inner_propagation_lut; + int_radix_params params; + + int_last_block_inner_propagate_memory( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_radix_params params, SIGNED_OPERATION op, uint32_t num_radix_blocks, + bool allocate_gpu_memory) { + + this->params = params; + auto message_modulus = params.message_modulus; + uint32_t bits_of_message = + static_cast<uint32_t>(std::log2(params.message_modulus)); + Torus message_bit_mask = (1 << bits_of_message) - 1; + + // declare lambda function for last_block_inner_propagation_lut generation + auto f_last_block_inner_propagation_lut = + [op, message_modulus, message_bit_mask, + bits_of_message](Torus lhs_block, Torus rhs_block) -> Torus { + Torus local_rhs_block = 0; + if (op == SIGNED_OPERATION::SUBTRACTION) { + Torus flipped_rhs = ~rhs_block; + local_rhs_block = (flipped_rhs << 1) & message_bit_mask; + } else { + local_rhs_block = (rhs_block << 1) & message_bit_mask; + }; + + Torus local_lhs_block = (lhs_block << 1) & message_bit_mask; + + // whole_result contains the result of addition with + // the carry being in the first bit of carry space + // the message space contains the message, but with one 0 + // on the right (lsb) + Torus whole_result = local_lhs_block + local_rhs_block; + Torus carry = whole_result >> bits_of_message; + Torus result = (whole_result & message_bit_mask) >> 1; + Torus propagation_result = 0; + if (carry == 1) { + // Addition of bits before last one generates a carry + propagation_result = OUTPUT_CARRY::GENERATED; + } else if (result == ((message_modulus - 1) >> 1)) { + // Addition of bits before last one puts the bits + // in a state that makes it so
that an input carry into last block + // gets propagated to last bit. + propagation_result = OUTPUT_CARRY::PROPAGATED; + } else { + propagation_result = OUTPUT_CARRY::NONE; + }; + + // Shift the propagation result in carry part + // to have less noise growth later + return propagation_result << bits_of_message; + }; + + last_block_inner_propagation_lut = new int_radix_lut<Torus>( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + generate_device_accumulator_bivariate<Torus>( + streams[0], gpu_indexes[0], + last_block_inner_propagation_lut->get_lut(gpu_indexes[0], 0), + params.glwe_dimension, params.polynomial_size, message_modulus, + params.carry_modulus, f_last_block_inner_propagation_lut); + last_block_inner_propagation_lut->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + } + + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + last_block_inner_propagation_lut->release(streams, gpu_indexes, gpu_count); + delete last_block_inner_propagation_lut; + } +}; + +template <typename Torus> struct int_resolve_signed_overflow_memory { + + int_radix_lut<Torus> *resolve_overflow_lut; + int_radix_params params; + + Torus *x; + + int_resolve_signed_overflow_memory(cudaStream_t *streams, + uint32_t *gpu_indexes, uint32_t gpu_count, + int_radix_params params, + bool allocate_gpu_memory) { + + this->params = params; + + auto message_modulus = params.message_modulus; + + uint32_t bits_of_message = + static_cast<uint32_t>(std::log2(message_modulus)); + + x = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + + // declare lambda function for resolve_overflow_lut generation + auto f_resolve_overflow_lut = [bits_of_message](Torus x) -> Torus { + Torus carry_propagation = x >> bits_of_message; + Torus output_carry_of_block = (x >> 1) & 1; + Torus input_carry_of_block = x & 1; + + // Resolve the carry that the last bit actually receives as input + Torus input_carry_to_last_bit; + if (carry_propagation == OUTPUT_CARRY::PROPAGATED) { + input_carry_to_last_bit = input_carry_of_block; + } else if (carry_propagation == OUTPUT_CARRY::GENERATED) { + input_carry_to_last_bit = 1; + } else { + input_carry_to_last_bit = 0; + }; + + return input_carry_to_last_bit != output_carry_of_block; + }; + + resolve_overflow_lut = new int_radix_lut<Torus>( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + generate_device_accumulator<Torus>( + streams[0], gpu_indexes[0], + resolve_overflow_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension, + params.polynomial_size, message_modulus, params.carry_modulus, + f_resolve_overflow_lut); + resolve_overflow_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + resolve_overflow_lut->release(streams, gpu_indexes, gpu_count); + delete resolve_overflow_lut; + cuda_drop_async(x, streams[0], gpu_indexes[0]); + } +}; + +template <typename Torus> struct int_signed_overflowing_add_or_sub_memory { + int_radix_params params; + + // memory objects for other operations + int_sc_prop_memory<Torus> *scp_mem; + int_last_block_inner_propagate_memory<Torus> *las_block_prop_mem; + int_resolve_signed_overflow_memory<Torus> *resolve_overflow_mem; + // lookup tables + + // sub streams + cudaStream_t *sub_streams_1; + cudaStream_t *sub_streams_2; + + // temporary device buffers + Torus *result; // num_blocks + Torus *input_carries; // num_blocks + Torus *output_carry; // single block + Torus *last_block_inner_propagation; // single block + + // allocate temporary
arrays used to calculate + // cuda integer signed overflowing add or sub + void allocate_temporary_buffers(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, uint32_t num_blocks) { + uint32_t big_lwe_size = params.big_lwe_dimension + 1; + + result = (Torus *)cuda_malloc_async( + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + + input_carries = (Torus *)cuda_malloc_async( + big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + output_carry = (Torus *)cuda_malloc_async(big_lwe_size * sizeof(Torus), + streams[0], gpu_indexes[0]); + last_block_inner_propagation = (Torus *)cuda_malloc_async( + big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); + } + + // constructor without memory reuse + int_signed_overflowing_add_or_sub_memory( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op, + bool allocate_gpu_memory) { + this->params = params; + + allocate_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks); + + // initialize streams + sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < gpu_count; j++) { + sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); + sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); + } + + // initialize memory objects for other operations + scp_mem = + new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, + num_blocks, allocate_gpu_memory); + las_block_prop_mem = new int_last_block_inner_propagate_memory( + streams, gpu_indexes, gpu_count, params, op, num_blocks, + allocate_gpu_memory); + + resolve_overflow_mem = new int_resolve_signed_overflow_memory( + streams, gpu_indexes, gpu_count, params, allocate_gpu_memory); + } + + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + // memory objects for other operations + scp_mem->release(streams, gpu_indexes, gpu_count); + las_block_prop_mem->release(streams, gpu_indexes, gpu_count); + resolve_overflow_mem->release(streams, gpu_indexes, gpu_count); + + delete scp_mem; + delete las_block_prop_mem; + delete resolve_overflow_mem; + + // temporary device buffers + cuda_drop_async(result, streams[0], gpu_indexes[0]); + cuda_drop_async(input_carries, streams[0], gpu_indexes[0]); + cuda_drop_async(output_carry, streams[0], gpu_indexes[0]); + cuda_drop_async(last_block_inner_propagation, streams[0], gpu_indexes[0]); + + // sub streams + for (uint i = 0; i < gpu_count; i++) { + cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); + cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); + } + free(sub_streams_1); + free(sub_streams_2); + } +}; template struct int_bitop_buffer { int_radix_params params; @@ -2981,8 +3228,8 @@ template struct int_scalar_mul_buffer { size_t num_ciphertext_bits = msg_bits * num_radix_blocks; //// Contains all shifted values of lhs for shift in range (0..msg_bits) - //// The idea is that with these we can create all other shift that are in - //// range (0..total_bits) for free (block rotation) + //// The idea is that with these we can create all other shift that are + /// in / range (0..total_bits) for free (block rotation) preshifted_buffer = (Torus *)cuda_malloc_async( num_ciphertext_bits * lwe_size_bytes, streams[0], gpu_indexes[0]); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu new file mode 
100644 index 0000000000..cd78bf7b6d --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu @@ -0,0 +1,93 @@ +#include "integer/addition.cuh" + +void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory) { + + SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION + : SIGNED_OPERATION::SUBTRACTION; + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus); + + scratch_cuda_integer_signed_overflowing_add_or_sub_kb( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_signed_overflowing_add_or_sub_memory **)mem_ptr, + num_blocks, op, params, allocate_gpu_memory); +} + +void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs, + void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr, + void **bsks, void **ksks, uint32_t num_blocks) { + + auto mem = (int_signed_overflowing_add_or_sub_memory *)mem_ptr; + SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION + : SIGNED_OPERATION::SUBTRACTION; + + switch (mem->params.polynomial_size) { + case 512: + host_integer_signed_overflowing_add_or_sub_kb>( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lhs), static_cast(rhs), + static_cast(overflowed), op, bsks, (uint64_t **)(ksks), mem, + num_blocks); + break; + case 1024: + + host_integer_signed_overflowing_add_or_sub_kb>( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lhs), static_cast(rhs), + static_cast(overflowed), op, bsks, (uint64_t **)(ksks), mem, + num_blocks); + break; + case 2048: + host_integer_signed_overflowing_add_or_sub_kb>( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lhs), static_cast(rhs), + static_cast(overflowed), op, bsks, (uint64_t **)(ksks), mem, + num_blocks); + break; + case 4096: + host_integer_signed_overflowing_add_or_sub_kb>( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lhs), static_cast(rhs), + static_cast(overflowed), op, bsks, (uint64_t **)(ksks), mem, + num_blocks); + break; + case 8192: + host_integer_signed_overflowing_add_or_sub_kb>( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lhs), static_cast(rhs), + static_cast(overflowed), op, bsks, (uint64_t **)(ksks), mem, + num_blocks); + break; + case 16384: + host_integer_signed_overflowing_add_or_sub_kb>( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lhs), static_cast(rhs), + static_cast(overflowed), op, bsks, (uint64_t **)(ksks), mem, + num_blocks); + break; + default: + PANIC("Cuda error (integer signed_overflowing_add_or_sub): unsupported " + "polynomial size. 
" + "Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported") + } +} + +void cleanup_signed_overflowing_add_or_sub(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void) { + int_signed_overflowing_add_or_sub_memory *mem_ptr = + (int_signed_overflowing_add_or_sub_memory *)(*mem_ptr_void); + + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh new file mode 100644 index 0000000000..5feef93e6d --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh @@ -0,0 +1,146 @@ +#ifndef TFHE_RS_ADDITION_CUH +#define TFHE_RS_ADDITION_CUH + +#include "crypto/keyswitch.cuh" +#include "device.h" +#include "integer.h" +#include "integer/comparison.cuh" +#include "integer/integer.cuh" +#include "integer/negation.cuh" +#include "integer/scalar_shifts.cuh" +#include "linear_algebra.h" +#include "programmable_bootstrap.h" +#include "utils/helper.cuh" +#include "utils/kernel_dimensions.cuh" +#include +#include +#include +#include +#include +#include + +template +void host_resolve_signed_overflow( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *result, Torus *last_block_inner_propagation, + Torus *last_block_input_carry, Torus *last_block_output_carry, + int_resolve_signed_overflow_memory *mem, void **bsks, Torus **ksks) { + + auto x = mem->x; + + Torus *d_clears = + (Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]); + + cuda_set_value_async(streams[0], gpu_indexes[0], d_clears, 2, 1); + + // replace with host function call + cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( + streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears, + mem->params.big_lwe_dimension, 1); + + host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, + last_block_inner_propagation, x, mem->params.big_lwe_dimension, + 1); + host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, + last_block_inner_propagation, last_block_input_carry, + mem->params.big_lwe_dimension, 1); + + host_apply_univariate_lut_kb(streams, gpu_indexes, gpu_count, result, + last_block_inner_propagation, + mem->resolve_overflow_lut, ksks, bsks, 1); + + cuda_drop_async(d_clears, streams[0], gpu_indexes[0]); +} + +template +__host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_signed_overflowing_add_or_sub_memory **mem_ptr, + uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params, + bool allocate_gpu_memory) { + + *mem_ptr = new int_signed_overflowing_add_or_sub_memory( + streams, gpu_indexes, gpu_count, params, num_blocks, op, + allocate_gpu_memory); +} + +/* + * Addition - signed_operation = 1 + * Subtraction - signed_operation = -1 + */ +template +__host__ void host_integer_signed_overflowing_add_or_sub_kb( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *lhs, Torus *rhs, Torus *overflowed, SIGNED_OPERATION op, void **bsks, + uint64_t **ksks, + int_signed_overflowing_add_or_sub_memory *mem_ptr, + uint32_t num_blocks) { + + auto radix_params = mem_ptr->params; + + uint32_t big_lwe_dimension = radix_params.big_lwe_dimension; + uint32_t big_lwe_size = big_lwe_dimension + 1; + uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + assert(radix_params.message_modulus >= 4 && radix_params.carry_modulus >= 4); + + auto result = mem_ptr->result; + 
auto input_carries = mem_ptr->input_carries; + auto output_carry = mem_ptr->output_carry; + auto last_block_inner_propagation = mem_ptr->last_block_inner_propagation; + + cuda_memcpy_async_gpu_to_gpu(result, lhs, num_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + // phase 1 + if (op == SIGNED_OPERATION::ADDITION) { + host_addition(streams[0], gpu_indexes[0], result, lhs, rhs, + big_lwe_dimension, num_blocks); + } else { + host_subtraction(streams[0], gpu_indexes[0], result, lhs, rhs, + big_lwe_dimension, num_blocks); + } + + // phase 2 + for (uint j = 0; j < gpu_count; j++) { + cuda_synchronize_stream(streams[j], gpu_indexes[j]); + } + +#pragma omp parallel sections + { + // generate input_carries and output_carry +#pragma omp section + { + host_propagate_single_carry( + mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry, + input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks); + } + + // generate generate_last_block_inner_propagation +#pragma omp section + { + host_generate_last_block_inner_propagation( + mem_ptr->sub_streams_2, gpu_indexes, gpu_count, + last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size], + &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, + bsks, ksks); + } + } + + for (uint j = 0; j < gpu_count; j++) { + cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); + cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); + } + + // phase 3 + auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size]; + + host_resolve_signed_overflow( + streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation, + input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks); + + cuda_memcpy_async_gpu_to_gpu(lhs, result, num_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); +} + +#endif // TFHE_RS_ADDITION_CUH diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 77b9a77178..99ae8e0bba 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -496,6 +496,19 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, num_blocks, message_acc); } +template +void host_generate_last_block_inner_propagation( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *last_block_inner_propagation, Torus *lhs, Torus *rhs, + int_last_block_inner_propagate_memory *mem, void **bsks, + Torus **ksks) { + + integer_radix_apply_bivariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, last_block_inner_propagation, lhs, rhs, + bsks, ksks, 1, mem->last_block_inner_propagation_lut, + mem->params.message_modulus); +} + template void host_propagate_single_sub_borrow(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 621b0ba45d..e36e3c94de 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -1192,4 +1192,46 @@ extern "C" { mem_ptr: *mut *mut i8, ); + pub fn scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + 
grouping_factor: u32, + num_blocks: u32, + signed_operation: i8, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lhs: *mut c_void, + rhs: *const c_void, + overflowed: *mut c_void, + signed_operation: i8, + mem_ptr: *mut i8, + bsks: *const *mut c_void, + ksks: *const *mut c_void, + num_blocks: u32, + ); + + pub fn cleanup_signed_overflowing_add_or_sub( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); } // extern "C" diff --git a/tfhe/benches/integer/signed_bench.rs b/tfhe/benches/integer/signed_bench.rs index 306e84a612..1ff428d87c 100644 --- a/tfhe/benches/integer/signed_bench.rs +++ b/tfhe/benches/integer/signed_bench.rs @@ -1683,6 +1683,11 @@ mod cuda { display_name: max ); + define_cuda_server_key_bench_clean_input_signed_fn!( + method_name: unchecked_signed_overflowing_add, + display_name: overflowing_add + ); + define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_add, display_name: add, @@ -1895,6 +1900,11 @@ mod cuda { display_name: max ); + define_cuda_server_key_bench_clean_input_signed_fn!( + method_name: signed_overflowing_add, + display_name: overflowing_add + ); + define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_add, display_name: add, @@ -2025,6 +2035,7 @@ mod cuda { cuda_unchecked_le, cuda_unchecked_min, cuda_unchecked_max, + cuda_unchecked_signed_overflowing_add, ); criterion_group!( @@ -2072,6 +2083,7 @@ mod cuda { cuda_min, cuda_max, cuda_if_then_else, + cuda_signed_overflowing_add, ); criterion_group!( diff --git a/tfhe/src/high_level_api/integers/signed/overflowing_ops.rs b/tfhe/src/high_level_api/integers/signed/overflowing_ops.rs index 3ae8d62492..6656248a5a 100644 --- a/tfhe/src/high_level_api/integers/signed/overflowing_ops.rs +++ b/tfhe/src/high_level_api/integers/signed/overflowing_ops.rs @@ -1,5 +1,7 @@ use crate::core_crypto::prelude::SignedNumeric; use crate::high_level_api::global_state; +#[cfg(feature = "gpu")] +use crate::high_level_api::global_state::with_thread_local_cuda_streams; use crate::high_level_api::integers::FheIntId; use crate::high_level_api::keys::InternalServerKey; use crate::integer::block_decomposition::DecomposableInto; @@ -48,9 +50,14 @@ where (FheInt::new(result), FheBool::new(overflow)) } #[cfg(feature = "gpu")] - InternalServerKey::Cuda(_) => { - todo!("Cuda devices do not support signed integer"); - } + InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| { + let (result, overflow) = cuda_key.key.signed_overflowing_add( + &self.ciphertext.on_gpu(), + &other.ciphertext.on_gpu(), + streams, + ); + (FheInt::new(result), FheBool::new(overflow)) + }), }) } } diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index 1c8e307c2d..654ddce257 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -2339,3 +2339,76 @@ pub unsafe fn unchecked_unsigned_div_rem_integer_radix_kb_assign_async< std::ptr::addr_of_mut!(mem_ptr), ); } + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub unsafe fn unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async< + T: UnsignedInteger, + B: Numeric, +>( + streams: &CudaStreams, + lhs: &mut CudaVec, + rhs: 
&CudaVec, + overflowed: &mut CudaVec, + signed_operation: i8, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + num_blocks: u32, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, +) { + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_blocks, + signed_operation, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + ); + cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + lhs.as_mut_c_ptr(0), + rhs.as_c_ptr(0), + overflowed.as_mut_c_ptr(0), + signed_operation, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + num_blocks, + ); + cleanup_signed_overflowing_add_or_sub( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} diff --git a/tfhe/src/integer/gpu/server_key/radix/add.rs b/tfhe/src/integer/gpu/server_key/radix/add.rs index d59b9d9197..826f29acc1 100644 --- a/tfhe/src/integer/gpu/server_key/radix/add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/add.rs @@ -2,12 +2,16 @@ use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; use crate::core_crypto::gpu::CudaStreams; use crate::core_crypto::prelude::LweBskGroupingFactor; use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; -use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext}; +use crate::integer::gpu::ciphertext::{ + CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext, +}; use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; use crate::integer::gpu::{ unchecked_add_integer_radix_assign_async, + unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async, unchecked_sum_ciphertexts_integer_radix_kb_assign_async, PBSType, }; +use crate::integer::server_key::radix_parallel::sub::SignedOperation; use crate::shortint::ciphertext::NoiseLevel; impl CudaServerKey { @@ -491,4 +495,240 @@ impl CudaServerKey { (ct_res, ct_overflowed) } + + /// ```rust + /// use tfhe::core_crypto::gpu::CudaStreams; + /// use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext}; + /// use tfhe::integer::gpu::gen_keys_radix_gpu; + /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; + /// + /// let gpu_index = 0; + /// let streams = CudaStreams::new_single_gpu(gpu_index); + /// + /// // Generate the client key and the server key: + /// let num_blocks = 4; + /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, num_blocks, &streams); + /// let total_bits = num_blocks * cks.parameters().message_modulus().0.ilog2() as usize; + /// let modulus = 1 << total_bits; + /// + /// 
let msg1: i8 = 120; + /// let msg2: i8 = 8; + /// + /// let ct1 = cks.encrypt_signed(msg1); + /// let ct2 = cks.encrypt_signed(msg2); + /// + /// // Copy to GPU + /// let d_ct1 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct1, &streams); + /// let d_ct2 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct2, &streams); + /// + /// // Compute homomorphically an overflowing addition: + /// let (d_ct_res, d_ct_overflowed) = sks.signed_overflowing_add(&d_ct1, &d_ct2, &streams); + /// + /// let ct_res = d_ct_res.to_signed_radix_ciphertext(&streams); + /// let ct_overflowed = d_ct_overflowed.to_boolean_block(&streams); + /// + /// // Decrypt: + /// let dec_result: i8 = cks.decrypt_signed(&ct_res); + /// let dec_overflowed: bool = cks.decrypt_bool(&ct_overflowed); + /// let (clear_result, clear_overflowed) = msg1.overflowing_add(msg2); + /// assert_eq!(dec_result, clear_result); + /// assert_eq!(dec_overflowed, clear_overflowed); + /// ``` + pub fn signed_overflowing_add( + &self, + ct_left: &CudaSignedRadixCiphertext, + ct_right: &CudaSignedRadixCiphertext, + stream: &CudaStreams, + ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) { + let mut tmp_lhs; + let mut tmp_rhs; + let (lhs, rhs) = match ( + ct_left.block_carries_are_empty(), + ct_right.block_carries_are_empty(), + ) { + (true, true) => (ct_left, ct_right), + (true, false) => { + unsafe { + tmp_rhs = ct_right.duplicate_async(stream); + self.full_propagate_assign_async(&mut tmp_rhs, stream); + } + (ct_left, &tmp_rhs) + } + (false, true) => { + unsafe { + tmp_lhs = ct_left.duplicate_async(stream); + self.full_propagate_assign_async(&mut tmp_lhs, stream); + } + (&tmp_lhs, ct_right) + } + (false, false) => { + unsafe { + tmp_lhs = ct_left.duplicate_async(stream); + tmp_rhs = ct_right.duplicate_async(stream); + + self.full_propagate_assign_async(&mut tmp_lhs, stream); + self.full_propagate_assign_async(&mut tmp_rhs, stream); + } + + (&tmp_lhs, &tmp_rhs) + } + }; + + self.unchecked_signed_overflowing_add(lhs, rhs, stream) + } + + pub fn unchecked_signed_overflowing_add( + &self, + ct_left: &CudaSignedRadixCiphertext, + ct_right: &CudaSignedRadixCiphertext, + stream: &CudaStreams, + ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) { + assert_eq!( + ct_left.as_ref().d_blocks.lwe_ciphertext_count().0, + ct_right.as_ref().d_blocks.lwe_ciphertext_count().0, + "lhs and rhs must have the name number of blocks ({} vs {})", + ct_left.as_ref().d_blocks.lwe_ciphertext_count().0, + ct_right.as_ref().d_blocks.lwe_ciphertext_count().0 + ); + assert!( + ct_left.as_ref().d_blocks.lwe_ciphertext_count().0 > 0, + "inputs cannot be empty" + ); + + self.unchecked_signed_overflowing_add_or_sub( + ct_left, + ct_right, + SignedOperation::Addition, + stream, + ) + } + + pub(crate) fn unchecked_signed_overflowing_add_or_sub( + &self, + lhs: &CudaSignedRadixCiphertext, + rhs: &CudaSignedRadixCiphertext, + signed_operation: SignedOperation, + streams: &CudaStreams, + ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) { + assert!(self.message_modulus.0 >= 4 && self.carry_modulus.0 >= 4); + + let mut result: CudaSignedRadixCiphertext; + unsafe { + result = lhs.duplicate_async(streams); + } + let carry_out: CudaSignedRadixCiphertext = self.create_trivial_zero_radix(1, streams); + let mut overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(carry_out.ciphertext); + + unsafe { + self.unchecked_signed_overflowing_add_or_sub_assign_async( + &mut result, + rhs, + &mut overflowed, + signed_operation, + streams, + ); + streams.synchronize(); + } + + 
(result, overflowed) + } + + /// # Safety + /// + /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until stream is synchronized + pub(crate) unsafe fn unchecked_signed_overflowing_add_or_sub_assign_async( + &self, + lhs: &mut CudaSignedRadixCiphertext, + rhs: &CudaSignedRadixCiphertext, + overflowed: &mut CudaBooleanBlock, + signed_operation: SignedOperation, + streams: &CudaStreams, + ) { + if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + { + overflowed.as_mut().ciphertext.info = overflowed + .as_ref() + .ciphertext + .info + .boolean_info(NoiseLevel::ZERO); + } else { + overflowed.as_mut().ciphertext.info = overflowed + .as_ref() + .ciphertext + .info + .boolean_info(NoiseLevel::NOMINAL); + } + let num_blocks = lhs.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; + let signed_operation_numeric: i8 = + if matches!(signed_operation, SignedOperation::Subtraction) { + -1 + } else { + 1 + }; + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async( + streams, + &mut lhs.as_mut().d_blocks.0.d_vec, + &rhs.as_ref().d_blocks.0.d_vec, + &mut overflowed.as_mut().ciphertext.d_blocks.0.d_vec, + signed_operation_numeric, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + num_blocks, + PBSType::Classical, + LweBskGroupingFactor(0), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async( + streams, + &mut lhs.as_mut().d_blocks.0.d_vec, + &rhs.as_ref().d_blocks.0.d_vec, + &mut overflowed.as_mut().ciphertext.d_blocks.0.d_vec, + signed_operation_numeric, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + num_blocks, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + ); + } + }; + + lhs.as_mut().info = lhs + .as_ref() + .info + .after_overflowing_add(&rhs.ciphertext.info); + } } diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_signed/mod.rs b/tfhe/src/integer/gpu/server_key/radix/tests_signed/mod.rs index 74368a1ffe..1279e6d727 100644 --- a/tfhe/src/integer/gpu/server_key/radix/tests_signed/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/tests_signed/mod.rs @@ -350,3 +350,42 @@ where d_res.to_signed_radix_ciphertext(&context.streams) } } +impl<'a, F> + FunctionExecutor< + (&'a SignedRadixCiphertext, &'a SignedRadixCiphertext), + (SignedRadixCiphertext, BooleanBlock), + > for GpuFunctionExecutor +where + F: Fn( + &CudaServerKey, + &CudaSignedRadixCiphertext, + &CudaSignedRadixCiphertext, + &CudaStreams, + ) -> (CudaSignedRadixCiphertext, 
CudaBooleanBlock), +{ + fn setup(&mut self, cks: &RadixClientKey, sks: Arc<ServerKey>) { + self.setup_from_keys(cks, &sks); + } + + fn execute( + &mut self, + input: (&'a SignedRadixCiphertext, &'a SignedRadixCiphertext), + ) -> (SignedRadixCiphertext, BooleanBlock) { + let context = self + .context + .as_ref() + .expect("setup was not properly called"); + + let d_ctxt_1: CudaSignedRadixCiphertext = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(input.0, &context.streams); + let d_ctxt_2: CudaSignedRadixCiphertext = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(input.1, &context.streams); + + let (d_res, d_res_bool) = (self.func)(&context.sks, &d_ctxt_1, &d_ctxt_2, &context.streams); + + ( + d_res.to_signed_radix_ciphertext(&context.streams), + d_res_bool.to_boolean_block(&context.streams), + ) + } +} diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_signed/test_add.rs b/tfhe/src/integer/gpu/server_key/radix/tests_signed/test_add.rs index 185a1ae45d..d1ac54a8ec 100644 --- a/tfhe/src/integer/gpu/server_key/radix/tests_signed/test_add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/tests_signed/test_add.rs @@ -3,13 +3,16 @@ use crate::integer::gpu::server_key::radix::tests_unsigned::{ }; use crate::integer::gpu::CudaServerKey; use crate::integer::server_key::radix_parallel::tests_signed::test_add::{ - signed_default_add_test, signed_unchecked_add_test, + signed_default_add_test, signed_unchecked_add_test, signed_unchecked_overflowing_add_test, }; use crate::shortint::parameters::*; create_gpu_parametrized_test!(integer_unchecked_add); create_gpu_parametrized_test!(integer_add); +create_gpu_parametrized_test!(integer_unchecked_signed_overflowing_add); +create_gpu_parametrized_test!(integer_signed_overflowing_add); + fn integer_unchecked_add<P>

(param: P) where P: Into<PBSParameters>, @@ -25,3 +28,19 @@ where let executor = GpuFunctionExecutor::new(&CudaServerKey::add); signed_default_add_test(param, executor); } + +fn integer_unchecked_signed_overflowing_add<P>

(param: P) +where + P: Into<PBSParameters>, +{ + let executor = GpuFunctionExecutor::new(&CudaServerKey::unchecked_signed_overflowing_add); + signed_unchecked_overflowing_add_test(param, executor); +} + +fn integer_signed_overflowing_add<P>

(param: P) +where + P: Into<PBSParameters>, +{ + let executor = GpuFunctionExecutor::new(&CudaServerKey::signed_overflowing_add); + signed_unchecked_overflowing_add_test(param, executor); +}
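
Note on `f_last_block_inner_propagation_lut` (integer.h): the LUT looks only at the bits of the last block that sit below the sign bit and classifies the pair of last blocks as GENERATED, PROPAGATED or NONE. The sketch below restates that classification in plain Rust for 2-bit message blocks (message_modulus = 4); the names and the `main` driver are illustrative only and not part of the backend code.

```rust
// Standalone illustration of the last-block inner propagation classification,
// assuming 2-bit message blocks (message_modulus = 4).
#[derive(Debug, PartialEq)]
enum OutputCarry {
    None,
    Generated,
    Propagated,
}

fn last_block_inner_propagation(lhs_block: u64, rhs_block: u64, subtraction: bool) -> OutputCarry {
    let bits_of_message = 2u32;
    let message_bit_mask: u64 = (1 << bits_of_message) - 1;

    // For subtraction the rhs block is bit-flipped; the missing "+1" of the
    // two's complement is supplied later by the carry-propagation step.
    let rhs = if subtraction { !rhs_block } else { rhs_block };

    // Drop the last (sign) bit of each block and add what remains.
    let local_lhs = (lhs_block << 1) & message_bit_mask;
    let local_rhs = (rhs << 1) & message_bit_mask;
    let whole = local_lhs + local_rhs;

    let carry = whole >> bits_of_message;
    let result = (whole & message_bit_mask) >> 1;

    if carry == 1 {
        // The bits below the sign bit already generate a carry into it.
        OutputCarry::Generated
    } else if result == ((4 - 1) >> 1) {
        // The bits below the sign bit are all ones: an incoming carry would
        // ripple through them into the sign bit.
        OutputCarry::Propagated
    } else {
        OutputCarry::None
    }
}

fn main() {
    // Lower bits 0 + 1 give the all-ones pattern: an input carry would propagate.
    assert_eq!(last_block_inner_propagation(0b10, 0b01, false), OutputCarry::Propagated);
    // Lower bits 1 + 1 generate a carry into the sign bit.
    assert_eq!(last_block_inner_propagation(0b01, 0b01, false), OutputCarry::Generated);
    // Lower bits 0 + 0: nothing generated, nothing to propagate.
    assert_eq!(last_block_inner_propagation(0b10, 0b00, false), OutputCarry::None);
}
```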
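
Note on `f_resolve_overflow_lut`: the packed input carries the inner propagation state in the carry space, the block's output carry in bit 1 and the block's input carry in bit 0, and overflow is flagged exactly when the carry that reaches the sign bit differs from the carry leaving the block. A plain-Rust restatement of that rule, with illustrative names, not the actual LUT code:

```rust
// Clear-text form of the overflow criterion used by the resolve-overflow LUT.
enum CarryPropagation {
    None,
    Generated,
    Propagated,
}

fn signed_overflow(
    propagation: CarryPropagation, // inner propagation state of the last block
    input_carry_of_block: u64,     // carry entering the last block (0 or 1)
    output_carry_of_block: u64,    // carry leaving the last block (0 or 1)
) -> bool {
    // Carry that actually reaches the sign bit of the last block.
    let input_carry_to_last_bit = match propagation {
        CarryPropagation::Generated => 1,
        CarryPropagation::Propagated => input_carry_of_block,
        CarryPropagation::None => 0,
    };
    input_carry_to_last_bit != output_carry_of_block
}

fn main() {
    // 7 + 1 on a single 4-bit signed block: the lower bits generate a carry into
    // the sign bit, no carry leaves the block, so the addition overflows.
    assert!(signed_overflow(CarryPropagation::Generated, 0, 0));
    // 7 + (-1): a carry also leaves the block, so there is no overflow.
    assert!(!signed_overflow(CarryPropagation::Generated, 0, 1));
    // All lower bits at one and an incoming carry: it reaches the sign bit.
    assert!(signed_overflow(CarryPropagation::Propagated, 1, 0));
    // Nothing generated, nothing propagated, nothing leaves: no overflow.
    assert!(!signed_overflow(CarryPropagation::None, 0, 0));
}
```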
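
Note on `host_integer_signed_overflowing_add_or_sub_kb`: phase 1 adds (or subtracts) blockwise, phase 2 runs carry propagation and the last-block inner propagation on two sub-streams, and phase 3 compares the carry entering the sign bit with the carry leaving the number. The same criterion applied to clear `i8` values reproduces `overflowing_add`; a small sketch assuming nothing beyond the standard library:

```rust
// Clear-text analogue of the three phases, on i8 instead of radix ciphertexts:
// overflow holds exactly when the carry into the sign bit differs from the
// carry out of the number.
fn overflowing_add_via_carries(lhs: i8, rhs: i8) -> (i8, bool) {
    let a = lhs as u8 as u16;
    let b = rhs as u8 as u16;
    let sum = a + b;
    // Carry entering bit 7 (the sign bit)...
    let carry_into_sign = ((a & 0x7f) + (b & 0x7f)) >> 7;
    // ...and carry leaving bit 7.
    let carry_out = sum >> 8;
    ((sum as u8) as i8, carry_into_sign != carry_out)
}

fn main() {
    for (x, y) in [(120i8, 8i8), (-100, -50), (3, 4), (-128, -1)] {
        assert_eq!(overflowing_add_via_carries(x, y), x.overflowing_add(y));
    }
}
```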
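
Note on the FFI convention: the Rust wrappers pass `signed_operation` as `1` for addition and `-1` for subtraction, and the C entry points map any value other than `1` to `SIGNED_OPERATION::SUBTRACTION`. A minimal standalone sketch of that round trip; the enum here mirrors the crate's `SignedOperation` but is redefined locally for illustration:

```rust
// Hypothetical helpers mirroring the +1 / -1 convention used across the C FFI
// boundary (SIGNED_OPERATION::ADDITION = 1, SUBTRACTION = -1).
#[derive(Clone, Copy, PartialEq, Debug)]
enum SignedOperation {
    Addition,
    Subtraction,
}

fn to_ffi_code(op: SignedOperation) -> i8 {
    if matches!(op, SignedOperation::Subtraction) { -1 } else { 1 }
}

fn from_ffi_code(code: i8) -> SignedOperation {
    // The C side only checks for `== 1`; everything else is subtraction.
    if code == 1 { SignedOperation::Addition } else { SignedOperation::Subtraction }
}

fn main() {
    assert_eq!(from_ffi_code(to_ffi_code(SignedOperation::Addition)), SignedOperation::Addition);
    assert_eq!(from_ffi_code(to_ffi_code(SignedOperation::Subtraction)), SignedOperation::Subtraction);
}
```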