Skip to content

Commit

Permalink
feat(gpu): add abs operation on gpu backend
Browse files Browse the repository at this point in the history
  • Loading branch information
bbarbakadze authored and agnesLeroy committed Nov 15, 2024
1 parent f9e8df4 commit 0aee4c5
Show file tree
Hide file tree
Showing 11 changed files with 470 additions and 0 deletions.
19 changes: 19 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -417,5 +417,24 @@ void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
uint32_t num_blocks,
uint32_t lwe_size);

// Prepares the scratch buffer used by the in-place absolute value operation
// on a radix ciphertext with 64-bit words.
//
// The cryptographic parameters (dimensions, keyswitch/PBS decomposition
// settings, moduli, PBS type) are packed into an `int_radix_params`. When
// `is_signed` is true an `int_abs_buffer<uint64_t>` sized for `num_blocks`
// blocks is created and its address is stored, type-erased, in `*mem_ptr`.
// `allocate_gpu_memory` is forwarded to the buffer constructor and controls
// whether device memory is actually allocated.
//
// The buffer is meant to be released with cleanup_cuda_integer_abs_inplace.
void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory);

// Replaces the radix ciphertext `ct` (interpreted as `uint64_t` data made of
// `num_blocks` blocks) by its absolute value, in place.
//
// `mem_ptr` is the type-erased `int_abs_buffer<uint64_t>` produced by
// scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64; `bsks` and `ksks`
// are the bootstrapping and keyswitching keys. When `is_signed` is false the
// call returns without touching `ct`.
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
void *const *ksks, uint32_t num_blocks);

// Releases the resources held by the `int_abs_buffer<uint64_t>` whose
// type-erased address is stored in `*mem_ptr_void` (as set up by
// scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64).
void cleanup_cuda_integer_abs_inplace(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);

} // extern C
#endif // CUDA_INTEGER_H
Original file line number Diff line number Diff line change
Expand Up @@ -3012,4 +3012,52 @@ template <typename Torus> struct int_scalar_mul_buffer {
}
};

// Scratch memory for the in-place absolute value of a signed radix
// ciphertext.
//
// It bundles the sub-buffers of the three operations the algorithm chains
// (arithmetic right shift, single-carry propagation, bitwise XOR) together
// with `mask`, a device array large enough to hold a copy of the ciphertext
// (`num_radix_blocks` LWE ciphertexts of `big_lwe_dimension + 1` words each).
//
// Every pointer member starts as nullptr and is only populated when the
// constructor is asked to allocate GPU memory, so `release` can always tell
// what it owns.
template <typename Torus> struct int_abs_buffer {
  int_radix_params params;

  // Sub-buffers owned by this object (nullptr when nothing was allocated).
  int_arithmetic_scalar_shift_buffer<Torus> *arithmetic_scalar_shift_mem =
      nullptr;
  int_sc_prop_memory<Torus> *scp_mem = nullptr;
  int_bitop_buffer<Torus> *bitxor_mem = nullptr;

  // Device array receiving the copy of the ciphertext that is turned into the
  // sign mask (nullptr when nothing was allocated).
  Torus *mask = nullptr;

  // Builds the buffer for `num_radix_blocks` blocks using `params`.
  // Nothing is allocated when `allocate_gpu_memory` is false; the pointer
  // members then stay null.
  int_abs_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                 uint32_t gpu_count, int_radix_params params,
                 uint32_t num_radix_blocks, bool allocate_gpu_memory) {
    this->params = params;

    if (!allocate_gpu_memory)
      return;

    arithmetic_scalar_shift_mem = new int_arithmetic_scalar_shift_buffer<Torus>(
        streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::RIGHT_SHIFT,
        params, num_radix_blocks, allocate_gpu_memory);
    scp_mem =
        new int_sc_prop_memory<Torus>(streams, gpu_indexes, gpu_count, params,
                                      num_radix_blocks, allocate_gpu_memory);
    bitxor_mem = new int_bitop_buffer<Torus>(
        streams, gpu_indexes, gpu_count, BITOP_TYPE::BITXOR, params,
        num_radix_blocks, allocate_gpu_memory);

    // Compute the byte count in size_t so that large block counts cannot wrap
    // around in 32-bit arithmetic.
    size_t lwe_size = static_cast<size_t>(params.big_lwe_dimension) + 1;
    size_t mask_size_bytes =
        static_cast<size_t>(num_radix_blocks) * lwe_size * sizeof(Torus);

    mask = (Torus *)cuda_malloc_async(mask_size_bytes, streams[0],
                                      gpu_indexes[0]);
  }

  // Releases everything this buffer owns. Members that were never allocated
  // are skipped, and every pointer is reset afterwards so that calling
  // `release` a second time is harmless.
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    if (arithmetic_scalar_shift_mem != nullptr) {
      arithmetic_scalar_shift_mem->release(streams, gpu_indexes, gpu_count);
      delete arithmetic_scalar_shift_mem;
      arithmetic_scalar_shift_mem = nullptr;
    }
    if (scp_mem != nullptr) {
      scp_mem->release(streams, gpu_indexes, gpu_count);
      delete scp_mem;
      scp_mem = nullptr;
    }
    if (bitxor_mem != nullptr) {
      bitxor_mem->release(streams, gpu_indexes, gpu_count);
      delete bitxor_mem;
      bitxor_mem = nullptr;
    }
    if (mask != nullptr) {
      cuda_drop_async(mask, streams[0], gpu_indexes[0]);
      mask = nullptr;
    }
  }
};

#endif // CUDA_INTEGER_UTILITIES_H
43 changes: 43 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#include "integer/abs.cuh"

// C entry point that sets up the scratch buffer of the in-place absolute
// value for 64-bit radix ciphertexts: it gathers the cryptographic parameters
// into an `int_radix_params` and delegates to the templated scratch routine,
// which writes the resulting `int_abs_buffer<uint64_t>` through `mem_ptr`.
void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  // Recover the concrete types hidden behind the type-erased C interface.
  auto typed_streams = (cudaStream_t *)(streams);
  auto typed_mem_ptr = (int_abs_buffer<uint64_t> **)mem_ptr;

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus);

  scratch_cuda_integer_abs_kb<uint64_t>(typed_streams, gpu_indexes, gpu_count,
                                        typed_mem_ptr, is_signed, num_blocks,
                                        radix_params, allocate_gpu_memory);
}

// C entry point of the in-place absolute value for 64-bit radix ciphertexts.
// It restores the concrete types of the type-erased arguments and forwards
// them to the templated host implementation.
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
    void *const *ksks, uint32_t num_blocks) {

  auto typed_streams = (cudaStream_t *)(streams);
  auto radix_ct = static_cast<uint64_t *>(ct);
  auto typed_ksks = (uint64_t **)(ksks);
  auto abs_buffer = (int_abs_buffer<uint64_t> *)mem_ptr;

  host_integer_abs_kb<uint64_t>(typed_streams, gpu_indexes, gpu_count,
                                radix_ct, bsks, typed_ksks, abs_buffer,
                                is_signed, num_blocks);
}

// Destroys the scratch buffer created by
// scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64.
//
// `*mem_ptr_void` is the type-erased `int_abs_buffer<uint64_t>`. Its resources
// are released, then the buffer object itself (allocated with `new` by the
// scratch routine) is deleted so it does not leak, and the caller's handle is
// reset to null so it cannot be reused or freed twice.
//
// A null handle is accepted and ignored: the scratch routine allocates
// nothing for unsigned inputs.
void cleanup_cuda_integer_abs_inplace(void *const *streams,
                                      uint32_t const *gpu_indexes,
                                      uint32_t gpu_count,
                                      int8_t **mem_ptr_void) {
  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
    return;

  int_abs_buffer<uint64_t> *mem_ptr =
      (int_abs_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);

  delete mem_ptr;
  *mem_ptr_void = nullptr;
}
69 changes: 69 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#ifndef TFHE_RS_ABS_CUH
#define TFHE_RS_ABS_CUH

#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer/bitwise_ops.cuh"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "pbs/programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Creates the scratch buffer of the absolute value operation and stores it in
// `*mem_ptr`.
//
// The absolute value of an unsigned integer is the integer itself, so no
// buffer is needed when `is_signed` is false. In that case `*mem_ptr` is set
// to nullptr rather than left untouched, so the caller always gets a
// well-defined handle back.
template <typename Torus>
__host__ void scratch_cuda_integer_abs_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
    uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {

  if (is_signed)
    *mem_ptr =
        new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
                                  num_blocks, allocate_gpu_memory);
  else
    *mem_ptr = nullptr;
}

// Computes the absolute value of the signed radix ciphertext `ct` in place.
//
// The computation follows the branch-free identity
//     abs(x) = (x + m) ^ m,   with m = x >> (n - 1)   (arithmetic shift),
// where n is the number of message bits carried by the ciphertext:
//   1. copy `ct` into the scratch array `mask`;
//   2. arithmetic-right-shift `mask` by n - 1 bits;
//   3. add `mask` to `ct` block-wise and propagate the carries;
//   4. XOR `ct` with `mask`.
//
// Nothing is done when `is_signed` is false or when `num_blocks` is zero.
// `mem_ptr` must be the buffer created by scratch_cuda_integer_abs_kb for at
// least `num_blocks` blocks. `message_modulus` must be non-zero because
// `__builtin_clz(0)` is undefined.
template <typename Torus>
__host__ void
host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                    uint32_t gpu_count, Torus *ct, void *const *bsks,
                    uint64_t *const *ksks, int_abs_buffer<uint64_t> *mem_ptr,
                    bool is_signed, uint32_t num_blocks) {
  if (!is_signed)
    return;
  // An empty ciphertext has nothing to transform; returning here also keeps
  // `num_bits_in_ciphertext - 1` below from wrapping around to UINT32_MAX.
  if (num_blocks == 0)
    return;

  auto radix_params = mem_ptr->params;
  auto mask = mem_ptr->mask;

  auto big_lwe_dimension = radix_params.big_lwe_dimension;
  auto big_lwe_size = big_lwe_dimension + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
  // 31 - clz(v) is floor(log2(v)) for a non-zero 32-bit v, i.e. the number of
  // message bits per block; multiplied by the block count it gives n.
  uint32_t num_bits_in_ciphertext =
      (31 - __builtin_clz(radix_params.message_modulus)) * num_blocks;

  // Step 1: mask <- ct
  cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes,
                               streams[0], gpu_indexes[0]);

  // Step 2: mask <- mask >> (n - 1), arithmetic shift
  host_integer_radix_arithmetic_scalar_shift_kb_inplace(
      streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);

  // Step 3: ct <- ct + mask, followed by carry propagation
  host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
                       radix_params.big_lwe_dimension, num_blocks);

  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count, ct,
                                     nullptr, nullptr, mem_ptr->scp_mem, bsks,
                                     ksks, num_blocks);

  // Step 4: ct <- ct ^ mask
  host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct,
                              mem_ptr->bitxor_mem, bsks, ksks, num_blocks);
}

#endif // TFHE_RS_ABS_CUH
44 changes: 44 additions & 0 deletions backends/tfhe-cuda-backend/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1036,6 +1036,50 @@ extern "C" {
lwe_size: u32,
);
}
extern "C" {
/// FFI declaration of the C function of the same name from the CUDA backend.
///
/// Prepares the scratch buffer of the in-place absolute value operation and
/// writes its type-erased address through `mem_ptr`. On the C++ side the
/// buffer is only created when `is_signed` is true.
pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
is_signed: bool,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
);
}
extern "C" {
/// FFI declaration of the C function of the same name from the CUDA backend.
///
/// Replaces the radix ciphertext behind `ct` (`num_blocks` blocks) by its
/// absolute value, using the scratch buffer `mem_ptr` obtained from
/// `scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64`. The C++
/// implementation returns immediately when `is_signed` is false.
pub fn cuda_integer_abs_inplace_radix_ciphertext_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
ct: *mut ffi::c_void,
mem_ptr: *mut i8,
is_signed: bool,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
num_blocks: u32,
);
}
extern "C" {
/// FFI declaration of the C function of the same name from the CUDA backend.
///
/// Releases the resources of the scratch buffer whose address is stored in
/// `*mem_ptr_void`.
pub fn cleanup_cuda_integer_abs_inplace(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
extern "C" {
pub fn cuda_keyswitch_lwe_ciphertext_vector_32(
stream: *mut ffi::c_void,
Expand Down
11 changes: 11 additions & 0 deletions tfhe/benches/integer/signed_bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1602,6 +1602,10 @@ mod cuda {
display_name: neg
);

// Benchmark of the signed unary `unchecked_abs` server-key method on CUDA,
// reported under the name `abs`.
// NOTE(review): presumably this expands to the `cuda_unchecked_abs` function
// listed in the benchmark group further down; confirm against the macro.
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
method_name: unchecked_abs,
display_name: abs
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_mul,
display_name: mul
Expand Down Expand Up @@ -1842,6 +1846,11 @@ mod cuda {
display_name: neg
);

// Benchmark of the signed unary `abs` server-key method on CUDA, reported
// under the name `abs`.
// NOTE(review): presumably this expands to the `cuda_abs` function listed in
// the benchmark group further down; confirm against the macro.
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
method_name: abs,
display_name: abs
);

define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: mul,
display_name: mul
Expand Down Expand Up @@ -2056,6 +2065,7 @@ mod cuda {
cuda_unchecked_add,
cuda_unchecked_sub,
cuda_unchecked_neg,
cuda_unchecked_abs,
cuda_unchecked_mul,
cuda_unchecked_bitand,
cuda_unchecked_bitnot,
Expand Down Expand Up @@ -2105,6 +2115,7 @@ mod cuda {
cuda_add,
cuda_sub,
cuda_neg,
cuda_abs,
cuda_mul,
cuda_bitand,
cuda_bitnot,
Expand Down
65 changes: 65 additions & 0 deletions tfhe/src/integer/gpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2770,3 +2770,68 @@ pub unsafe fn reverse_blocks_inplace_async<T: UnsignedInteger>(
);
}
}

#[allow(clippy::too_many_arguments)]
/// Replaces the signed radix ciphertext `ct` by its absolute value, on the GPU.
///
/// The function allocates the scratch buffer of the operation, runs the
/// in-place absolute value on `num_blocks` blocks with the given bootstrapping
/// and keyswitching keys, then asks the backend to clean the scratch buffer up.
///
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
///   is required
pub unsafe fn unchecked_signed_abs_radix_kb_assign_async<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    ct: &mut CudaVec<T>,
    bootstrapping_key: &CudaVec<B>,
    keyswitch_key: &CudaVec<T>,
    message_modulus: MessageModulus,
    carry_modulus: CarryModulus,
    glwe_dimension: GlweDimension,
    polynomial_size: PolynomialSize,
    big_lwe_dimension: LweDimension,
    small_lwe_dimension: LweDimension,
    ks_level: DecompositionLevelCount,
    ks_base_log: DecompositionBaseLog,
    pbs_level: DecompositionLevelCount,
    pbs_base_log: DecompositionBaseLog,
    num_blocks: u32,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
) {
    // This entry point only handles signed inputs and always needs the
    // scratch memory to live on the GPU.
    let is_signed = true;
    let allocate_gpu_memory = true;

    // Stream description shared by the three backend calls.
    let stream_ptrs = streams.ptr.as_ptr();
    let gpu_index_ptr = streams.gpu_indexes.as_ptr();
    let gpu_count = streams.len() as u32;

    // Opaque handle to the scratch buffer owned by the backend.
    let mut mem_ptr: *mut i8 = std::ptr::null_mut();

    scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
        stream_ptrs,
        gpu_index_ptr,
        gpu_count,
        std::ptr::addr_of_mut!(mem_ptr),
        is_signed,
        glwe_dimension.0 as u32,
        polynomial_size.0 as u32,
        big_lwe_dimension.0 as u32,
        small_lwe_dimension.0 as u32,
        ks_level.0 as u32,
        ks_base_log.0 as u32,
        pbs_level.0 as u32,
        pbs_base_log.0 as u32,
        grouping_factor.0 as u32,
        num_blocks,
        message_modulus.0 as u32,
        carry_modulus.0 as u32,
        pbs_type as u32,
        allocate_gpu_memory,
    );

    cuda_integer_abs_inplace_radix_ciphertext_kb_64(
        stream_ptrs,
        gpu_index_ptr,
        gpu_count,
        ct.as_mut_c_ptr(0),
        mem_ptr,
        is_signed,
        bootstrapping_key.ptr.as_ptr(),
        keyswitch_key.ptr.as_ptr(),
        num_blocks,
    );

    cleanup_cuda_integer_abs_inplace(
        stream_ptrs,
        gpu_index_ptr,
        gpu_count,
        std::ptr::addr_of_mut!(mem_ptr),
    );
}
Loading

0 comments on commit 0aee4c5

Please sign in to comment.