
Commit

fix(gpu): small fixes
agnesLeroy committed Jul 22, 2024
1 parent 418a093 commit 46c9f06
Showing 8 changed files with 24 additions and 21 deletions.
18 changes: 11 additions & 7 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -1736,7 +1736,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
- active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+ active_gpu_count = get_active_gpu_count(1, gpu_count);
// In the arithmetic shift, a PBS has to be applied to the last rotated
// block twice: once to shift it, once to compute the padding block to be
// copied onto all blocks to the left of the last rotated block
@@ -3074,12 +3074,12 @@ template <typename Torus> struct int_resolve_signed_overflow_memory {

template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
int_radix_params params;
+ uint32_t active_gpu_count;

// memory objects for other operations
int_sc_prop_memory<Torus> *scp_mem;
int_last_block_inner_propagate_memory<Torus> *las_block_prop_mem;
int_resolve_signed_overflow_memory<Torus> *resolve_overflow_mem;
- // lookupt tables

// sub streams
cudaStream_t *sub_streams_1;
@@ -3118,13 +3118,17 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op,
bool allocate_gpu_memory) {
this->params = params;
+ active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);

- allocate_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks);
+ allocate_temporary_buffers(streams, gpu_indexes, active_gpu_count,
+     num_blocks);

// initialize streams
- sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
- sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
- for (uint j = 0; j < gpu_count; j++) {
+ sub_streams_1 =
+     (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+ sub_streams_2 =
+     (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+ for (uint j = 0; j < active_gpu_count; j++) {
sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
}
@@ -3160,7 +3164,7 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
cuda_drop_async(last_block_inner_propagation, streams[0], gpu_indexes[0]);

// sub streams
- for (uint i = 0; i < gpu_count; i++) {
+ for (uint i = 0; i < active_gpu_count; i++) {
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
}
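
For context, the hunks above all follow one pattern: size the per-operation sub-stream arrays by the number of GPUs that will actually be used (the new active_gpu_count member, filled from get_active_gpu_count) instead of by the total gpu_count, and loop over that same bound when creating, synchronizing and destroying the streams. Below is a minimal, self-contained sketch of that lifecycle, not the backend's actual struct: plain CUDA runtime calls stand in for cuda_create_stream/cuda_destroy_stream, and active_gpu_count is taken as a parameter assumed to come from get_active_gpu_count.

#include <cstdint>
#include <cstdlib>
#include <cuda_runtime.h>

// Simplified stand-in for the per-operation memory structs above: one pair
// of sub-streams per *active* GPU, never per total GPU.
struct sub_stream_set {
  uint32_t active_gpu_count;
  cudaStream_t *sub_streams_1;
  cudaStream_t *sub_streams_2;

  sub_stream_set(const uint32_t *gpu_indexes, uint32_t active_gpu_count)
      : active_gpu_count(active_gpu_count) {
    sub_streams_1 =
        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
    sub_streams_2 =
        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
    for (uint32_t j = 0; j < active_gpu_count; j++) {
      cudaSetDevice((int)gpu_indexes[j]); // each stream lives on its own GPU
      cudaStreamCreate(&sub_streams_1[j]);
      cudaStreamCreate(&sub_streams_2[j]);
    }
  }

  // Tear-down mirrors the loops the commit rewrites from gpu_count to
  // active_gpu_count: synchronize and destroy exactly the streams created.
  void release(const uint32_t *gpu_indexes) {
    for (uint32_t j = 0; j < active_gpu_count; j++) {
      cudaSetDevice((int)gpu_indexes[j]);
      cudaStreamSynchronize(sub_streams_1[j]);
      cudaStreamSynchronize(sub_streams_2[j]);
      cudaStreamDestroy(sub_streams_1[j]);
      cudaStreamDestroy(sub_streams_2[j]);
    }
    free(sub_streams_1);
    free(sub_streams_2);
  }
};

Creating and destroying over the same bound is what keeps the synchronization loops in the files below (addition, cmux, div_rem, comparisons, scalar shift) valid once they iterate over active_gpu_count rather than gpu_count.
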
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
@@ -131,7 +131,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
}
}

- for (uint j = 0; j < gpu_count; j++) {
+ for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
4 changes: 3 additions & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -76,8 +76,10 @@ __host__ void host_integer_radix_cmux_kb(
mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks);
}
}
- for (uint j = 0; j < gpu_count; j++) {
+ for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
+ }
+ for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) {
cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
}

1 change: 0 additions & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -245,7 +245,6 @@ __host__ void host_compare_with_zero_equality(
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {

- cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
8 changes: 4 additions & 4 deletions backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -400,7 +400,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
gpu_count);
}
}
- for (uint j = 0; j < gpu_count; j++) {
+ for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
@@ -510,7 +510,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
gpu_indexes, gpu_count);
}
}
- for (uint j = 0; j < gpu_count; j++) {
+ for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
@@ -587,7 +587,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
}
}
- for (uint j = 0; j < gpu_count; j++) {
+ for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
@@ -628,7 +628,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2);
}
}
- for (uint j = 0; j < gpu_count; j++) {
+ for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
@@ -133,7 +133,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
}
}
- for (uint j = 0; j < gpu_count; j++) {
+ for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}
@@ -205,7 +205,6 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

- cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -397,7 +396,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
signed_msb_lut->params.message_modulus);
}
}
- for (uint j = 0; j < gpu_count; j++) {
+ for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}
@@ -465,7 +464,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
}
}
- for (uint j = 0; j < gpu_count; j++) {
+ for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}
@@ -737,7 +736,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
}
}

- for (uint j = 0; j < gpu_count; j++) {
+ for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}
@@ -245,7 +245,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
}
}
}
- for (uint j = 0; j < gpu_count; j++) {
+ for (uint j = 0; j < mem->active_gpu_count; j++) {
cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
}
@@ -27,7 +27,6 @@ int cuda_setup_multi_gpu() {
num_used_gpus += 1;
}
} else {
- int has_peer_access_to_device_0;
for (int i = 1; i < num_gpus; i++)
num_used_gpus += 1;
}
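
get_active_gpu_count itself is not part of this diff. A purely illustrative stand-in, consistent with how it is called in the hunks above (at most one GPU per independent work item, capped by the available gpu_count), might look like the sketch below; this is an assumption, not the backend's implementation. It also illustrates why get_active_gpu_count(1, gpu_count) in the arithmetic scalar shift buffer only needs a single GPU's worth of streams.

#include <cstdint>

// Hypothetical sketch: use at most one GPU per independent work item
// (e.g. per radix block), and never more than the GPUs available.
// With num_items == 1 this returns 1, matching the arithmetic scalar
// shift buffer change in integer.h above.
static uint32_t get_active_gpu_count_sketch(uint32_t num_items,
                                            uint32_t gpu_count) {
  return num_items < gpu_count ? num_items : gpu_count;
}
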
