From 3dcf7f24925bfc50d537378cd81a374df290ae28 Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Wed, 4 Dec 2024 12:21:54 +0100 Subject: [PATCH] chore(gpu): reduce throughput integer bench time --- .../benchmark_gpu_integer_common.yml | 3 +- tfhe/benches/integer/bench.rs | 33 +++++++++++++++++-- tfhe/benches/utilities.rs | 4 +-- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/.github/workflows/benchmark_gpu_integer_common.yml b/.github/workflows/benchmark_gpu_integer_common.yml index 063640c270..4e0bb0a297 100644 --- a/.github/workflows/benchmark_gpu_integer_common.yml +++ b/.github/workflows/benchmark_gpu_integer_common.yml @@ -236,7 +236,8 @@ jobs: --commit-date "${{ env.COMMIT_DATE }}" \ --bench-date "${{ env.BENCH_DATE }}" \ --walk-subdirs \ - --name-suffix avx512 + --name-suffix avx512 \ + --bench-type ${{ matrix.bench_type }} - name: Upload parsed results artifact uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 diff --git a/tfhe/benches/integer/bench.rs b/tfhe/benches/integer/bench.rs index 67c75fcd79..d6ead085b7 100644 --- a/tfhe/benches/integer/bench.rs +++ b/tfhe/benches/integer/bench.rs @@ -144,6 +144,9 @@ fn bench_server_key_binary_function_clean_inputs( } BenchmarkType::Throughput => { bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); + bench_group + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); let elements = throughput_num_threads(num_block); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { @@ -292,6 +295,9 @@ fn bench_server_key_unary_function_clean_inputs( } BenchmarkType::Throughput => { bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); + bench_group + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); let elements = throughput_num_threads(num_block); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { @@ -446,6 +452,9 @@ fn bench_server_key_binary_scalar_function_clean_inputs( } BenchmarkType::Throughput => { bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); + bench_group + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); let elements = throughput_num_threads(num_block); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { @@ -559,6 +568,9 @@ fn if_then_else_parallelized(c: &mut Criterion) { } BenchmarkType::Throughput => { bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); + bench_group + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); let elements = throughput_num_threads(num_block); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { @@ -654,6 +666,9 @@ fn ciphertexts_sum_parallelized(c: &mut Criterion) { bench_id = format!( "{bench_name}_{len}_ctxts::throughput::{param_name}::{bit_size}_bits" ); + bench_group + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); let elements = throughput_num_threads(num_block); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { @@ -1344,6 +1359,9 @@ mod cuda { } BenchmarkType::Throughput => { bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); + bench_group + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); let elements = throughput_num_threads(num_block); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { @@ -1440,6 +1458,9 @@ mod cuda { } BenchmarkType::Throughput => { bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); + bench_group + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); let elements = throughput_num_threads(num_block); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { @@ -1496,9 +1517,6 @@ mod cuda { G: Fn(&mut ThreadRng, usize) -> ScalarType, { let mut bench_group = c.benchmark_group(bench_name); - bench_group - .sample_size(15) - .measurement_time(std::time::Duration::from_secs(30)); let mut rng = rand::thread_rng(); let streams = CudaStreams::new_multi_gpu(); @@ -1516,6 +1534,9 @@ mod cuda { match BENCH_TYPE.get().unwrap() { BenchmarkType::Latency => { + bench_group + .sample_size(15) + .measurement_time(std::time::Duration::from_secs(30)); bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits_scalar_{bit_size}"); // FIXME it makes no sense to duplicate `bit_size` bench_group.bench_function(&bench_id, |b| { @@ -1543,6 +1564,9 @@ mod cuda { }); } BenchmarkType::Throughput => { + bench_group + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); bench_id = format!( "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}" ); @@ -1644,6 +1668,9 @@ mod cuda { } BenchmarkType::Throughput => { bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); + bench_group + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); let elements = throughput_num_threads(num_block); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { diff --git a/tfhe/benches/utilities.rs b/tfhe/benches/utilities.rs index 24c723ab51..5348010941 100644 --- a/tfhe/benches/utilities.rs +++ b/tfhe/benches/utilities.rs @@ -399,9 +399,9 @@ pub mod integer_utils { #[cfg(feature = "gpu")] { // This value is for Nvidia H100 GPU - let streaming_multiprocessors = 144; + let streaming_multiprocessors = 132; let num_gpus = unsafe { cuda_get_number_of_gpus() }; - ((streaming_multiprocessors * 16 * num_gpus) as f64 * block_multiplicator) as u64 + ((streaming_multiprocessors * num_gpus) as f64 * block_multiplicator) as u64 } #[cfg(not(feature = "gpu"))] {