Skip to content

Commit

Permalink
chore(gpu): reduce throughput integer bench time
Browse files (browse the repository at this point in the history)
  • Loading branch information
agnesLeroy committed Dec 5, 2024
1 parent f24fa62 commit 3dcf7f2
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 6 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/benchmark_gpu_integer_common.yml
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,8 @@ jobs:
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
- --name-suffix avx512
+ --name-suffix avx512 \
+ --bench-type ${{ matrix.bench_type }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
Expand Down
33 changes: 30 additions & 3 deletions tfhe/benches/integer/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ fn bench_server_key_binary_function_clean_inputs<F>(
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -292,6 +295,9 @@ fn bench_server_key_unary_function_clean_inputs<F>(
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -446,6 +452,9 @@ fn bench_server_key_binary_scalar_function_clean_inputs<F, G>(
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -559,6 +568,9 @@ fn if_then_else_parallelized(c: &mut Criterion) {
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -654,6 +666,9 @@ fn ciphertexts_sum_parallelized(c: &mut Criterion) {
bench_id = format!(
"{bench_name}_{len}_ctxts::throughput::{param_name}::{bit_size}_bits"
);
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -1344,6 +1359,9 @@ mod cuda {
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -1440,6 +1458,9 @@ mod cuda {
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -1496,9 +1517,6 @@ mod cuda {
G: Fn(&mut ThreadRng, usize) -> ScalarType,
{
let mut bench_group = c.benchmark_group(bench_name);
- bench_group
-     .sample_size(15)
-     .measurement_time(std::time::Duration::from_secs(30));
let mut rng = rand::thread_rng();

let streams = CudaStreams::new_multi_gpu();
Expand All @@ -1516,6 +1534,9 @@ mod cuda {

match BENCH_TYPE.get().unwrap() {
BenchmarkType::Latency => {
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
bench_id =
format!("{bench_name}::{param_name}::{bit_size}_bits_scalar_{bit_size}"); // FIXME it makes no sense to duplicate `bit_size`
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -1543,6 +1564,9 @@ mod cuda {
});
}
BenchmarkType::Throughput => {
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
bench_id = format!(
"{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}"
);
Expand Down Expand Up @@ -1644,6 +1668,9 @@ mod cuda {
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down
4 changes: 2 additions & 2 deletions tfhe/benches/utilities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -399,9 +399,9 @@ pub mod integer_utils {
#[cfg(feature = "gpu")]
{
// This value is for Nvidia H100 GPU
- let streaming_multiprocessors = 144;
+ let streaming_multiprocessors = 132;
  let num_gpus = unsafe { cuda_get_number_of_gpus() };
- ((streaming_multiprocessors * 16 * num_gpus) as f64 * block_multiplicator) as u64
+ ((streaming_multiprocessors * num_gpus) as f64 * block_multiplicator) as u64
}
#[cfg(not(feature = "gpu"))]
{
Expand Down

0 comments on commit 3dcf7f2

Please sign in to comment.