From 2cfe61ab730ea57c827edd11fc0a8b85b5f3bc96 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 10:08:55 -0600 Subject: [PATCH 01/46] reformated name for benchmark_block_adjacent_difference --- CMakeLists.txt | 5 +++-- .../benchmark_block_adjacent_difference.cpp | 19 +++++++++++++------ benchmark/benchmark_block_shuffle.cpp | 1 + 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 516b1598..89514e1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ cmake_minimum_required(VERSION 3.16 FATAL_ERROR) cmake_policy(VERSION 3.16...3.25) # Install prefix -set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories") +set(CMAKE_INSTALL_PREFIX "/opt/rocm-6.0.1" CACHE PATH "Install path prefix, prepended onto install directories") # hipCUB project project(hipcub LANGUAGES CXX) @@ -33,9 +33,10 @@ project(hipcub LANGUAGES CXX) if(WIN32) set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation") else() - set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") + set(ROCM_ROOT "/opt/rocm-6.0.1" CACHE PATH "Root directory of the ROCm installation") endif() + # Build options option(BUILD_TEST "Build tests (requires googletest)" OFF) option(DEPENDENCIES_FORCE_DOWNLOAD "Download dependencies and do not search for packages" OFF) diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp index 63af7c7d..7cba54b7 100644 --- a/benchmark/benchmark_block_adjacent_difference.cpp +++ b/benchmark/benchmark_block_adjacent_difference.cpp @@ -338,12 +338,19 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ -benchmark::RegisterBenchmark( \ - (std::string("block_adjacent_difference<" #T ", " #BS ">.") + name + ("<" #IPT ", " #WITH_TILE ">")).c_str(), \ - &run_benchmark, \ - stream, size \ -) +# define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ + benchmark::RegisterBenchmark( \ + std::string(std::string("block_adjacent_difference.SubAlgorithm Name:") + name \ + + std::string("") \ + ).c_str(), \ + &run_benchmark, \ + stream, \ + size \ + ) + #define BENCHMARK_TYPE(type, block, with_tile) \ CREATE_BENCHMARK(type, block, 1, with_tile), \ diff --git a/benchmark/benchmark_block_shuffle.cpp b/benchmark/benchmark_block_shuffle.cpp index 2f0d8cb5..233e53ad 100644 --- a/benchmark/benchmark_block_shuffle.cpp +++ b/benchmark/benchmark_block_shuffle.cpp @@ -303,6 +303,7 @@ int main(int argc, char* argv[]) hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; From c32e95922940e6abf466ee49b573b541b822bea0 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 10:29:35 -0600 Subject: [PATCH 02/46] added key:item fields to name in benchmark_block_radix_rank --- .../benchmark_block_adjacent_difference.cpp | 2 +- benchmark/benchmark_block_radix_rank.cpp | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp index 7cba54b7..2711e523 100644 --- a/benchmark/benchmark_block_adjacent_difference.cpp +++ b/benchmark/benchmark_block_adjacent_difference.cpp @@ -340,7 +340,7 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) # define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ benchmark::RegisterBenchmark( \ - std::string(std::string("block_adjacent_difference.SubAlgorithm Name:") + name \ + std::string(".") + name).c_str(), \ - &run_benchmark, \ - stream, \ - size) +#define CREATE_BENCHMARK(T, KIND, BS, IPT) \ + benchmark::RegisterBenchmark( \ + (std::string("block_radix_rank.") + name \ + ).c_str(), \ + &run_benchmark, \ + stream, \ + size \ + ) + // clang-format off #define CREATE_BENCHMARK_KINDS(type, block, ipt) \ From 6b14c6cc49893454421bb7d97b2335536ca11452 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 11:00:16 -0600 Subject: [PATCH 03/46] finished reformating benchmark_device_adjacent_difference --- .../benchmark_block_adjacent_difference.cpp | 10 +++++----- .../benchmark_device_adjacent_difference.cpp | 19 ++++++++++++------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp index 2711e523..16a51b01 100644 --- a/benchmark/benchmark_block_adjacent_difference.cpp +++ b/benchmark/benchmark_block_adjacent_difference.cpp @@ -340,11 +340,11 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) # define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ benchmark::RegisterBenchmark( \ - (std::string("block_adjacent_difference.SubAlgorithm Name:") + name \ - + std::string("") \ + (std::string("block_adjacent_difference.SubAlgorithm Name:") + name \ + + std::string("") \ ).c_str(), \ &run_benchmark, \ stream, \ diff --git a/benchmark/benchmark_device_adjacent_difference.cpp b/benchmark/benchmark_device_adjacent_difference.cpp index f42ceb76..29f7aa1d 100644 --- a/benchmark/benchmark_device_adjacent_difference.cpp +++ b/benchmark/benchmark_device_adjacent_difference.cpp @@ -180,13 +180,18 @@ void run_benchmark(benchmark::State& state, const std::size_t size, const hipStr using namespace std::string_literals; -#define CREATE_BENCHMARK(T, left, copy) \ - benchmark::RegisterBenchmark(("Subtract" + (left ? "Left"s : "Right"s) \ - + (copy ? "Copy"s : ""s) + "<" #T ">") \ - .c_str(), \ - &run_benchmark, \ - size, \ - stream) +#define CREATE_BENCHMARK(T, left, copy) \ + benchmark::RegisterBenchmark( \ + (std::string("device_adjacent_difference" \ + "." \ + "SubAlgorithm Name:Subtract") \ + + std::string(left ? "Left" : "Right") \ + + std::string(copy ? "Copy" : "") \ + ).c_str(), \ + &run_benchmark, \ + size, \ + stream \ + ) // clang-format off #define CREATE_BENCHMARKS(T) \ From fc4634917804934df4e1b826819488833c14e82e Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 11:16:24 -0600 Subject: [PATCH 04/46] reformated key:field for benchmark_device_memcpy --- benchmark/benchmark_device_batch_memcpy.cpp | 35 +++++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp index 3d72e349..abd17a40 100644 --- a/benchmark/benchmark_device_batch_memcpy.cpp +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -337,16 +337,31 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev) \ - benchmark::RegisterBenchmark( \ - "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ - ",size_type:" #size_type ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ - ",num_blev:" #num_blev ",cfg:default_config}", \ - [=](benchmark::State& state) \ - { \ - run_benchmark, \ - size_type>(state, stream, num_tlev, num_wlev, num_blev); \ - }) +// #define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev) \ + // benchmark::RegisterBenchmark( \ + // "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ + // ",size_type:" #size_type ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ + // ",num_blev:" #num_blev ",cfg:default_config}", \ + // [=](benchmark::State& state) \ + // { \ + // run_benchmark, \ + // size_type>(state, stream, num_tlev, num_wlev, num_blev); \ + // }) + +#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ + benchmark::RegisterBenchmark( \ + (std::string("device_batch_memcpy.") \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_benchmark, \ + T>(state, stream, num_tlev, num_wlev, num_blev); \ + } \ + ) #define BENCHMARK_TYPE(item_size, item_alignment) \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ From d3bcc9cec4c67df0072984d4fe1d64ea3e5f711f Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 11:38:18 -0600 Subject: [PATCH 05/46] reformated benchmark_device_memory to have key:item in name --- benchmark/benchmark_device_batch_memcpy.cpp | 13 +------ benchmark/benchmark_device_memory.cpp | 38 +++++++++++++-------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp index abd17a40..14a0e2e8 100644 --- a/benchmark/benchmark_device_batch_memcpy.cpp +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -337,17 +337,6 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_temp_storage)); } -// #define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev) \ - // benchmark::RegisterBenchmark( \ - // "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ - // ",size_type:" #size_type ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ - // ",num_blev:" #num_blev ",cfg:default_config}", \ - // [=](benchmark::State& state) \ - // { \ - // run_benchmark, \ - // size_type>(state, stream, num_tlev, num_wlev, num_blev); \ - // }) - #define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ benchmark::RegisterBenchmark( \ (std::string("device_batch_memcpy, \ - T>(state, stream, num_tlev, num_wlev, num_blev); \ + T>(state, stream, num_tlev, num_wlev, num_blev); \ } \ ) diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index 8659cedf..079cc19e 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -401,20 +401,30 @@ void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_ HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BLOCK_SIZE, IPT) \ - { \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - #METHOD "_" #OPERATION "<" #T "," #SIZE ",BS:" #BLOCK_SIZE ",IPT:" #IPT ">", \ - [=](benchmark::State& state) \ - { run_benchmark(state, SIZE, stream); })); \ - } - -#define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ - { \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - "Memcpy<" #T "," #SIZE ">", \ - [=](benchmark::State& state) { run_benchmark_memcpy(state, SIZE, stream); })); \ - } +#define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BS, IPT) \ + benchmarks.push_back( \ + benchmark::RegisterBenchmark( \ + (std::string("device_memory.") \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_benchmark(state, SIZE, stream); \ + } \ + ) \ + ); \ + +#define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ + benchmarks.push_back( \ + benchmark::RegisterBenchmark( \ + (std::string("device_memory_memcpy.") \ + ).c_str(), \ + [=](benchmark::State& state) { run_benchmark_memcpy(state, SIZE, stream); } \ + ) \ + ); \ // clang-format off #define CREATE_BENCHMARK_BLOCK_SIZE(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE) \ From 6721545930ba7b6728ee6c5e1708a12fe2ae8229 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 11:53:32 -0600 Subject: [PATCH 06/46] finished reformating for "name" to key:field in benchmark_device_merge_sort --- benchmark/benchmark_device_merge_sort.cpp | 38 +++++++++++------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp index fbfd35f7..6a0f69a2 100644 --- a/benchmark/benchmark_device_merge_sort.cpp +++ b/benchmark/benchmark_device_merge_sort.cpp @@ -230,25 +230,25 @@ void run_sort_pairs_benchmark(benchmark::State& state, } -#define CREATE_SORT_KEYS_BENCHMARK(Key) \ - { \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">").c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); } \ - ) \ - ); \ - } - -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ - { \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value">").c_str(), \ - [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size); } \ - ) \ - ); \ - } +#define CREATE_SORT_KEYS_BENCHMARK(T) \ + benchmarks.push_back( \ + benchmark::RegisterBenchmark( \ + (std::string("device_merge_sort.") \ + ).c_str(), \ + [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); } \ + ) \ + ); \ + +#define CREATE_SORT_PAIRS_BENCHMARK(T, V) \ + benchmarks.push_back( \ + benchmark::RegisterBenchmark( \ + (std::string("device_merge_sort.") \ + ).c_str(), \ + [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size); } \ + ) \ + ); \ void add_sort_keys_benchmarks(std::vector& benchmarks, From ce7a2004ca410356afe70b4333e975a534a28320 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 12:09:10 -0600 Subject: [PATCH 07/46] finished reformating for benchmark_device_spmv --- benchmark/benchmark_device_spmv.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/benchmark/benchmark_device_spmv.cpp b/benchmark/benchmark_device_spmv.cpp index e884f361..9954a72b 100644 --- a/benchmark/benchmark_device_spmv.cpp +++ b/benchmark/benchmark_device_spmv.cpp @@ -169,11 +169,13 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); } -#define CREATE_BENCHMARK(T, p) \ -benchmark::RegisterBenchmark( \ - ("CsrMV<" #T ">(p = " #p")"), \ - &run_benchmark, size, stream, p \ -) +#define CREATE_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark( \ + (std::string("device_spmv_CsrMV.") \ + ).c_str(), \ + &run_benchmark, size, stream, p \ + ) #define BENCHMARK_TYPE(type) \ CREATE_BENCHMARK(type, 1.0e-6f), \ From 8a8989b84ce80836b1c48d4c33d14a08f1c02997 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 12:17:52 -0600 Subject: [PATCH 08/46] Added missing fields for multi_histogram_range --- benchmark/benchmark_device_histogram.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/benchmark_device_histogram.cpp b/benchmark/benchmark_device_histogram.cpp index cfde99f9..00f441b1 100644 --- a/benchmark/benchmark_device_histogram.cpp +++ b/benchmark/benchmark_device_histogram.cpp @@ -604,8 +604,8 @@ void add_range_benchmarks(std::vector& benchmar #define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ benchmark::RegisterBenchmark( \ - (std::string("multi_histogram_range") + "<" #CHANNELS ", " #ACTIVE_CHANNELS ", " #T ">" + \ - "(" + std::to_string(BINS) + " bins)" \ + (std::string("multi_histogram_range") + "" + \ + "(Bin Count:" + std::to_string(BINS) + " bins)" \ ).c_str(), \ [=](benchmark::State& state) { \ run_multi_range_benchmark( \ From fc433d335b46f694304df4abb39846d35e7f3d22 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 12:24:34 -0600 Subject: [PATCH 09/46] added missing fields for partition_flaged and predicate --- benchmark/benchmark_device_partition.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/benchmark_device_partition.cpp b/benchmark/benchmark_device_partition.cpp index 26c7739a..1ead466d 100644 --- a/benchmark/benchmark_device_partition.cpp +++ b/benchmark/benchmark_device_partition.cpp @@ -354,13 +354,13 @@ void run_threeway(benchmark::State& state, #define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T) \ benchmark::RegisterBenchmark( \ - "parition_flagged<" #T ", " #T_FLAG ">(" #SPLIT_T "%)", \ + "parition_flagged(Split Threshold:" #SPLIT_T "%)", \ &run_flagged, stream, static_cast(SPLIT_T), size \ ) #define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T) \ benchmark::RegisterBenchmark( \ - "parition_predicate<" #T ">(" #SPLIT_T "%)", \ + "parition_predicate(Split Threshold:" #SPLIT_T "%)", \ &run_predicate, stream, static_cast(SPLIT_T), size \ ) From b368f2821f5204020fa48fdfdb1ea2e6d9c3148a Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 13:46:44 -0600 Subject: [PATCH 10/46] added missng fields for benchmark_device_segmented_radix_sort --- benchmark/benchmark_device_segmented_radix_sort.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index 65b8d116..518118fa 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -382,8 +382,8 @@ benchmark::RegisterBenchmark( \ #define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), descending" \ + (std::string("sort_keys") + "" + \ + "(Segments:~" + std::to_string(SEGMENTS) + " segments), descending" \ ).c_str(), \ [=](benchmark::State& state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, Descending); } \ ) @@ -427,8 +427,8 @@ benchmark::RegisterBenchmark( \ #define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), descending" \ + (std::string("sort_pairs") + "" + \ + "(Segments:~" + std::to_string(SEGMENTS) + " segments), descending" \ ).c_str(), \ [=](benchmark::State& state) { \ run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Descending); } \ From fc20a21a23c1d62d9ed2265e1f654932561de945 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 13:49:45 -0600 Subject: [PATCH 11/46] added missng fields for benchmark_device_radix_sort --- benchmark/benchmark_device_radix_sort.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp index 386281ab..292d407d 100644 --- a/benchmark/benchmark_device_radix_sort.cpp +++ b/benchmark/benchmark_device_radix_sort.cpp @@ -405,7 +405,7 @@ void run_sort_pairs_benchmark(benchmark::State& state, ); \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">, descending").c_str(), \ + (std::string("sort_keys") + ", descending").c_str(), \ [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size, keys_input); } \ ) \ ); \ @@ -422,7 +422,7 @@ void run_sort_pairs_benchmark(benchmark::State& state, ); \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value">, descending").c_str(), \ + (std::string("sort_pairs") + ", descending").c_str(), \ [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size, keys_input); } \ ) \ ); \ From a2dd97a0ea442a897926d83382903e12af8efdfe Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 13:53:30 -0600 Subject: [PATCH 12/46] added missing fields in benchmark_device_segmented_sort --- benchmark/benchmark_device_segmented_sort.cpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp index e2b2a6a2..078ab8dd 100644 --- a/benchmark/benchmark_device_segmented_sort.cpp +++ b/benchmark/benchmark_device_segmented_sort.cpp @@ -376,18 +376,18 @@ void run_sort_pairs_benchmark(benchmark::State &state, .c_str(), \ [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size); }), \ benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), descending") \ + (std::string("sort_keys") + "" + \ + "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), descending") \ .c_str(), \ [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true); }), \ benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), stable") \ + (std::string("sort_keys") + "" + \ + "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), stable") \ .c_str(), \ [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, false, true); }), \ benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), descending, stable") \ + (std::string("sort_keys") + "" + \ + "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), descending, stable") \ .c_str(), \ [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true, true); }) @@ -419,18 +419,18 @@ void add_sort_keys_benchmarks(std::vector &ben .c_str(), \ [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size); }), \ benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), descending") \ + (std::string("sort_pairs") + "" + \ + "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), descending") \ .c_str(), \ [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true); }), \ benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), stable") \ + (std::string("sort_pairs") + "" + \ + "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), stable") \ .c_str(), \ [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, false, true); }), \ benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), descending, stable") \ + (std::string("sort_pairs") + "" + \ + "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), descending, stable") \ .c_str(), \ [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true, true); }) From e0b40a5de6560fb79e0b91a2362c568d68b4a986 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 13:56:48 -0600 Subject: [PATCH 13/46] added missing keys in benchmark_device_select --- benchmark/benchmark_device_select.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index d1617c79..a430cd61 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -518,7 +518,7 @@ benchmark::RegisterBenchmark( \ #define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ benchmark::RegisterBenchmark( \ - ("unique_by_key<" #K ", "#V", unsigned int>(p = " #p")"), \ + ("unique_by_key(Probability:" #p")"), \ &run_unique_by_key_benchmark, size, stream, p \ ) From 40a1db9d5a49bd03134b0a0e95bb8be49045c7e5 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 15:00:11 -0600 Subject: [PATCH 14/46] fixed missing : in bencmark_device_segmented_radix_sort --- benchmark/benchmark_device_segmented_radix_sort.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index 518118fa..d2047866 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -427,7 +427,7 @@ benchmark::RegisterBenchmark( \ #define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "" + \ + (std::string("sort_pairs") + "" + \ "(Segments:~" + std::to_string(SEGMENTS) + " segments), descending" \ ).c_str(), \ [=](benchmark::State& state) { \ From 3f5f91d3317c4d4bc0ccfdc0112628c2080569fa Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 5 Jun 2024 15:03:43 -0600 Subject: [PATCH 15/46] reformated benchmark_device_merge_sort output name --- benchmark/benchmark_device_merge_sort.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp index 6a0f69a2..734d92f2 100644 --- a/benchmark/benchmark_device_merge_sort.cpp +++ b/benchmark/benchmark_device_merge_sort.cpp @@ -233,7 +233,7 @@ void run_sort_pairs_benchmark(benchmark::State& state, #define CREATE_SORT_KEYS_BENCHMARK(T) \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ - (std::string("device_merge_sort.") \ + (std::string("device_merge_sort_sort_keys.") \ ).c_str(), \ [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); } \ ) \ @@ -242,9 +242,9 @@ void run_sort_pairs_benchmark(benchmark::State& state, #define CREATE_SORT_PAIRS_BENCHMARK(T, V) \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ - (std::string("device_merge_sort.") \ + (std::string("device_merge_sort_sort_pairs<" \ + ",Key Datatype:" #T \ + ",Value Datatype:" #V ">.") \ ).c_str(), \ [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size); } \ ) \ From f15bb813b4aa518d26ffc266d8603dd83bf2998f Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Fri, 7 Jun 2024 09:18:57 -0600 Subject: [PATCH 16/46] reverted cmakelist from opt/rocm-6.0.1 to opt/rocm --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 89514e1e..b372fd7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ cmake_minimum_required(VERSION 3.16 FATAL_ERROR) cmake_policy(VERSION 3.16...3.25) # Install prefix -set(CMAKE_INSTALL_PREFIX "/opt/rocm-6.0.1" CACHE PATH "Install path prefix, prepended onto install directories") +set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories") # hipCUB project project(hipcub LANGUAGES CXX) @@ -33,7 +33,7 @@ project(hipcub LANGUAGES CXX) if(WIN32) set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation") else() - set(ROCM_ROOT "/opt/rocm-6.0.1" CACHE PATH "Root directory of the ROCm installation") + set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") endif() From 335c4edf3762022d208269bf41deac10e79e318e Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Mon, 10 Jun 2024 13:12:43 -0600 Subject: [PATCH 17/46] fixed typo in device_spmv --- benchmark/benchmark_device_spmv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark_device_spmv.cpp b/benchmark/benchmark_device_spmv.cpp index 9954a72b..baef8a46 100644 --- a/benchmark/benchmark_device_spmv.cpp +++ b/benchmark/benchmark_device_spmv.cpp @@ -171,7 +171,7 @@ void run_benchmark(benchmark::State& state, #define CREATE_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ - (std::string("device_spmv_CsrMV.") \ ).c_str(), \ &run_benchmark, size, stream, p \ From 355fbf418b73ea336887cddc87d4f5fc4d03937f Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Mon, 10 Jun 2024 13:33:06 -0600 Subject: [PATCH 18/46] added name at start for benchmark output --- benchmark/benchmark_block_adjacent_difference.cpp | 2 ++ benchmark/benchmark_block_radix_rank.cpp | 2 ++ benchmark/benchmark_device_adjacent_difference.cpp | 2 ++ benchmark/benchmark_device_batch_memcpy.cpp | 8 ++++++++ benchmark/benchmark_device_merge_sort.cpp | 2 ++ benchmark/benchmark_device_spmv.cpp | 2 ++ 6 files changed, 18 insertions(+) diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp index 16a51b01..9d2a0372 100644 --- a/benchmark/benchmark_block_adjacent_difference.cpp +++ b/benchmark/benchmark_block_adjacent_difference.cpp @@ -407,6 +407,8 @@ int main(int argc, char *argv[]) int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_block_adjacent_difference" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks diff --git a/benchmark/benchmark_block_radix_rank.cpp b/benchmark/benchmark_block_radix_rank.cpp index 8f6fd5c4..582fab28 100644 --- a/benchmark/benchmark_block_radix_rank.cpp +++ b/benchmark/benchmark_block_radix_rank.cpp @@ -225,6 +225,8 @@ int main(int argc, char* argv[]) int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_block_radix_rank" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks diff --git a/benchmark/benchmark_device_adjacent_difference.cpp b/benchmark/benchmark_device_adjacent_difference.cpp index 29f7aa1d..383de142 100644 --- a/benchmark/benchmark_device_adjacent_difference.cpp +++ b/benchmark/benchmark_device_adjacent_difference.cpp @@ -219,6 +219,8 @@ int main(int argc, char* argv[]) int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_adjacent_difference" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_float2 = benchmark_utils::custom_type; diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp index 14a0e2e8..97fac843 100644 --- a/benchmark/benchmark_device_batch_memcpy.cpp +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -375,6 +375,14 @@ int32_t main(int32_t argc, char* argv[]) const size_t size = parser.get("size"); const int32_t trials = parser.get("trials"); + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_adjacent_difference" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + // HIP hipStream_t stream = hipStreamDefault; // default diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp index 734d92f2..aec81680 100644 --- a/benchmark/benchmark_device_merge_sort.cpp +++ b/benchmark/benchmark_device_merge_sort.cpp @@ -307,6 +307,8 @@ int main(int argc, char *argv[]) int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_merge_sort" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks diff --git a/benchmark/benchmark_device_spmv.cpp b/benchmark/benchmark_device_spmv.cpp index baef8a46..54b4dcc4 100644 --- a/benchmark/benchmark_device_spmv.cpp +++ b/benchmark/benchmark_device_spmv.cpp @@ -202,6 +202,8 @@ int main(int argc, char *argv[]) int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_spmv" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks From 870ac4386b15fb98f40f27f39dab4cb3adf615fd Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 12 Jun 2024 09:45:38 -0600 Subject: [PATCH 19/46] reformated define --- benchmark/benchmark_block_reduce.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp index a9a33909..7918ed7a 100644 --- a/benchmark/benchmark_block_reduce.cpp +++ b/benchmark/benchmark_block_reduce.cpp @@ -135,11 +135,14 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_reduce.Method Name:") + method_name).c_str(), \ - &run_benchmark, \ - stream, size \ +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + (std::string("block_reduce.Method Name:") + method_name).c_str(), \ + &run_benchmark, \ + stream, size \ ) #define BENCHMARK_TYPE(type, block) \ From 9d056306a76f1db86af1a67b2af8f651fbb08b6b Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 12 Jun 2024 09:46:21 -0600 Subject: [PATCH 20/46] reformated create_benchmark define --- benchmark/benchmark_block_run_length_decode.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp index 8ef5def9..0cbfee58 100644 --- a/benchmark/benchmark_block_run_length_decode.cpp +++ b/benchmark/benchmark_block_run_length_decode.cpp @@ -184,11 +184,17 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ - benchmark::RegisterBenchmark( \ - "block_run_length_decode", \ - &run_benchmark, \ - stream, size \ +#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ + benchmark::RegisterBenchmark( \ + std::string("block_run_length_decode").c_str(), \ + &run_benchmark, \ + stream, size \ ) int main(int argc, char *argv[]) From c9b59424bbe21ab939c1f162890bfb136b4f98f0 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 12 Jun 2024 09:48:51 -0600 Subject: [PATCH 21/46] re formated CREATE_BENCHMARK --- benchmark/benchmark_block_scan.cpp | 14 +++++++++----- benchmark/benchmark_block_shuffle.cpp | 16 +++++++++------- benchmark/benchmark_device_batch_copy.cpp | 13 ++++++------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp index f45d3862..5eced17a 100644 --- a/benchmark/benchmark_block_scan.cpp +++ b/benchmark/benchmark_block_scan.cpp @@ -154,11 +154,15 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_scan.Method Name:") + method_name).c_str(), \ - &run_benchmark, \ - stream, size \ +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + (std::string("block_scan.Method Name:") + method_name).c_str(), \ + &run_benchmark, \ + stream, size \ ) // clang-format off diff --git a/benchmark/benchmark_block_shuffle.cpp b/benchmark/benchmark_block_shuffle.cpp index 233e53ad..8e41fff0 100644 --- a/benchmark/benchmark_block_shuffle.cpp +++ b/benchmark/benchmark_block_shuffle.cpp @@ -224,13 +224,15 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) stream, \ size) -#define CREATE_BENCHMARK(BS) \ - benchmark::RegisterBenchmark((std::string("block_shuffle.SubAlgorithm Name:") + name) \ - .c_str(), \ - &run_benchmark, \ - stream, \ - size) +#define CREATE_BENCHMARK(BS) \ + benchmark::RegisterBenchmark( \ + (std::string("block_shuffle.SubAlgorithm Name:") + name \ + ) .c_str(), \ + &run_benchmark, \ + stream, \ + size \ + ) template = true> void add_benchmarks_type(const std::string& name, diff --git a/benchmark/benchmark_device_batch_copy.cpp b/benchmark/benchmark_device_batch_copy.cpp index 5a29f19f..444a9894 100644 --- a/benchmark/benchmark_device_batch_copy.cpp +++ b/benchmark/benchmark_device_batch_copy.cpp @@ -332,17 +332,16 @@ void run_benchmark(benchmark::State& state, "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ ",size_type:" #size_type ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ ",num_blev:" #num_blev ",cfg:default_config}", \ - [=](benchmark::State& state) \ - { \ + [=](benchmark::State& state){ \ run_benchmark, \ size_type>(state, stream, num_tlev, num_wlev, num_blev); \ }) -#define BENCHMARK_TYPE(item_size, item_alignment) \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) +#define BENCHMARK_TYPE(item_size, item_alignment) \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) int32_t main(int32_t argc, char* argv[]) { From 7cae166937b607338fe88febb17b2d5dc09f7832 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 12 Jun 2024 09:50:35 -0600 Subject: [PATCH 22/46] changed Key/Value Type to Key/Value Datatype --- benchmark/benchmark_device_radix_sort.cpp | 82 +++++++++++++---------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp index 292d407d..48538fe5 100644 --- a/benchmark/benchmark_device_radix_sort.cpp +++ b/benchmark/benchmark_device_radix_sort.cpp @@ -394,40 +394,54 @@ void run_sort_pairs_benchmark(benchmark::State& state, } -#define CREATE_SORT_KEYS_BENCHMARK(Key) \ - { \ - auto keys_input = std::make_shared>(generate_keys(size)); \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "").c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size, keys_input); } \ - ) \ - ); \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + ", descending").c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size, keys_input); } \ - ) \ - ); \ - } - -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ - { \ - auto keys_input = std::make_shared>(generate_keys(size)); \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "").c_str(), \ - [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size, keys_input); } \ - ) \ - ); \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + ", descending").c_str(), \ - [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size, keys_input); } \ - ) \ - ); \ - } - +#define CREATE_SORT_KEYS_BENCHMARK(Key){ \ + auto keys_input = std::make_shared>(generate_keys(size)); \ + benchmarks.push_back( \ + benchmark::RegisterBenchmark( \ + std::string(std::string("device_radix_sort_keys_ascending") \ + + "").c_str(), \ + [=](benchmark::State& state) { \ + run_sort_keys_benchmark(state, stream, size, keys_input); \ + } \ + ) \ + ); \ + benchmarks.push_back( \ + benchmark::RegisterBenchmark( \ + std::string(std::string("device_radix_sort_keys_descending") \ + + "").c_str(), \ + [=](benchmark::State& state){ \ + run_sort_keys_benchmark(state, stream, size, keys_input); \ + } \ + ) \ + ); \ +} + + +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value){ \ + auto keys_input = std::make_shared>(generate_keys(size)); \ + benchmarks.push_back( \ + benchmark::RegisterBenchmark( \ + std::string(std::string("device_radix_sort_pairs_ascending") \ + + "").c_str(), \ + [=](benchmark::State& state){ \ + run_sort_pairs_benchmark(state, stream, size, keys_input); \ + } \ + ) \ + ); \ + benchmarks.push_back( \ + benchmark::RegisterBenchmark( \ + std::string(std::string("device_radix_sort_pairs_descending") \ + + "").c_str(), \ + [=](benchmark::State& state){ \ + run_sort_pairs_benchmark(state, stream, size, keys_input); \ + } \ + ) \ + ); \ +} void add_sort_keys_benchmarks(std::vector& benchmarks, hipStream_t stream, From 30feb2b442724c17ffc76ca7eac628b1a124b854 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 12 Jun 2024 09:51:15 -0600 Subject: [PATCH 23/46] changed Key/Value Type to Key/Value Datatype --- benchmark/benchmark_device_reduce_by_key.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/benchmark/benchmark_device_reduce_by_key.cpp b/benchmark/benchmark_device_reduce_by_key.cpp index 7437383f..734e0623 100644 --- a/benchmark/benchmark_device_reduce_by_key.cpp +++ b/benchmark/benchmark_device_reduce_by_key.cpp @@ -161,13 +161,15 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea HIP_CHECK(hipFree(d_unique_count_output)); } -#define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \ -benchmark::RegisterBenchmark( \ - (std::string("reduce_by_key") + "" + \ - "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ - ).c_str(), \ - &run_benchmark, \ - max_length, stream, size, REDUCE_OP() \ +#define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \ +benchmark::RegisterBenchmark( \ + (std::string("reduce_by_key") + "" + \ + "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ + ).c_str(), \ + &run_benchmark, \ + max_length, stream, size, REDUCE_OP() \ ) #define CREATE_BENCHMARKS(REDUCE_OP) \ From 9f2341eb0c4b9254a2996e0b4c7d4f40738e0b1c Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 12 Jun 2024 10:05:18 -0600 Subject: [PATCH 24/46] reformated CREATE_BENCHMARK --- benchmark/benchmark_device_reduce.cpp | 8 +-- .../benchmark_device_run_length_encode.cpp | 16 ++--- benchmark/benchmark_device_scan.cpp | 20 +++--- .../benchmark_device_segmented_radix_sort.cpp | 62 +++++++++++-------- .../benchmark_device_segmented_reduce.cpp | 18 +++--- benchmark/benchmark_device_segmented_sort.cpp | 4 +- 6 files changed, 71 insertions(+), 57 deletions(-) diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp index a58ea8dc..b82195de 100644 --- a/benchmark/benchmark_device_reduce.cpp +++ b/benchmark/benchmark_device_reduce.cpp @@ -132,10 +132,10 @@ struct Benchmark { } }; -#define CREATE_BENCHMARK(T, REDUCE_OP) \ -benchmark::RegisterBenchmark( \ - ("reduce"), \ - &Benchmark::run, size, stream \ +#define CREATE_BENCHMARK(T, REDUCE_OP) \ + benchmark::RegisterBenchmark( \ + ("reduce"), \ + &Benchmark::run, size, stream \ ) #define CREATE_BENCHMARKS(REDUCE_OP) \ diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp index 8a20433d..86250ba6 100644 --- a/benchmark/benchmark_device_run_length_encode.cpp +++ b/benchmark/benchmark_device_run_length_encode.cpp @@ -245,14 +245,14 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, HIP_CHECK(hipFree(d_runs_count_output)); } -#define CREATE_ENCODE_BENCHMARK(T) \ -benchmark::RegisterBenchmark( \ - (std::string("run_length_encode") + "" + \ - "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ - ).c_str(), \ - &run_encode_benchmark, \ - max_length, stream, size \ -) +#define CREATE_ENCODE_BENCHMARK(T) \ + benchmark::RegisterBenchmark( \ + (std::string("run_length_encode") + "" + \ + "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ + ).c_str(), \ + &run_encode_benchmark, \ + max_length, stream, size \ + ) void add_encode_benchmarks(size_t max_length, std::vector& benchmarks, diff --git a/benchmark/benchmark_device_scan.cpp b/benchmark/benchmark_device_scan.cpp index 897f5eec..753d812a 100644 --- a/benchmark/benchmark_device_scan.cpp +++ b/benchmark/benchmark_device_scan.cpp @@ -303,16 +303,16 @@ void run_benchmark_by_key(benchmark::State& state, HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ -benchmark::RegisterBenchmark( \ - (std::string(EXCL ? "exclusive_scan" : "inclusive_scan") + \ - ("")).c_str(), \ - &run_benchmark, size, stream, SCAN_OP() \ -), \ -benchmark::RegisterBenchmark( \ - (std::string(EXCL ? "exclusive_scan_by_key" : "inclusive_scan_by_key") + \ - ("")).c_str(), \ - &run_benchmark_by_key, size, stream, SCAN_OP() \ +#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ +benchmark::RegisterBenchmark( \ + (std::string(EXCL ? "device_exclusive_scan" : "device_inclusive_scan") + \ + ("")).c_str(), \ + &run_benchmark, size, stream, SCAN_OP() \ +), \ +benchmark::RegisterBenchmark( \ + (std::string(EXCL ? "device_exclusive_scan_by_key" : "device_inclusive_scan_by_key") + \ + ("")).c_str(), \ + &run_benchmark_by_key, size, stream, SCAN_OP() \ ) #define CREATE_BENCHMARKS(SCAN_OP) \ diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index d2047866..916f7e1d 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -372,20 +372,26 @@ void run_sort_pairs_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "" + \ - "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ - ).c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, Ascending); } \ +#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ +benchmark::RegisterBenchmark( \ + (std::string("device_segmented_radix_sort_keys_ascending") \ + + "" + \ + "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size, Ascending); \ + } \ ) -#define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "" + \ - "(Segments:~" + std::to_string(SEGMENTS) + " segments), descending" \ - ).c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, Descending); } \ +#define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ +benchmark::RegisterBenchmark( \ + (std::string("device_segmented_radix_sort_keys_descending") \ + + "" + \ + "(Segments:~" + std::to_string(SEGMENTS) + " segments), " \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size, Descending); \ + } \ ) #define BENCHMARK_KEY_TYPE(type) \ @@ -416,22 +422,26 @@ void add_sort_keys_benchmarks(std::vector& benc benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "" + \ - "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ - ).c_str(), \ - [=](benchmark::State& state) { \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Ascending); } \ +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ +benchmark::RegisterBenchmark( \ + (std::string("device_segmented_radix_sort_pairs_ascending") \ + + "" + \ + "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Ascending); \ + } \ ) -#define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "" + \ - "(Segments:~" + std::to_string(SEGMENTS) + " segments), descending" \ - ).c_str(), \ - [=](benchmark::State& state) { \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Descending); } \ +#define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ +benchmark::RegisterBenchmark( \ + (std::string("device_segmented_radix_sort_pairs_descending") \ + + "" + \ + "(Segments:~" + std::to_string(SEGMENTS) + " segments), descending" \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Descending);\ + } \ ) #define BENCHMARK_PAIR_TYPE(type, value) \ diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp index ca7cb950..69804192 100644 --- a/benchmark/benchmark_device_segmented_reduce.cpp +++ b/benchmark/benchmark_device_segmented_reduce.cpp @@ -194,13 +194,17 @@ struct Benchmark { } }; -#define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \ -benchmark::RegisterBenchmark( \ - (std::string("segmented_reduce") + "" + \ - "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)" \ - ).c_str(), \ - &Benchmark::run, \ - SEGMENTS, stream, size \ +#define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \ +benchmark::RegisterBenchmark( \ + (std::string("device_segmented_reduce") \ + + "" + \ + "(Number of segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ + ).c_str(), \ + &Benchmark::run, \ + SEGMENTS, stream, size \ ) #define BENCHMARK_TYPE(type, REDUCE_OP) \ diff --git a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp index 078ab8dd..12b3d408 100644 --- a/benchmark/benchmark_device_segmented_sort.cpp +++ b/benchmark/benchmark_device_segmented_sort.cpp @@ -371,12 +371,12 @@ void run_sort_pairs_benchmark(benchmark::State &state, #define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "" + \ + (std::string("device_segmented_sort_keys_ascending") + "" + \ "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size); }), \ benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "" + \ + (std::string("device_segmented_sort_keys_ascending_descending") + "" + \ "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), descending") \ .c_str(), \ [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true); }), \ From 83efde8f5bf0f7e47b3d7f3e87ae7a3923198b29 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 12 Jun 2024 11:27:36 -0600 Subject: [PATCH 25/46] changed Key/Value Type to Key/Value Datatype --- .../benchmark_device_segmented_radix_sort.cpp | 26 +-- benchmark/benchmark_device_segmented_sort.cpp | 153 +++++++++++++----- 2 files changed, 126 insertions(+), 53 deletions(-) diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index 916f7e1d..da4abdf4 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -374,8 +374,9 @@ void run_sort_pairs_benchmark(benchmark::State& state, #define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_radix_sort_keys_ascending") \ - + "" + \ + (std::string("device_segmented_radix_sort_keys") \ + + "" + \ "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ ).c_str(), \ [=](benchmark::State& state){ \ @@ -385,9 +386,10 @@ benchmark::RegisterBenchmark( #define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_radix_sort_keys_descending") \ - + "" + \ - "(Segments:~" + std::to_string(SEGMENTS) + " segments), " \ + (std::string("device_segmented_radix_sort_keys") \ + + "" + \ + "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ ).c_str(), \ [=](benchmark::State& state){ \ run_sort_keys_benchmark(state, SEGMENTS, stream, size, Descending); \ @@ -424,8 +426,10 @@ void add_sort_keys_benchmarks(std::vector& benc #define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_radix_sort_pairs_ascending") \ - + "" + \ + (std::string("device_segmented_radix_sort_pairs") \ + + "" + \ "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ ).c_str(), \ [=](benchmark::State& state){ \ @@ -435,9 +439,11 @@ benchmark::RegisterBenchmark( #define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_radix_sort_pairs_descending") \ - + "" + \ - "(Segments:~" + std::to_string(SEGMENTS) + " segments), descending" \ + (std::string("device_segmented_radix_sort_pairs") \ + + "" + \ + "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ ).c_str(), \ [=](benchmark::State& state){ \ run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Descending);\ diff --git a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp index 12b3d408..8015a641 100644 --- a/benchmark/benchmark_device_segmented_sort.cpp +++ b/benchmark/benchmark_device_segmented_sort.cpp @@ -369,27 +369,59 @@ void run_sort_pairs_benchmark(benchmark::State &state, HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ - benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_keys_ascending") + "" + \ - "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size); }), \ - benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_keys_ascending_descending") + "" + \ - "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), descending") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true); }), \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "" + \ - "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), stable") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, false, true); }), \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "" + \ - "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), descending, stable") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true, true); }) +#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_keys") \ + + "" + \ + "(Number of segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ + ).c_str(), \ + [=](benchmark::State &state){ \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size); \ + } \ + ), \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_keys") \ + + "" + \ + "(Number of segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ + ).c_str(), \ + [=](benchmark::State &state){ \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size, true); \ + } \ + ), \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_keys") \ + + "" + \ + "(Number of segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ + ).c_str(), \ + [=](benchmark::State &state){ \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size, false, true); \ + } \ + ), \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_keys") \ + + "" + \ + "(Number of segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ + ).c_str(), \ + [=](benchmark::State &state){ \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size, true, true); \ + } \ + ) \ #define BENCHMARK_KEY_TYPE(type) \ CREATE_SORT_KEYS_BENCHMARK(type, 10), \ @@ -412,28 +444,63 @@ void add_sort_keys_benchmarks(std::vector &ben benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "" + \ - "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size); }), \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "" + \ - "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), descending") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true); }), \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "" + \ - "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), stable") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, false, true); }), \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "" + \ - "(Number of segments:~" + std::to_string(SEGMENTS) + " segments), descending, stable") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true, true); }) - +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_pairs") \ + + "" + \ + "(Number of segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ + ).c_str(), \ + [=](benchmark::State &state){ \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, size); \ + } \ + ), \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_pairs") \ + + "" + \ + "(Number of segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ + ).c_str(), \ + [=](benchmark::State &state){ \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true); \ + } \ + ), \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_pairs") \ + + "" + \ + "(Number of segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ + ).c_str(), \ + [=](benchmark::State &state){ \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, size, false, true); \ + } \ + ), \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_pairs") \ + + "" + \ + "(Number of segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ + ).c_str(), \ + [=](benchmark::State &state){ \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true, true); \ + } \ + ) #define BENCHMARK_PAIR_TYPE(type, value) \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ From a4b9d74c36d2dbc8368024d3198863aa42887f2f Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 12 Jun 2024 12:07:23 -0600 Subject: [PATCH 26/46] re formated CREATE_BENCHMARK --- .../benchmark_block_adjacent_difference.cpp | 4 +- benchmark/benchmark_block_discontinuity.cpp | 15 +++-- benchmark/benchmark_block_exchange.cpp | 15 +++-- benchmark/benchmark_block_histogram.cpp | 17 ++++-- benchmark/benchmark_block_merge_sort.cpp | 15 +++-- benchmark/benchmark_block_radix_rank.cpp | 5 +- benchmark/benchmark_block_radix_sort.cpp | 22 ++++--- benchmark/benchmark_block_reduce.cpp | 17 +++--- .../benchmark_block_run_length_decode.cpp | 3 +- benchmark/benchmark_block_scan.cpp | 3 +- benchmark/benchmark_block_shuffle.cpp | 44 ++++++++------ .../benchmark_device_adjacent_difference.cpp | 4 +- benchmark/benchmark_device_batch_copy.cpp | 15 +++-- benchmark/benchmark_device_select.cpp | 40 +++++++------ benchmark/benchmark_warp_exchange.cpp | 57 ++++++++++++------- 15 files changed, 170 insertions(+), 106 deletions(-) diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp index 9d2a0372..89cc1636 100644 --- a/benchmark/benchmark_block_adjacent_difference.cpp +++ b/benchmark/benchmark_block_adjacent_difference.cpp @@ -340,11 +340,11 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) # define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ benchmark::RegisterBenchmark( \ - (std::string("block_adjacent_difference.SubAlgorithm Name:") + name \ + std::string("") \ + ",With Tile:" #WITH_TILE ">" \ ).c_str(), \ &run_benchmark, \ stream, \ diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp index 72d925ec..15ce5a98 100644 --- a/benchmark/benchmark_block_discontinuity.cpp +++ b/benchmark/benchmark_block_discontinuity.cpp @@ -238,11 +238,16 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ -benchmark::RegisterBenchmark( \ - (std::string("block_discontinuity.SubAlgorithm Name:") + name + ("")).c_str(), \ - &run_benchmark, \ - stream, size \ +#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ +benchmark::RegisterBenchmark( \ + std::string("block_discontinuity.SubAlgorithm Name:" \ + + name \ + + "." \ + ).c_str(), \ + &run_benchmark, \ + stream, size \ ) #define BENCHMARK_TYPE(type, block, bool) \ diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp index 278a6190..8d62979c 100644 --- a/benchmark/benchmark_block_exchange.cpp +++ b/benchmark/benchmark_block_exchange.cpp @@ -293,11 +293,16 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT) \ -benchmark::RegisterBenchmark( \ - (std::string("block_exchange.SubAlgorithm Name:") + name).c_str(), \ - &run_benchmark, \ - stream, size \ +#define CREATE_BENCHMARK(T, BS, IPT) \ +benchmark::RegisterBenchmark( \ + std::string("block_exchange.SubAlgorithm Name:" \ + + name \ + ).c_str(), \ + &run_benchmark, \ + stream, size \ ) #define BENCHMARK_TYPE(type, block) \ diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp index e247a13b..0771a0cc 100644 --- a/benchmark/benchmark_block_histogram.cpp +++ b/benchmark/benchmark_block_histogram.cpp @@ -143,11 +143,18 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_histogram.Method Name:") + method_name).c_str(), \ - &run_benchmark, \ - stream, size \ +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + std::string("block_histogram.Method Name:" \ + + method_name \ + ).c_str(), \ + &run_benchmark, \ + stream, size \ ) #define BENCHMARK_TYPE(type, block) \ diff --git a/benchmark/benchmark_block_merge_sort.cpp b/benchmark/benchmark_block_merge_sort.cpp index 14407a62..1cd47c9b 100644 --- a/benchmark/benchmark_block_merge_sort.cpp +++ b/benchmark/benchmark_block_merge_sort.cpp @@ -177,11 +177,16 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT) \ -benchmark::RegisterBenchmark( \ - (std::string("block_merge_sort.SubAlgorithm Name:") + name).c_str(), \ - &run_benchmark, \ - benchmark_kind, stream, size \ +#define CREATE_BENCHMARK(T, BS, IPT) \ +benchmark::RegisterBenchmark( \ + std::string("block_merge_sort.SubAlgorithm Name:" \ + + name \ + ).c_str(), \ + &run_benchmark, \ + benchmark_kind, stream, size \ ) #define BENCHMARK_TYPE(type, block) \ diff --git a/benchmark/benchmark_block_radix_rank.cpp b/benchmark/benchmark_block_radix_rank.cpp index 582fab28..e44a6495 100644 --- a/benchmark/benchmark_block_radix_rank.cpp +++ b/benchmark/benchmark_block_radix_rank.cpp @@ -159,11 +159,12 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(T, KIND, BS, IPT) \ benchmark::RegisterBenchmark( \ - (std::string("block_radix_rank.") + name \ + ">." \ + + name \ ).c_str(), \ &run_benchmark, \ stream, \ diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp index dbd13fea..1e3fe347 100644 --- a/benchmark/benchmark_block_radix_sort.cpp +++ b/benchmark/benchmark_block_radix_sort.cpp @@ -245,15 +245,19 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark((std::string("block_radix_sort.SubAlgorithm Name:") \ - + name) \ - .c_str(), \ - &run_benchmark, \ - benchmark_kind, \ - stream, \ - size) +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + std::string("block_radix_sort.SubAlgorithm Name:" \ + + name \ + ).c_str(), \ + &run_benchmark, \ + benchmark_kind, \ + stream, \ + size \ + ) // clang-format off #define BENCHMARK_TYPE(type, block) \ diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp index 7918ed7a..00571d11 100644 --- a/benchmark/benchmark_block_reduce.cpp +++ b/benchmark/benchmark_block_reduce.cpp @@ -135,14 +135,15 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_reduce.Method Name:") + method_name).c_str(), \ - &run_benchmark, \ - stream, size \ +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + std::string("block_reduce.Method Name:" + method_name \ + ).c_str(), \ + &run_benchmark, \ + stream, size \ ) #define BENCHMARK_TYPE(type, block) \ diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp index 0cbfee58..6de3cb41 100644 --- a/benchmark/benchmark_block_run_length_decode.cpp +++ b/benchmark/benchmark_block_run_length_decode.cpp @@ -192,7 +192,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) ",Max RunLength:"#MAXRL \ ",BlockSize: "#BS \ ",Runs Per Thread:"#RPT \ - ",Decoded Items Per Thread:"#DIPT">").c_str(), \ + ",Decoded Items Per Thread:"#DIPT">." \ + ).c_str(), \ &run_benchmark, \ stream, size \ ) diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp index 5eced17a..f38267e4 100644 --- a/benchmark/benchmark_block_scan.cpp +++ b/benchmark/benchmark_block_scan.cpp @@ -160,7 +160,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) ",Block Size:"#BS \ ",Items Per Thread:"#IPT \ ",SubAlgorithm Name:" + algorithm_name \ - + ">.Method Name:") + method_name).c_str(), \ + + ">.Method Name:") + method_name \ + ).c_str(), \ &run_benchmark, \ stream, size \ ) diff --git a/benchmark/benchmark_block_shuffle.cpp b/benchmark/benchmark_block_shuffle.cpp index 8e41fff0..8421dd3b 100644 --- a/benchmark/benchmark_block_shuffle.cpp +++ b/benchmark/benchmark_block_shuffle.cpp @@ -214,24 +214,32 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK_IPT(BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_shuffle.SubAlgorithm Name:") \ - + name) \ - .c_str(), \ - &run_benchmark, \ - stream, \ - size) - -#define CREATE_BENCHMARK(BS) \ - benchmark::RegisterBenchmark( \ - (std::string("block_shuffle.SubAlgorithm Name:") + name \ - ) .c_str(), \ - &run_benchmark, \ - stream, \ - size \ +#define CREATE_BENCHMARK_IPT(BS, IPT) \ + benchmark::RegisterBenchmark( \ + ("block_shuffle.SubAlgorithm Name:" \ + + name \ + ).c_str(), \ + &run_benchmark, \ + stream, \ + size \ + ) + +#define CREATE_BENCHMARK(BS) \ + benchmark::RegisterBenchmark( \ + ("block_shuffle.SubAlgorithm Name:" \ + + name \ + ).c_str(), \ + &run_benchmark, \ + stream, \ + size \ ) template = true> diff --git a/benchmark/benchmark_device_adjacent_difference.cpp b/benchmark/benchmark_device_adjacent_difference.cpp index 383de142..99f2a278 100644 --- a/benchmark/benchmark_device_adjacent_difference.cpp +++ b/benchmark/benchmark_device_adjacent_difference.cpp @@ -182,9 +182,9 @@ using namespace std::string_literals; #define CREATE_BENCHMARK(T, left, copy) \ benchmark::RegisterBenchmark( \ - (std::string("device_adjacent_difference" \ + std::string("device_adjacent_difference" \ "." \ - "SubAlgorithm Name:Subtract") \ + "SubAlgorithm Name:Subtract" \ + std::string(left ? "Left" : "Right") \ + std::string(copy ? "Copy" : "") \ ).c_str(), \ diff --git a/benchmark/benchmark_device_batch_copy.cpp b/benchmark/benchmark_device_batch_copy.cpp index 444a9894..6e74c1aa 100644 --- a/benchmark/benchmark_device_batch_copy.cpp +++ b/benchmark/benchmark_device_batch_copy.cpp @@ -327,14 +327,17 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev) \ +#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ benchmark::RegisterBenchmark( \ - "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ - ",size_type:" #size_type ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ - ",num_blev:" #num_blev ",cfg:default_config}", \ + // "{lvl:device,IS:" #IS ",IA:" #IA + // ",T:" #T ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev + // ",num_blev:" #num_blev ",cfg:default_config}", + std::string("device_batch_copy" + "<" + ).c_str() [=](benchmark::State& state){ \ - run_benchmark, \ - size_type>(state, stream, num_tlev, num_wlev, num_blev); \ + run_benchmark, \ + T>(state, stream, num_tlev, num_wlev, num_blev); \ }) #define BENCHMARK_TYPE(item_size, item_alignment) \ diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index a430cd61..b6cfb596 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -498,28 +498,36 @@ void run_unique_by_key_benchmark(benchmark::State& state, hipFree(d_temp_storage); } -#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ -benchmark::RegisterBenchmark( \ - ("select_flagged(Probability:" #p")"), \ - &run_flagged_benchmark, size, stream, p \ +#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ +benchmark::RegisterBenchmark( \ + ("device_select_flagged(Probability:" #p")"), \ + &run_flagged_benchmark, size, stream, p \ ) -#define CREATE_SELECT_IF_BENCHMARK(T, p) \ -benchmark::RegisterBenchmark( \ - ("select_if(Probability:" #p")"), \ - &run_selectop_benchmark, size, stream, p \ +#define CREATE_SELECT_IF_BENCHMARK(T, p) \ +benchmark::RegisterBenchmark( \ + ("device_select_if(Probability:" #p")"), \ + &run_selectop_benchmark, size, stream, p \ ) -#define CREATE_UNIQUE_BENCHMARK(T, p) \ -benchmark::RegisterBenchmark( \ - ("unique(Probability:" #p")"), \ - &run_unique_benchmark, size, stream, p \ +#define CREATE_UNIQUE_BENCHMARK(T, p) \ +benchmark::RegisterBenchmark( \ + ("device_select_unique(Probability:" #p")"), \ + &run_unique_benchmark, size, stream, p \ ) -#define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ -benchmark::RegisterBenchmark( \ - ("unique_by_key(Probability:" #p")"), \ - &run_unique_by_key_benchmark, size, stream, p \ +#define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ +benchmark::RegisterBenchmark( \ + ("device_select_unique_by_key(Probability:" #p")"), \ + &run_unique_by_key_benchmark, size, stream, p \ ) #define BENCHMARK_FLAGGED_TYPE(type, value) \ diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp index be5ce636..bdc2f5d0 100644 --- a/benchmark/benchmark_warp_exchange.cpp +++ b/benchmark/benchmark_warp_exchange.cpp @@ -239,27 +239,42 @@ struct BlockedToStripedOp } }; -#define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark("warp_exchange_striped_to_blocked.", \ - &run_benchmark, \ - stream, \ - size) - -#define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark("warp_exchange_blocked_to_striped.", \ - &run_benchmark, \ - stream, \ - size) - -#define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \ -benchmark::RegisterBenchmark( \ - "warp_exchange_scatter_to_striped.", \ - &run_benchmark_scatter_to_striped, \ - stream, size \ +#define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark( \ + std::string("warp_exchange_striped_to_blocked" \ + ).c_str(), \ + &run_benchmark, \ + stream, \ + size \ + ) + +#define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark( \ + std::string("warp_exchange_blocked_to_striped" \ + ).c_str(), \ + &run_benchmark, \ + stream, \ + size \ + ) + +#define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \ +benchmark::RegisterBenchmark( \ + std::string("warp_exchange_scatter_to_striped" \ + ).c_str(), \ + &run_benchmark_scatter_to_striped, \ + stream, size \ ) int main(int argc, char *argv[]) From 4c8a8e6c88b79686f34a4daceb908115ffff2d1f Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 12 Jun 2024 12:14:01 -0600 Subject: [PATCH 27/46] converted benchmark_device_batch_copy.cpp to hipCUB format (was rocPRIM) --- benchmark/benchmark_device_batch_copy.cpp | 27 +++++++++++++---------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/benchmark/benchmark_device_batch_copy.cpp b/benchmark/benchmark_device_batch_copy.cpp index 6e74c1aa..8aa449d4 100644 --- a/benchmark/benchmark_device_batch_copy.cpp +++ b/benchmark/benchmark_device_batch_copy.cpp @@ -327,18 +327,21 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ - benchmark::RegisterBenchmark( \ - // "{lvl:device,IS:" #IS ",IA:" #IA - // ",T:" #T ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev - // ",num_blev:" #num_blev ",cfg:default_config}", - std::string("device_batch_copy" - "<" - ).c_str() - [=](benchmark::State& state){ \ - run_benchmark, \ - T>(state, stream, num_tlev, num_wlev, num_blev); \ - }) +#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ + benchmark::RegisterBenchmark( \ + std::string("device_batch_copy" \ + "." \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_benchmark, \ + T>(state, stream, num_tlev, num_wlev, num_blev); \ + } \ + ) #define BENCHMARK_TYPE(item_size, item_alignment) \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ From 943735c95d5ab3e4ab4c891bd4572b42829f3722 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 12 Jun 2024 13:24:15 -0600 Subject: [PATCH 28/46] made sure all benchmarks have same format --- benchmark/benchmark_device_batch_memcpy.cpp | 4 +- benchmark/benchmark_device_histogram.cpp | 104 +++++++++++------- benchmark/benchmark_device_memory.cpp | 51 +++++---- benchmark/benchmark_device_merge_sort.cpp | 43 ++++---- benchmark/benchmark_device_partition.cpp | 53 ++++++--- benchmark/benchmark_device_radix_sort.cpp | 28 +++-- benchmark/benchmark_device_reduce.cpp | 14 ++- benchmark/benchmark_device_reduce_by_key.cpp | 22 ++-- .../benchmark_device_run_length_encode.cpp | 17 +-- benchmark/benchmark_device_scan.cpp | 32 ++++-- .../benchmark_device_segmented_radix_sort.cpp | 50 +++++---- .../benchmark_device_segmented_reduce.cpp | 7 +- benchmark/benchmark_device_segmented_sort.cpp | 32 +++--- benchmark/benchmark_device_select.cpp | 8 +- benchmark/benchmark_warp_exchange.cpp | 6 +- benchmark/benchmark_warp_load.cpp | 16 ++- benchmark/benchmark_warp_merge_sort.cpp | 24 ++-- benchmark/benchmark_warp_reduce.cpp | 17 ++- benchmark/benchmark_warp_scan.cpp | 20 ++-- benchmark/benchmark_warp_store.cpp | 15 ++- 20 files changed, 339 insertions(+), 224 deletions(-) diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp index 97fac843..6e0ae35e 100644 --- a/benchmark/benchmark_device_batch_memcpy.cpp +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -339,12 +339,12 @@ void run_benchmark(benchmark::State& state, #define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ benchmark::RegisterBenchmark( \ - (std::string("device_batch_memcpy.") \ + ",Number of Blev:" #num_blev ">." \ ).c_str(), \ [=](benchmark::State& state){ \ run_benchmark, \ diff --git a/benchmark/benchmark_device_histogram.cpp b/benchmark/benchmark_device_histogram.cpp index 00f441b1..943ffebb 100644 --- a/benchmark/benchmark_device_histogram.cpp +++ b/benchmark/benchmark_device_histogram.cpp @@ -506,16 +506,24 @@ struct num_limits<__half> }; }; -#define CREATE_EVEN_BENCHMARK(VECTOR, T, BINS, SCALE) \ - if(num_limits::max() > BINS * SCALE) \ - { \ - VECTOR.push_back(benchmark::RegisterBenchmark( \ - (std::string("histogram_even") + "" + "(Entropy Percent:" \ - + std::to_string(get_entropy_percents(entropy_reduction)) + "%,Bin Count:" \ - + std::to_string(BINS) + " bins)") \ - .c_str(), \ - [=](benchmark::State& state) \ - { run_even_benchmark(state, BINS, SCALE, entropy_reduction, stream, size); })); \ +#define CREATE_EVEN_BENCHMARK(VECTOR, T, BINS, SCALE) \ + if(num_limits::max() > BINS * SCALE){ \ + VECTOR.push_back( \ + benchmark::RegisterBenchmark( \ + std::string("device_histogram_even" \ + "." \ + "(Entropy Percent:" \ + + std::to_string(get_entropy_percents(entropy_reduction)) \ + + "%,Bin Count:" \ + + std::to_string(BINS) \ + + " bins)" \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_even_benchmark(state, BINS, SCALE, entropy_reduction, stream, size); \ + } \ + ) \ + ); \ } #define BENCHMARK_TYPE(VECTOR, T) \ @@ -545,18 +553,24 @@ void add_even_benchmarks(std::vector& benchmark }; } -#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ -benchmark::RegisterBenchmark( \ - (std::string("multi_histogram_even") + "" + \ - "(Entropy Percent:" + std::to_string(get_entropy_percents(entropy_reduction)) + "%,Bin Count:" + \ - std::to_string(BINS) + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state) { \ - run_multi_even_benchmark( \ - state, BINS, SCALE, entropy_reduction, stream, size \ - ); \ - } \ -) +#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ + benchmark::RegisterBenchmark( \ + std::string("device_multi_histogram_even" \ + "." \ + "(Entropy Percent:" \ + + std::to_string(get_entropy_percents(entropy_reduction)) \ + + "%,Bin Count:" + \ + std::to_string(BINS) \ + + " bins)" \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_multi_even_benchmark( \ + state, BINS, SCALE, entropy_reduction, stream, size \ + ); \ + } \ + ) void add_multi_even_benchmarks(std::vector& benchmarks, hipStream_t stream, @@ -580,13 +594,18 @@ void add_multi_even_benchmarks(std::vector& ben }; } -#define CREATE_RANGE_BENCHMARK(T, BINS) \ -benchmark::RegisterBenchmark( \ - (std::string("histogram_range") + "" + \ - "(Bin Count:" + std::to_string(BINS) + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state) { run_range_benchmark(state, BINS, stream, size); } \ -) +#define CREATE_RANGE_BENCHMARK(T, BINS) \ + benchmark::RegisterBenchmark( \ + std::string("device_histogram_range" \ + "." \ + "(Bin Count:" \ + + std::to_string(BINS) \ + + " bins)" \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_range_benchmark(state, BINS, stream, size); \ + } \ + ) #define BENCHMARK_RANGE_TYPE(T) \ CREATE_RANGE_BENCHMARK(T, 10), CREATE_RANGE_BENCHMARK(T, 100), \ @@ -602,17 +621,22 @@ void add_range_benchmarks(std::vector& benchmar benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ -benchmark::RegisterBenchmark( \ - (std::string("multi_histogram_range") + "" + \ - "(Bin Count:" + std::to_string(BINS) + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state) { \ - run_multi_range_benchmark( \ - state, BINS, stream, size \ - ); \ - } \ -) +#define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ + benchmark::RegisterBenchmark( \ + std::string("device_multi_histogram_range" \ + ".(Bin Count:" \ + + std::to_string(BINS) \ + + " bins)" \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_multi_range_benchmark( \ + state, BINS, stream, size \ + ); \ + } \ + ) void add_multi_range_benchmarks(std::vector& benchmarks, hipStream_t stream, diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index 079cc19e..397b9c65 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -401,30 +401,33 @@ void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_ HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BS, IPT) \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("device_memory.") \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_benchmark(state, SIZE, stream); \ - } \ - ) \ - ); \ - -#define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("device_memory_memcpy.") \ - ).c_str(), \ - [=](benchmark::State& state) { run_benchmark_memcpy(state, SIZE, stream); } \ - ) \ - ); \ +#define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BS, IPT) \ + benchmarks.push_back( \ + benchmark::RegisterBenchmark( \ + std::string("device_memory." \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_benchmark(state, SIZE, stream); \ + } \ + ) \ + ); \ + +#define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ + benchmarks.push_back( \ + benchmark::RegisterBenchmark( \ + std::string("device_memory_memcpy." \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_benchmark_memcpy(state, SIZE, stream); \ + } \ + ) \ + ); \ // clang-format off #define CREATE_BENCHMARK_BLOCK_SIZE(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE) \ diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp index aec81680..bac53105 100644 --- a/benchmark/benchmark_device_merge_sort.cpp +++ b/benchmark/benchmark_device_merge_sort.cpp @@ -230,25 +230,30 @@ void run_sort_pairs_benchmark(benchmark::State& state, } -#define CREATE_SORT_KEYS_BENCHMARK(T) \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("device_merge_sort_sort_keys.") \ - ).c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); } \ - ) \ - ); \ - -#define CREATE_SORT_PAIRS_BENCHMARK(T, V) \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("device_merge_sort_sort_pairs<" \ - ",Key Datatype:" #T \ - ",Value Datatype:" #V ">.") \ - ).c_str(), \ - [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size); } \ - ) \ - ); \ +#define CREATE_SORT_KEYS_BENCHMARK(T) \ + benchmarks.push_back( \ + benchmark::RegisterBenchmark( \ + std::string("device_merge_sort_sort_keys" \ + "." \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_sort_keys_benchmark(state, stream, size); \ + } \ + ) \ + ); \ + +#define CREATE_SORT_PAIRS_BENCHMARK(T, V) \ + benchmarks.push_back( \ + benchmark::RegisterBenchmark( \ + std::string("device_merge_sort_sort_pairs<" \ + ",Key Datatype:" #T \ + ",Value Datatype:" #V ">." \ + ).c_str(), \ + [=](benchmark::State& state){ \ + run_sort_pairs_benchmark(state, stream, size); \ + } \ + ) \ + ); \ void add_sort_keys_benchmarks(std::vector& benchmarks, diff --git a/benchmark/benchmark_device_partition.cpp b/benchmark/benchmark_device_partition.cpp index 1ead466d..68c5b076 100644 --- a/benchmark/benchmark_device_partition.cpp +++ b/benchmark/benchmark_device_partition.cpp @@ -352,23 +352,42 @@ void run_threeway(benchmark::State& state, HIP_CHECK(hipFree(d_num_selected_output)); } -#define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T) \ -benchmark::RegisterBenchmark( \ - "parition_flagged(Split Threshold:" #SPLIT_T "%)", \ - &run_flagged, stream, static_cast(SPLIT_T), size \ -) - -#define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T) \ -benchmark::RegisterBenchmark( \ - "parition_predicate(Split Threshold:" #SPLIT_T "%)", \ - &run_predicate, stream, static_cast(SPLIT_T), size \ -) - -#define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T) \ -benchmark::RegisterBenchmark( \ - "parition_three_way(Small Threshold:" #SMALL_T "%,Large Threshold:" #LARGE_T "%)", \ - &run_threeway, stream, static_cast(SMALL_T), static_cast(LARGE_T), size \ -) +#define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T) \ + benchmark::RegisterBenchmark( \ + std::string("device_parition_flagged.(Split Threshold:" #SPLIT_T \ + "%)" \ + ).c_str(), \ + &run_flagged, \ + stream, \ + static_cast(SPLIT_T), size \ + ) + +#define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T) \ + benchmark::RegisterBenchmark( \ + std::string("device_parition_predicate.(Split Threshold:" #SPLIT_T \ + "%)" \ + ).c_str(), \ + &run_predicate, \ + stream, \ + static_cast(SPLIT_T), size \ + ) + +#define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T) \ + benchmark::RegisterBenchmark( \ + std::string("device_parition_three_way" \ + ".(Small Threshold:" #SMALL_T \ + "%,Large Threshold:" #LARGE_T \ + "%)" \ + ).c_str(), \ + &run_threeway, \ + stream, \ + static_cast(SMALL_T), \ + static_cast(LARGE_T), size \ + ) #define BENCHMARK_FLAGGED_TYPE(type, flag_type) \ CREATE_BENCHMARK_FLAGGED(type, flag_type, 33), \ diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp index 48538fe5..0c19da1b 100644 --- a/benchmark/benchmark_device_radix_sort.cpp +++ b/benchmark/benchmark_device_radix_sort.cpp @@ -398,8 +398,10 @@ void run_sort_pairs_benchmark(benchmark::State& state, auto keys_input = std::make_shared>(generate_keys(size)); \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ - std::string(std::string("device_radix_sort_keys_ascending") \ - + "").c_str(), \ + std::string("device_radix_sort_keys_ascending" \ + "." \ + ).c_str(), \ [=](benchmark::State& state) { \ run_sort_keys_benchmark(state, stream, size, keys_input); \ } \ @@ -407,9 +409,10 @@ void run_sort_pairs_benchmark(benchmark::State& state, ); \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ - std::string(std::string("device_radix_sort_keys_descending") \ - + "").c_str(), \ + std::string("device_radix_sort_keys_descending" \ + "." \ + ).c_str(), \ [=](benchmark::State& state){ \ run_sort_keys_benchmark(state, stream, size, keys_input); \ } \ @@ -422,9 +425,11 @@ void run_sort_pairs_benchmark(benchmark::State& state, auto keys_input = std::make_shared>(generate_keys(size)); \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ - std::string(std::string("device_radix_sort_pairs_ascending") \ - + "").c_str(), \ + std::string("device_radix_sort_pairs_ascending" \ + "." \ + ).c_str(), \ [=](benchmark::State& state){ \ run_sort_pairs_benchmark(state, stream, size, keys_input); \ } \ @@ -432,10 +437,11 @@ void run_sort_pairs_benchmark(benchmark::State& state, ); \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ - std::string(std::string("device_radix_sort_pairs_descending") \ - + "").c_str(), \ + ">." \ + ).c_str(), \ [=](benchmark::State& state){ \ run_sort_pairs_benchmark(state, stream, size, keys_input); \ } \ diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp index b82195de..92d60c05 100644 --- a/benchmark/benchmark_device_reduce.cpp +++ b/benchmark/benchmark_device_reduce.cpp @@ -132,10 +132,16 @@ struct Benchmark { } }; -#define CREATE_BENCHMARK(T, REDUCE_OP) \ - benchmark::RegisterBenchmark( \ - ("reduce"), \ - &Benchmark::run, size, stream \ +#define CREATE_BENCHMARK(T, REDUCE_OP) \ + benchmark::RegisterBenchmark( \ + std::string("device_reduce" \ + "." \ + ).c_str(), \ + &Benchmark::run, \ + size, \ + stream \ ) #define CREATE_BENCHMARKS(REDUCE_OP) \ diff --git a/benchmark/benchmark_device_reduce_by_key.cpp b/benchmark/benchmark_device_reduce_by_key.cpp index 734e0623..9ed87691 100644 --- a/benchmark/benchmark_device_reduce_by_key.cpp +++ b/benchmark/benchmark_device_reduce_by_key.cpp @@ -161,15 +161,19 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea HIP_CHECK(hipFree(d_unique_count_output)); } -#define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \ -benchmark::RegisterBenchmark( \ - (std::string("reduce_by_key") + "" + \ - "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ - ).c_str(), \ - &run_benchmark, \ - max_length, stream, size, REDUCE_OP() \ +#define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \ +benchmark::RegisterBenchmark( \ + std::string("device_reduce_by_key" \ + "." \ + "(Random Number Range:[1, " \ + + std::to_string(max_length) \ + + "])" \ + ).c_str(), \ + &run_benchmark, \ + max_length, stream, size, REDUCE_OP() \ ) #define CREATE_BENCHMARKS(REDUCE_OP) \ diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp index 86250ba6..050a7010 100644 --- a/benchmark/benchmark_device_run_length_encode.cpp +++ b/benchmark/benchmark_device_run_length_encode.cpp @@ -245,13 +245,16 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, HIP_CHECK(hipFree(d_runs_count_output)); } -#define CREATE_ENCODE_BENCHMARK(T) \ - benchmark::RegisterBenchmark( \ - (std::string("run_length_encode") + "" + \ - "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ - ).c_str(), \ - &run_encode_benchmark, \ - max_length, stream, size \ +#define CREATE_ENCODE_BENCHMARK(T) \ + benchmark::RegisterBenchmark( \ + std::string("device_run_length_encode" \ + "." \ + "(Random Number Range:[1, " \ + + std::to_string(max_length) \ + + "])" \ + ).c_str(), \ + &run_encode_benchmark, \ + max_length, stream, size \ ) void add_encode_benchmarks(size_t max_length, diff --git a/benchmark/benchmark_device_scan.cpp b/benchmark/benchmark_device_scan.cpp index 753d812a..73b7251e 100644 --- a/benchmark/benchmark_device_scan.cpp +++ b/benchmark/benchmark_device_scan.cpp @@ -303,17 +303,27 @@ void run_benchmark_by_key(benchmark::State& state, HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ -benchmark::RegisterBenchmark( \ - (std::string(EXCL ? "device_exclusive_scan" : "device_inclusive_scan") + \ - ("")).c_str(), \ - &run_benchmark, size, stream, SCAN_OP() \ -), \ -benchmark::RegisterBenchmark( \ - (std::string(EXCL ? "device_exclusive_scan_by_key" : "device_inclusive_scan_by_key") + \ - ("")).c_str(), \ - &run_benchmark_by_key, size, stream, SCAN_OP() \ -) +#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ + benchmark::RegisterBenchmark( \ + std::string(std::string(EXCL ? "device_exclusive_scan" : "device_inclusive_scan") \ + +"." \ + ).c_str(), \ + &run_benchmark, \ + size, \ + stream, \ + SCAN_OP() \ + ), \ + benchmark::RegisterBenchmark( \ + std::string(std::string(EXCL ? "device_exclusive_scan_by_key" : "device_inclusive_scan_by_key") \ + + "." \ + ).c_str(), \ + &run_benchmark_by_key, \ + size, stream, \ + SCAN_OP() \ + ) #define CREATE_BENCHMARKS(SCAN_OP) \ CREATE_BENCHMARK(false, int, SCAN_OP), \ diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index da4abdf4..287589cc 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -374,22 +374,30 @@ void run_sort_pairs_benchmark(benchmark::State& state, #define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_radix_sort_keys") \ - + "" + \ - "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ + std::string("device_segmented_radix_sort_keys" \ + "." \ + "(Segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ ).c_str(), \ [=](benchmark::State& state){ \ - run_sort_keys_benchmark(state, SEGMENTS, stream, size, Ascending); \ + run_sort_keys_benchmark(state, \ + SEGMENTS, \ + stream, \ + size, \ + Ascending); \ } \ ) #define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_radix_sort_keys") \ - + "" + \ - "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ + std::string("device_segmented_radix_sort_keys" \ + "." \ + "(Segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ ).c_str(), \ [=](benchmark::State& state){ \ run_sort_keys_benchmark(state, SEGMENTS, stream, size, Descending); \ @@ -426,11 +434,13 @@ void add_sort_keys_benchmarks(std::vector& benc #define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_radix_sort_pairs") \ - + "" + \ - "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ + std::string("device_segmented_radix_sort_pairs" \ + "." \ + "(Segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ ).c_str(), \ [=](benchmark::State& state){ \ run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Ascending); \ @@ -439,11 +449,13 @@ benchmark::RegisterBenchmark( #define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_radix_sort_pairs") \ - + "" + \ - "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ + std::string("device_segmented_radix_sort_pairs" \ + "." \ + "(Segments:~" \ + + std::to_string(SEGMENTS) \ + + " segments)" \ ).c_str(), \ [=](benchmark::State& state){ \ run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Descending);\ diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp index 69804192..40bd05d6 100644 --- a/benchmark/benchmark_device_segmented_reduce.cpp +++ b/benchmark/benchmark_device_segmented_reduce.cpp @@ -196,9 +196,10 @@ struct Benchmark { #define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_reduce") \ - + "" + \ + std::string("device_segmented_reduce" \ + "." \ "(Number of segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ diff --git a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp index 8015a641..0fdcbbb6 100644 --- a/benchmark/benchmark_device_segmented_sort.cpp +++ b/benchmark/benchmark_device_segmented_sort.cpp @@ -371,10 +371,10 @@ void run_sort_pairs_benchmark(benchmark::State &state, #define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_keys") \ - + "" + \ + ",Stable:false>." \ "(Number of segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ @@ -384,10 +384,10 @@ void run_sort_pairs_benchmark(benchmark::State &state, } \ ), \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_keys") \ - + "" + \ + ",Stable:false>." \ "(Number of segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ @@ -397,10 +397,10 @@ void run_sort_pairs_benchmark(benchmark::State &state, } \ ), \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_keys") \ - + "" + \ + ",Stable:true>." \ "(Number of segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ @@ -410,10 +410,10 @@ void run_sort_pairs_benchmark(benchmark::State &state, } \ ), \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_keys") \ - + "" + \ + ",Stable:true>." \ "(Number of segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ @@ -450,7 +450,7 @@ void add_sort_keys_benchmarks(std::vector &ben + "" + \ + ",Stable:false>." + \ "(Number of segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ @@ -464,7 +464,7 @@ void add_sort_keys_benchmarks(std::vector &ben + "" + \ + ",Stable:false>." + \ "(Number of segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ @@ -478,7 +478,7 @@ void add_sort_keys_benchmarks(std::vector &ben + "" + \ + ",Stable:true>." + \ "(Number of segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ @@ -492,7 +492,7 @@ void add_sort_keys_benchmarks(std::vector &ben + "" + \ + ",Stable:true>." + \ "(Number of segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index b6cfb596..c5a925a8 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -502,7 +502,7 @@ void run_unique_by_key_benchmark(benchmark::State& state, benchmark::RegisterBenchmark( \ ("device_select_flagged(Probability:" #p")"), \ + ",Selected Output Datatype:unsigned int>.(Probability:" #p")"), \ &run_flagged_benchmark, size, stream, p \ ) @@ -510,7 +510,7 @@ benchmark::RegisterBenchmark( \ benchmark::RegisterBenchmark( \ ("device_select_if(Probability:" #p")"), \ + ",Selected Output Datatype:unsigned int>.(Probability:" #p")"), \ &run_selectop_benchmark, size, stream, p \ ) @@ -518,7 +518,7 @@ benchmark::RegisterBenchmark( \ benchmark::RegisterBenchmark( \ ("device_select_unique(Probability:" #p")"), \ + ",Selected Output Datatype:unsigned int>.(Probability:" #p")"), \ &run_unique_benchmark, size, stream, p \ ) @@ -526,7 +526,7 @@ benchmark::RegisterBenchmark( \ benchmark::RegisterBenchmark( \ ("device_select_unique_by_key(Probability:" #p")"), \ + ",Selected Output Datatype:unsigned int>.(Probability:" #p")"), \ &run_unique_by_key_benchmark, size, stream, p \ ) diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp index bdc2f5d0..5bcbb65e 100644 --- a/benchmark/benchmark_warp_exchange.cpp +++ b/benchmark/benchmark_warp_exchange.cpp @@ -245,7 +245,7 @@ struct BlockedToStripedOp ",Block Size:" #BS \ ",Items Per Thread:" #IT \ ",Warp Size:" #WS \ - ",Algorithm:" #ALG ">" \ + ",Algorithm:" #ALG ">." \ ).c_str(), \ &run_benchmark, \ stream, \ @@ -258,7 +258,7 @@ struct BlockedToStripedOp ",Block Size:" #BS \ ",Items Per Thread:" #IT \ ",Warp Size:" #WS \ - ",Algorithm:" #ALG ">" \ + ",Algorithm:" #ALG ">." \ ).c_str(), \ &run_benchmark, \ stream, \ @@ -271,7 +271,7 @@ benchmark::RegisterBenchmark( \ ",Offset Type:" #OFFSET_T \ ",Block Size:" #BS \ ",Items Per Thread:" #IT \ - ",Warp Size:" #WS ">" \ + ",Warp Size:" #WS ">." \ ).c_str(), \ &run_benchmark_scatter_to_striped, \ stream, size \ diff --git a/benchmark/benchmark_warp_load.cpp b/benchmark/benchmark_warp_load.cpp index 50bc0a19..958cc558 100644 --- a/benchmark/benchmark_warp_load.cpp +++ b/benchmark/benchmark_warp_load.cpp @@ -127,12 +127,16 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ -benchmark::RegisterBenchmark( \ - "warp_load.", \ - &run_benchmark, \ - stream, size \ -) +#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark( \ + "warp_load.", \ + &run_benchmark, \ + stream, size \ + ) int main(int argc, char *argv[]) { diff --git a/benchmark/benchmark_warp_merge_sort.cpp b/benchmark/benchmark_warp_merge_sort.cpp index 9a0e4fd8..13730a16 100644 --- a/benchmark/benchmark_warp_merge_sort.cpp +++ b/benchmark/benchmark_warp_merge_sort.cpp @@ -448,16 +448,20 @@ void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benc HIP_CHECK(hipFree(d_segment_sizes)); } -#define CREATE_BENCHMARK(T, BS, WS, IPT) \ -do { \ - const auto benchmark_name = \ - std::string{"warp_merge_sort.SubAlgorithm Name:"} + name; \ - if(WS <= device_warp_size) { \ - benchmarks.push_back(benchmark::RegisterBenchmark(benchmark_name.c_str(), \ - segmented ? &run_benchmark : &run_segmented_benchmark, \ - benchmark_kind, stream, size)); \ - } \ -} while(false) +#define CREATE_BENCHMARK(T, BS, WS, IPT) \ + if(WS <= device_warp_size) { \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("warp_merge_sort.SubAlgorithm Name:" \ + + name \ + ).c_str(), \ + segmented ? &run_benchmark : &run_segmented_benchmark, \ + benchmark_kind, stream, size)); \ + } \ + #define BENCHMARK_TYPE_WS(type, block, warp) \ CREATE_BENCHMARK(type, block, warp, 1); \ diff --git a/benchmark/benchmark_warp_reduce.cpp b/benchmark/benchmark_warp_reduce.cpp index 39716261..ec6f39bb 100644 --- a/benchmark/benchmark_warp_reduce.cpp +++ b/benchmark/benchmark_warp_reduce.cpp @@ -181,12 +181,17 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_flags)); } -#define CREATE_BENCHMARK(T, WS, BS) \ -benchmark::RegisterBenchmark( \ - (std::string("warp_reduce.SubAlgorithm Name:") + name).c_str(), \ - &run_benchmark, \ - stream, size \ -) +#define CREATE_BENCHMARK(T, WS, BS) \ + benchmark::RegisterBenchmark( \ + std::string("warp_reduce.SubAlgorithm Name:" \ + + name \ + ).c_str(), \ + &run_benchmark, \ + stream, size \ + ) // If warp size limit is 16 diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp index c66b31aa..10f6748e 100644 --- a/benchmark/benchmark_warp_scan.cpp +++ b/benchmark/benchmark_warp_scan.cpp @@ -159,14 +159,18 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK_IMPL(T, BS, WS, OP) \ - benchmark::RegisterBenchmark((std::string("warp_scan.Method Name:") \ - + method_name) \ - .c_str(), \ - &run_benchmark, \ - stream, \ - size) +#define CREATE_BENCHMARK_IMPL(T, BS, WS, OP) \ + benchmark::RegisterBenchmark( \ + std::string("warp_scan.Method Name:" \ + + method_name \ + ).c_str(), \ + &run_benchmark, \ + stream, \ + size \ + ) #define CREATE_BENCHMARK(T, BS, WS) CREATE_BENCHMARK_IMPL(T, BS, WS, Benchmark) diff --git a/benchmark/benchmark_warp_store.cpp b/benchmark/benchmark_warp_store.cpp index a331f16b..6b816baa 100644 --- a/benchmark/benchmark_warp_store.cpp +++ b/benchmark/benchmark_warp_store.cpp @@ -113,11 +113,16 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ -benchmark::RegisterBenchmark( \ - "warp_store.", \ - &run_benchmark, \ - stream, size \ +#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ +benchmark::RegisterBenchmark( \ + std::string("warp_store." \ + ).c_str(), \ + &run_benchmark, \ + stream, size \ ) int main(int argc, char *argv[]) From ca5a76c668109508c653ade96de53faab346d064 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 12 Jun 2024 13:28:10 -0600 Subject: [PATCH 29/46] Updated Value Type to Value Datatype --- benchmark/benchmark_device_select.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index c5a925a8..11bd40e6 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -525,7 +525,7 @@ benchmark::RegisterBenchmark( \ #define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ benchmark::RegisterBenchmark( \ ("device_select_unique_by_key.(Probability:" #p")"), \ &run_unique_by_key_benchmark, size, stream, p \ ) From 7d7aec579e79644b120641390cd264dc5095c3b4 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Thu, 13 Jun 2024 15:57:20 -0600 Subject: [PATCH 30/46] changed field to snake case --- .../benchmark_block_adjacent_difference.cpp | 22 +++--- benchmark/benchmark_block_discontinuity.cpp | 8 +-- benchmark/benchmark_block_exchange.cpp | 8 +-- benchmark/benchmark_block_histogram.cpp | 10 +-- benchmark/benchmark_block_merge_sort.cpp | 8 +-- benchmark/benchmark_block_radix_rank.cpp | 8 +-- benchmark/benchmark_block_radix_sort.cpp | 8 +-- benchmark/benchmark_block_reduce.cpp | 8 +-- .../benchmark_block_run_length_decode.cpp | 14 ++-- benchmark/benchmark_block_scan.cpp | 10 +-- benchmark/benchmark_block_shuffle.cpp | 14 ++-- .../benchmark_device_adjacent_difference.cpp | 4 +- benchmark/benchmark_device_batch_copy.cpp | 12 ++-- benchmark/benchmark_device_batch_memcpy.cpp | 12 ++-- benchmark/benchmark_device_histogram.cpp | 28 ++++---- benchmark/benchmark_device_memory.cpp | 12 ++-- benchmark/benchmark_device_merge_sort.cpp | 6 +- benchmark/benchmark_device_partition.cpp | 16 ++--- benchmark/benchmark_device_radix_sort.cpp | 12 ++-- benchmark/benchmark_device_reduce.cpp | 4 +- benchmark/benchmark_device_reduce_by_key.cpp | 8 +-- .../benchmark_device_run_length_encode.cpp | 23 +++--- benchmark/benchmark_device_scan.cpp | 8 +-- .../benchmark_device_segmented_radix_sort.cpp | 24 +++---- .../benchmark_device_segmented_reduce.cpp | 6 +- benchmark/benchmark_device_segmented_sort.cpp | 72 +++++++++---------- benchmark/benchmark_device_select.cpp | 16 ++--- benchmark/benchmark_device_spmv.cpp | 4 +- benchmark/benchmark_warp_exchange.cpp | 30 ++++---- benchmark/benchmark_warp_load.cpp | 10 +-- benchmark/benchmark_warp_merge_sort.cpp | 10 +-- benchmark/benchmark_warp_reduce.cpp | 8 +-- benchmark/benchmark_warp_scan.cpp | 8 +-- benchmark/benchmark_warp_store.cpp | 10 +-- 34 files changed, 232 insertions(+), 229 deletions(-) diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp index 89cc1636..bc3bf62a 100644 --- a/benchmark/benchmark_block_adjacent_difference.cpp +++ b/benchmark/benchmark_block_adjacent_difference.cpp @@ -338,17 +338,17 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -# define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ - benchmark::RegisterBenchmark( \ - std::string("block_adjacent_difference.SubAlgorithm Name:") + name \ - + std::string("" \ - ).c_str(), \ - &run_benchmark, \ - stream, \ - size \ +# define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ + benchmark::RegisterBenchmark( \ + std::string("block_adjacent_difference.sub_algorithm_name:") + name \ + + std::string("" \ + ).c_str(), \ + &run_benchmark, \ + stream, \ + size \ ) diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp index 15ce5a98..44babbe5 100644 --- a/benchmark/benchmark_block_discontinuity.cpp +++ b/benchmark/benchmark_block_discontinuity.cpp @@ -240,11 +240,11 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ benchmark::RegisterBenchmark( \ - std::string("block_discontinuity.SubAlgorithm Name:" \ + std::string("block_discontinuity.sub_algorithm_name:" \ + name \ - + "." \ + + "." \ ).c_str(), \ &run_benchmark, \ stream, size \ diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp index 8d62979c..952b9f92 100644 --- a/benchmark/benchmark_block_exchange.cpp +++ b/benchmark/benchmark_block_exchange.cpp @@ -295,10 +295,10 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ - std::string("block_exchange.SubAlgorithm Name:" \ + std::string("block_exchange.sub_algorithm_name:" \ + name \ ).c_str(), \ &run_benchmark, \ diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp index 0771a0cc..dd202e36 100644 --- a/benchmark/benchmark_block_histogram.cpp +++ b/benchmark/benchmark_block_histogram.cpp @@ -145,12 +145,12 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ - std::string("block_histogram.Method Name:" \ + + ">.method_name:" \ + method_name \ ).c_str(), \ &run_benchmark, \ diff --git a/benchmark/benchmark_block_merge_sort.cpp b/benchmark/benchmark_block_merge_sort.cpp index 1cd47c9b..0d502390 100644 --- a/benchmark/benchmark_block_merge_sort.cpp +++ b/benchmark/benchmark_block_merge_sort.cpp @@ -179,10 +179,10 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ - std::string("block_merge_sort.SubAlgorithm Name:" \ + std::string("block_merge_sort.sub_algorithm_name:" \ + name \ ).c_str(), \ &run_benchmark, \ diff --git a/benchmark/benchmark_block_radix_rank.cpp b/benchmark/benchmark_block_radix_rank.cpp index e44a6495..e351b6c2 100644 --- a/benchmark/benchmark_block_radix_rank.cpp +++ b/benchmark/benchmark_block_radix_rank.cpp @@ -159,10 +159,10 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(T, KIND, BS, IPT) \ benchmark::RegisterBenchmark( \ - std::string("block_radix_rank." \ + name \ ).c_str(), \ diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp index 1e3fe347..c63a566b 100644 --- a/benchmark/benchmark_block_radix_sort.cpp +++ b/benchmark/benchmark_block_radix_sort.cpp @@ -247,10 +247,10 @@ void run_benchmark(benchmark::State& state, #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ - std::string("block_radix_sort.SubAlgorithm Name:" \ + std::string("block_radix_sort.sub_algorithm_name:" \ + name \ ).c_str(), \ &run_benchmark, \ diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp index 00571d11..c3364a20 100644 --- a/benchmark/benchmark_block_reduce.cpp +++ b/benchmark/benchmark_block_reduce.cpp @@ -137,10 +137,10 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ - std::string("block_reduce.Method Name:" + method_name \ + std::string("block_reduce.method_name:" + method_name \ ).c_str(), \ &run_benchmark, \ stream, size \ diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp index 6de3cb41..7a854fe4 100644 --- a/benchmark/benchmark_block_run_length_decode.cpp +++ b/benchmark/benchmark_block_run_length_decode.cpp @@ -186,13 +186,13 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ benchmark::RegisterBenchmark( \ - std::string("block_run_length_decode." \ + std::string("block_run_length_decode." \ ).c_str(), \ &run_benchmark, \ stream, size \ diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp index f38267e4..38097f17 100644 --- a/benchmark/benchmark_block_scan.cpp +++ b/benchmark/benchmark_block_scan.cpp @@ -156,11 +156,11 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ - (std::string("block_scan.Method Name:") + method_name \ + (std::string("block_scan.method_name:") + method_name \ ).c_str(), \ &run_benchmark, \ stream, size \ diff --git a/benchmark/benchmark_block_shuffle.cpp b/benchmark/benchmark_block_shuffle.cpp index 8421dd3b..0d7289fb 100644 --- a/benchmark/benchmark_block_shuffle.cpp +++ b/benchmark/benchmark_block_shuffle.cpp @@ -216,12 +216,12 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK_IPT(BS, IPT) \ benchmark::RegisterBenchmark( \ - ("block_shuffle.SubAlgorithm Name:" \ + ">.sub_algorithm_name:" \ + name \ ).c_str(), \ &run_benchmark, \ @@ -231,10 +231,10 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(BS) \ benchmark::RegisterBenchmark( \ - ("block_shuffle.SubAlgorithm Name:" \ + + ",block_size:" #BS \ + ">.sub_algorithm_name:" \ + name \ ).c_str(), \ &run_benchmark, \ diff --git a/benchmark/benchmark_device_adjacent_difference.cpp b/benchmark/benchmark_device_adjacent_difference.cpp index 99f2a278..7ac57c0b 100644 --- a/benchmark/benchmark_device_adjacent_difference.cpp +++ b/benchmark/benchmark_device_adjacent_difference.cpp @@ -183,8 +183,8 @@ using namespace std::string_literals; #define CREATE_BENCHMARK(T, left, copy) \ benchmark::RegisterBenchmark( \ std::string("device_adjacent_difference" \ - "." \ - "SubAlgorithm Name:Subtract" \ + "." \ + "sub_algorithm_name:Subtract" \ + std::string(left ? "Left" : "Right") \ + std::string(copy ? "Copy" : "") \ ).c_str(), \ diff --git a/benchmark/benchmark_device_batch_copy.cpp b/benchmark/benchmark_device_batch_copy.cpp index 8aa449d4..06d2e94e 100644 --- a/benchmark/benchmark_device_batch_copy.cpp +++ b/benchmark/benchmark_device_batch_copy.cpp @@ -330,12 +330,12 @@ void run_benchmark(benchmark::State& state, #define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ benchmark::RegisterBenchmark( \ std::string("device_batch_copy" \ - "." \ + "." \ ).c_str(), \ [=](benchmark::State& state){ \ run_benchmark, \ diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp index 6e0ae35e..56e639b4 100644 --- a/benchmark/benchmark_device_batch_memcpy.cpp +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -339,12 +339,12 @@ void run_benchmark(benchmark::State& state, #define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ benchmark::RegisterBenchmark( \ - std::string("device_batch_memcpy." \ + std::string("device_batch_memcpy." \ ).c_str(), \ [=](benchmark::State& state){ \ run_benchmark, \ diff --git a/benchmark/benchmark_device_histogram.cpp b/benchmark/benchmark_device_histogram.cpp index 943ffebb..663ddd94 100644 --- a/benchmark/benchmark_device_histogram.cpp +++ b/benchmark/benchmark_device_histogram.cpp @@ -511,11 +511,11 @@ struct num_limits<__half> VECTOR.push_back( \ benchmark::RegisterBenchmark( \ std::string("device_histogram_even" \ - "." \ - "(Entropy Percent:" \ + "(entropy_percent:" \ + std::to_string(get_entropy_percents(entropy_reduction)) \ - + "%,Bin Count:" \ + + "%,bin_count:" \ + std::to_string(BINS) \ + " bins)" \ ).c_str(), \ @@ -556,12 +556,12 @@ void add_even_benchmarks(std::vector& benchmark #define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ benchmark::RegisterBenchmark( \ std::string("device_multi_histogram_even" \ - "." \ - "(Entropy Percent:" \ + "." \ + "(entropy_percent:" \ + std::to_string(get_entropy_percents(entropy_reduction)) \ - + "%,Bin Count:" + \ + + "%,bin_count:" + \ std::to_string(BINS) \ + " bins)" \ ).c_str(), \ @@ -597,8 +597,8 @@ void add_multi_even_benchmarks(std::vector& ben #define CREATE_RANGE_BENCHMARK(T, BINS) \ benchmark::RegisterBenchmark( \ std::string("device_histogram_range" \ - "." \ - "(Bin Count:" \ + "." \ + "(bin_count:" \ + std::to_string(BINS) \ + " bins)" \ ).c_str(), \ @@ -624,10 +624,10 @@ void add_range_benchmarks(std::vector& benchmar #define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ benchmark::RegisterBenchmark( \ std::string("device_multi_histogram_range" \ - ".(Bin Count:" \ + ".(bin_count:" \ + std::to_string(BINS) \ + " bins)" \ ).c_str(), \ diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index 397b9c65..027f591f 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -404,12 +404,12 @@ void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_ #define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BS, IPT) \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ - std::string("device_memory." \ + std::string("device_memory." \ ).c_str(), \ [=](benchmark::State& state){ \ run_benchmark(state, SIZE, stream); \ diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp index bac53105..38baada6 100644 --- a/benchmark/benchmark_device_merge_sort.cpp +++ b/benchmark/benchmark_device_merge_sort.cpp @@ -234,7 +234,7 @@ void run_sort_pairs_benchmark(benchmark::State& state, benchmarks.push_back( \ benchmark::RegisterBenchmark( \ std::string("device_merge_sort_sort_keys" \ - "." \ + "." \ ).c_str(), \ [=](benchmark::State& state){ \ run_sort_keys_benchmark(state, stream, size); \ @@ -246,8 +246,8 @@ void run_sort_pairs_benchmark(benchmark::State& state, benchmarks.push_back( \ benchmark::RegisterBenchmark( \ std::string("device_merge_sort_sort_pairs<" \ - ",Key Datatype:" #T \ - ",Value Datatype:" #V ">." \ + ",key_data_type:" #T \ + ",value_data_type:" #V ">." \ ).c_str(), \ [=](benchmark::State& state){ \ run_sort_pairs_benchmark(state, stream, size); \ diff --git a/benchmark/benchmark_device_partition.cpp b/benchmark/benchmark_device_partition.cpp index 68c5b076..f87d4181 100644 --- a/benchmark/benchmark_device_partition.cpp +++ b/benchmark/benchmark_device_partition.cpp @@ -354,9 +354,9 @@ void run_threeway(benchmark::State& state, #define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T) \ benchmark::RegisterBenchmark( \ - std::string("device_parition_flagged.(Split Threshold:" #SPLIT_T \ + std::string("device_parition_flagged.(split_threshold:" #SPLIT_T \ "%)" \ ).c_str(), \ &run_flagged, \ @@ -366,8 +366,8 @@ void run_threeway(benchmark::State& state, #define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T) \ benchmark::RegisterBenchmark( \ - std::string("device_parition_predicate.(Split Threshold:" #SPLIT_T \ + std::string("device_parition_predicate.(split_threshold:" #SPLIT_T \ "%)" \ ).c_str(), \ &run_predicate, \ @@ -378,9 +378,9 @@ void run_threeway(benchmark::State& state, #define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T) \ benchmark::RegisterBenchmark( \ std::string("device_parition_three_way" \ - ".(Small Threshold:" #SMALL_T \ - "%,Large Threshold:" #LARGE_T \ + ".(small_threshold:" #SMALL_T \ + "%,large_threshold:" #LARGE_T \ "%)" \ ).c_str(), \ &run_threeway, \ diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp index 0c19da1b..8f646b31 100644 --- a/benchmark/benchmark_device_radix_sort.cpp +++ b/benchmark/benchmark_device_radix_sort.cpp @@ -399,7 +399,7 @@ void run_sort_pairs_benchmark(benchmark::State& state, benchmarks.push_back( \ benchmark::RegisterBenchmark( \ std::string("device_radix_sort_keys_ascending" \ - "." \ ).c_str(), \ [=](benchmark::State& state) { \ @@ -410,7 +410,7 @@ void run_sort_pairs_benchmark(benchmark::State& state, benchmarks.push_back( \ benchmark::RegisterBenchmark( \ std::string("device_radix_sort_keys_descending" \ - "." \ ).c_str(), \ [=](benchmark::State& state){ \ @@ -426,8 +426,8 @@ void run_sort_pairs_benchmark(benchmark::State& state, benchmarks.push_back( \ benchmark::RegisterBenchmark( \ std::string("device_radix_sort_pairs_ascending" \ - "." \ ).c_str(), \ [=](benchmark::State& state){ \ @@ -438,8 +438,8 @@ void run_sort_pairs_benchmark(benchmark::State& state, benchmarks.push_back( \ benchmark::RegisterBenchmark( \ std::string("device_radix_sort_pairs_descending" \ - "." \ ).c_str(), \ [=](benchmark::State& state){ \ diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp index 92d60c05..efc8f9b0 100644 --- a/benchmark/benchmark_device_reduce.cpp +++ b/benchmark/benchmark_device_reduce.cpp @@ -135,8 +135,8 @@ struct Benchmark { #define CREATE_BENCHMARK(T, REDUCE_OP) \ benchmark::RegisterBenchmark( \ std::string("device_reduce" \ - "." \ ).c_str(), \ &Benchmark::run, \ diff --git a/benchmark/benchmark_device_reduce_by_key.cpp b/benchmark/benchmark_device_reduce_by_key.cpp index 9ed87691..8a3fbdd8 100644 --- a/benchmark/benchmark_device_reduce_by_key.cpp +++ b/benchmark/benchmark_device_reduce_by_key.cpp @@ -164,11 +164,11 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea #define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \ benchmark::RegisterBenchmark( \ std::string("device_reduce_by_key" \ - "." \ - "(Random Number Range:[1, " \ + "(random_number_range:[1, " \ + std::to_string(max_length) \ + "])" \ ).c_str(), \ diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp index 050a7010..dc785f72 100644 --- a/benchmark/benchmark_device_run_length_encode.cpp +++ b/benchmark/benchmark_device_run_length_encode.cpp @@ -248,8 +248,8 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, #define CREATE_ENCODE_BENCHMARK(T) \ benchmark::RegisterBenchmark( \ std::string("device_run_length_encode" \ - "." \ - "(Random Number Range:[1, " \ + "." \ + "(random_number_range:[1, " \ + std::to_string(max_length) \ + "])" \ ).c_str(), \ @@ -280,14 +280,17 @@ void add_encode_benchmarks(size_t max_length, benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ -benchmark::RegisterBenchmark( \ - (std::string("run_length_encode_non_trivial_runs") + "" + \ - "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ - ).c_str(), \ - &run_non_trivial_runs_benchmark, \ - max_length, stream, size \ -) +#define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ + benchmark::RegisterBenchmark( \ + std::string("run_length_encode_non_trivial_runs" \ + "" \ + "(random_number_range:[1, " \ + + std::to_string(max_length) \ + + "])" \ + ).c_str(), \ + &run_non_trivial_runs_benchmark, \ + max_length, stream, size \ + ) void add_non_trivial_runs_benchmarks(size_t max_length, std::vector& benchmarks, diff --git a/benchmark/benchmark_device_scan.cpp b/benchmark/benchmark_device_scan.cpp index 73b7251e..29b2f3c0 100644 --- a/benchmark/benchmark_device_scan.cpp +++ b/benchmark/benchmark_device_scan.cpp @@ -306,8 +306,8 @@ void run_benchmark_by_key(benchmark::State& state, #define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ benchmark::RegisterBenchmark( \ std::string(std::string(EXCL ? "device_exclusive_scan" : "device_inclusive_scan") \ - +"." \ ).c_str(), \ &run_benchmark, \ @@ -317,8 +317,8 @@ void run_benchmark_by_key(benchmark::State& state, ), \ benchmark::RegisterBenchmark( \ std::string(std::string(EXCL ? "device_exclusive_scan_by_key" : "device_inclusive_scan_by_key") \ - + "." \ + + "." \ ).c_str(), \ &run_benchmark_by_key, \ size, stream, \ diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index 287589cc..d2267612 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -375,9 +375,9 @@ void run_sort_pairs_benchmark(benchmark::State& state, #define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ std::string("device_segmented_radix_sort_keys" \ - "." \ - "(Segments:~" \ + "." \ + "(segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ @@ -393,9 +393,9 @@ benchmark::RegisterBenchmark( #define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ std::string("device_segmented_radix_sort_keys" \ - "." \ - "(Segments:~" \ + "." \ + "(segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ @@ -435,10 +435,10 @@ void add_sort_keys_benchmarks(std::vector& benc #define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ std::string("device_segmented_radix_sort_pairs" \ - "." \ - "(Segments:~" \ + ",ascending:true>." \ + "(segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ @@ -450,10 +450,10 @@ benchmark::RegisterBenchmark( #define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ std::string("device_segmented_radix_sort_pairs" \ - "." \ - "(Segments:~" \ + ",ascending:false>." \ + "(segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp index 40bd05d6..77024237 100644 --- a/benchmark/benchmark_device_segmented_reduce.cpp +++ b/benchmark/benchmark_device_segmented_reduce.cpp @@ -197,10 +197,10 @@ struct Benchmark { #define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \ benchmark::RegisterBenchmark( \ std::string("device_segmented_reduce" \ - "." \ - "(Number of segments:~" \ + "(number_of_segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ diff --git a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp index 0fdcbbb6..6426ae30 100644 --- a/benchmark/benchmark_device_segmented_sort.cpp +++ b/benchmark/benchmark_device_segmented_sort.cpp @@ -372,10 +372,10 @@ void run_sort_pairs_benchmark(benchmark::State &state, #define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ std::string("device_segmented_sort_keys" \ - "." \ - "(Number of segments:~" \ + "." \ + "(number_of_segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ @@ -385,10 +385,10 @@ void run_sort_pairs_benchmark(benchmark::State &state, ), \ benchmark::RegisterBenchmark( \ std::string("device_segmented_sort_keys" \ - "." \ - "(Number of segments:~" \ + "." \ + "(number_of_segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ @@ -398,10 +398,10 @@ void run_sort_pairs_benchmark(benchmark::State &state, ), \ benchmark::RegisterBenchmark( \ std::string("device_segmented_sort_keys" \ - "." \ - "(Number of segments:~" \ + "." \ + "(number_of_segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ @@ -411,10 +411,10 @@ void run_sort_pairs_benchmark(benchmark::State &state, ), \ benchmark::RegisterBenchmark( \ std::string("device_segmented_sort_keys" \ - "." \ - "(Number of segments:~" \ + "." \ + "(number_of_segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ @@ -447,11 +447,11 @@ void add_sort_keys_benchmarks(std::vector &ben #define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ (std::string("device_segmented_sort_pairs") \ - + "." + \ - "(Number of segments:~" \ + + "." + \ + "(number_of_segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ @@ -461,11 +461,11 @@ void add_sort_keys_benchmarks(std::vector &ben ), \ benchmark::RegisterBenchmark( \ (std::string("device_segmented_sort_pairs") \ - + "." + \ - "(Number of segments:~" \ + + "." + \ + "(number_of_segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ @@ -475,11 +475,11 @@ void add_sort_keys_benchmarks(std::vector &ben ), \ benchmark::RegisterBenchmark( \ (std::string("device_segmented_sort_pairs") \ - + "." + \ - "(Number of segments:~" \ + + "." + \ + "(number_of_segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ @@ -489,11 +489,11 @@ void add_sort_keys_benchmarks(std::vector &ben ), \ benchmark::RegisterBenchmark( \ (std::string("device_segmented_sort_pairs") \ - + "." + \ - "(Number of segments:~" \ + + "." + \ + "(number_of_segments:~" \ + std::to_string(SEGMENTS) \ + " segments)" \ ).c_str(), \ diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index 11bd40e6..ee06ee7e 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -501,32 +501,32 @@ void run_unique_by_key_benchmark(benchmark::State& state, #define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ benchmark::RegisterBenchmark( \ ("device_select_flagged.(Probability:" #p")"), \ + ",flag_type:" #F ",output_datatype:"#T \ + ",selected_output_datatype:unsigned int>.(probability:" #p")"), \ &run_flagged_benchmark, size, stream, p \ ) #define CREATE_SELECT_IF_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ ("device_select_if.(Probability:" #p")"), \ + ",output_datatype:"#T \ + ",selected_output_datatype:unsigned int>.(probability:" #p")"), \ &run_selectop_benchmark, size, stream, p \ ) #define CREATE_UNIQUE_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ ("device_select_unique.(Probability:" #p")"), \ + ",output_datatype:"#T \ + ",selected_output_datatype:unsigned int>.(probability:" #p")"), \ &run_unique_benchmark, size, stream, p \ ) #define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ benchmark::RegisterBenchmark( \ ("device_select_unique_by_key.(Probability:" #p")"), \ + ",value_datatype:"#V \ + ",selected_output_datatype:unsigned int>.(probability:" #p")"), \ &run_unique_by_key_benchmark, size, stream, p \ ) diff --git a/benchmark/benchmark_device_spmv.cpp b/benchmark/benchmark_device_spmv.cpp index 54b4dcc4..a0ac69a2 100644 --- a/benchmark/benchmark_device_spmv.cpp +++ b/benchmark/benchmark_device_spmv.cpp @@ -171,8 +171,8 @@ void run_benchmark(benchmark::State& state, #define CREATE_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ - (std::string("device_spmv_CsrMV.") \ + (std::string("device_spmv_CsrMV.") \ ).c_str(), \ &run_benchmark, size, stream, p \ ) diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp index 5bcbb65e..cec62c71 100644 --- a/benchmark/benchmark_warp_exchange.cpp +++ b/benchmark/benchmark_warp_exchange.cpp @@ -241,11 +241,11 @@ struct BlockedToStripedOp #define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark( \ - std::string("warp_exchange_striped_to_blocked." \ + std::string("warp_exchange_striped_to_blocked." \ ).c_str(), \ &run_benchmark, \ stream, \ @@ -254,11 +254,11 @@ struct BlockedToStripedOp #define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark( \ - std::string("warp_exchange_blocked_to_striped." \ + std::string("warp_exchange_blocked_to_striped." \ ).c_str(), \ &run_benchmark, \ stream, \ @@ -267,11 +267,11 @@ struct BlockedToStripedOp #define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \ benchmark::RegisterBenchmark( \ - std::string("warp_exchange_scatter_to_striped." \ + std::string("warp_exchange_scatter_to_striped." \ ).c_str(), \ &run_benchmark_scatter_to_striped, \ stream, size \ diff --git a/benchmark/benchmark_warp_load.cpp b/benchmark/benchmark_warp_load.cpp index 958cc558..fb708537 100644 --- a/benchmark/benchmark_warp_load.cpp +++ b/benchmark/benchmark_warp_load.cpp @@ -129,11 +129,11 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark( \ - "warp_load.", \ + "warp_load.", \ &run_benchmark, \ stream, size \ ) diff --git a/benchmark/benchmark_warp_merge_sort.cpp b/benchmark/benchmark_warp_merge_sort.cpp index 13730a16..322271bc 100644 --- a/benchmark/benchmark_warp_merge_sort.cpp +++ b/benchmark/benchmark_warp_merge_sort.cpp @@ -451,11 +451,11 @@ void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benc #define CREATE_BENCHMARK(T, BS, WS, IPT) \ if(WS <= device_warp_size) { \ benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("warp_merge_sort.SubAlgorithm Name:" \ + std::string("warp_merge_sort.sub_algorithm_name:" \ + name \ ).c_str(), \ segmented ? &run_benchmark : &run_segmented_benchmark, \ diff --git a/benchmark/benchmark_warp_reduce.cpp b/benchmark/benchmark_warp_reduce.cpp index ec6f39bb..65b6e991 100644 --- a/benchmark/benchmark_warp_reduce.cpp +++ b/benchmark/benchmark_warp_reduce.cpp @@ -183,10 +183,10 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(T, WS, BS) \ benchmark::RegisterBenchmark( \ - std::string("warp_reduce.SubAlgorithm Name:" \ + std::string("warp_reduce.sub_algorithm_name:" \ + name \ ).c_str(), \ &run_benchmark, \ diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp index 10f6748e..f8003473 100644 --- a/benchmark/benchmark_warp_scan.cpp +++ b/benchmark/benchmark_warp_scan.cpp @@ -161,10 +161,10 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) #define CREATE_BENCHMARK_IMPL(T, BS, WS, OP) \ benchmark::RegisterBenchmark( \ - std::string("warp_scan.Method Name:" \ + std::string("warp_scan.method_name:" \ + method_name \ ).c_str(), \ &run_benchmark, \ diff --git a/benchmark/benchmark_warp_store.cpp b/benchmark/benchmark_warp_store.cpp index 6b816baa..03f63e46 100644 --- a/benchmark/benchmark_warp_store.cpp +++ b/benchmark/benchmark_warp_store.cpp @@ -115,11 +115,11 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark( \ - std::string("warp_store." \ + std::string("warp_store." \ ).c_str(), \ &run_benchmark, \ stream, size \ From bcd2787ce15b58725154ed6be2737546a7c71541 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Thu, 13 Jun 2024 16:02:12 -0600 Subject: [PATCH 31/46] updated Value Datatype to value_data_type --- benchmark/benchmark_device_segmented_radix_sort.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index d2267612..22de7d13 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -436,7 +436,7 @@ void add_sort_keys_benchmarks(std::vector& benc benchmark::RegisterBenchmark( \ std::string("device_segmented_radix_sort_pairs" \ "." \ "(segments:~" \ + std::to_string(SEGMENTS) \ @@ -451,7 +451,7 @@ benchmark::RegisterBenchmark( benchmark::RegisterBenchmark( \ std::string("device_segmented_radix_sort_pairs" \ "." \ "(segments:~" \ + std::to_string(SEGMENTS) \ From b3c949b8f259ba47d7a254ad8885562af52cc87c Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Thu, 13 Jun 2024 16:04:31 -0600 Subject: [PATCH 32/46] updated Datatype to data_type --- benchmark/benchmark_device_select.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index ee06ee7e..9ec49b01 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -500,33 +500,33 @@ void run_unique_by_key_benchmark(benchmark::State& state, #define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ benchmark::RegisterBenchmark( \ - ("device_select_flagged.(probability:" #p")"), \ + ("device_select_flagged.(probability:" #p")"), \ &run_flagged_benchmark, size, stream, p \ ) #define CREATE_SELECT_IF_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ - ("device_select_if.(probability:" #p")"), \ + ("device_select_if.(probability:" #p")"), \ &run_selectop_benchmark, size, stream, p \ ) #define CREATE_UNIQUE_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ - ("device_select_unique.(probability:" #p")"), \ + ("device_select_unique.(probability:" #p")"), \ &run_unique_benchmark, size, stream, p \ ) #define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ benchmark::RegisterBenchmark( \ - ("device_select_unique_by_key.(probability:" #p")"), \ + ("device_select_unique_by_key.(probability:" #p")"), \ &run_unique_by_key_benchmark, size, stream, p \ ) From 24d53076feb3e44ad4418a2f22da26e11b0196c8 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Mon, 17 Jun 2024 10:04:20 -0600 Subject: [PATCH 33/46] ran clang-format --- .../benchmark_block_adjacent_difference.cpp | 682 +++++------ benchmark/benchmark_block_discontinuity.cpp | 479 ++++---- benchmark/benchmark_block_exchange.cpp | 597 ++++----- benchmark/benchmark_block_histogram.cpp | 346 +++--- benchmark/benchmark_block_merge_sort.cpp | 392 +++--- benchmark/benchmark_block_radix_rank.cpp | 337 +++-- benchmark/benchmark_block_radix_sort.cpp | 513 ++++---- benchmark/benchmark_block_reduce.cpp | 347 +++--- .../benchmark_block_run_length_decode.cpp | 395 +++--- benchmark/benchmark_block_scan.cpp | 357 +++--- benchmark/benchmark_block_shuffle.cpp | 488 ++++---- .../benchmark_device_adjacent_difference.cpp | 312 +++-- benchmark/benchmark_device_batch_copy.cpp | 607 +++++---- benchmark/benchmark_device_batch_memcpy.cpp | 641 +++++----- benchmark/benchmark_device_histogram.cpp | 1091 +++++++---------- benchmark/benchmark_device_memory.cpp | 711 +++++------ benchmark/benchmark_device_merge_sort.cpp | 519 ++++---- benchmark/benchmark_device_partition.cpp | 724 +++++------ benchmark/benchmark_device_radix_sort.cpp | 786 +++++------- benchmark/benchmark_device_reduce.cpp | 305 +++-- benchmark/benchmark_device_reduce_by_key.cpp | 372 +++--- .../benchmark_device_run_length_encode.cpp | 569 ++++----- benchmark/benchmark_device_scan.cpp | 585 ++++----- .../benchmark_device_segmented_radix_sort.cpp | 843 ++++++------- .../benchmark_device_segmented_reduce.cpp | 432 +++---- benchmark/benchmark_device_segmented_sort.cpp | 911 ++++++-------- benchmark/benchmark_device_select.cpp | 979 ++++++--------- benchmark/benchmark_device_spmv.cpp | 367 +++--- benchmark/benchmark_utils.hpp | 705 +++++------ benchmark/benchmark_warp_exchange.cpp | 579 ++++----- benchmark/benchmark_warp_load.cpp | 437 +++---- benchmark/benchmark_warp_merge_sort.cpp | 842 ++++++------- benchmark/benchmark_warp_reduce.cpp | 401 +++--- benchmark/benchmark_warp_scan.cpp | 351 +++--- benchmark/benchmark_warp_store.cpp | 412 +++---- 35 files changed, 8551 insertions(+), 10863 deletions(-) diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp index bc3bf62a..70b28f63 100644 --- a/benchmark/benchmark_block_adjacent_difference.cpp +++ b/benchmark/benchmark_block_adjacent_difference.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -32,409 +32,341 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -template < - class Benchmark, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - typename... Args -> -__global__ -__launch_bounds__(BlockSize) -void kernel(Args ...args) -{ - Benchmark::template run(args...); +template +__global__ __launch_bounds__(BlockSize) void kernel(Args... args) { + Benchmark::template run(args...); } -template -struct minus -{ - HIPCUB_HOST_DEVICE inline - constexpr T operator()(const T& a, const T& b) const - { - return a - b; - } +template struct minus { + HIPCUB_HOST_DEVICE inline constexpr T operator()(const T &a, + const T &b) const { + return a - b; + } }; -struct subtract_left -{ - template - __device__ static void run(const T* d_input, T* d_output, unsigned int trials) - { - const unsigned int lid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - - hipcub::BlockAdjacentDifference adjacent_difference; - - #pragma nounroll - for(unsigned int trial = 0; trial < trials; trial++) - { - T output[ItemsPerThread]; - if(WithTile) - { - adjacent_difference.SubtractLeft(input, output, minus{}, T(123)); - } - else - { - adjacent_difference.SubtractLeft(input, output, minus{}); - } - - for(unsigned int i = 0; i < ItemsPerThread; ++i) - { - input[i] += output[i]; - } - - __syncthreads(); - } - - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); +struct subtract_left { + template + __device__ static void run(const T *d_input, T *d_output, + unsigned int trials) { + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + + hipcub::BlockAdjacentDifference adjacent_difference; + +#pragma nounroll + for (unsigned int trial = 0; trial < trials; trial++) { + T output[ItemsPerThread]; + if (WithTile) { + adjacent_difference.SubtractLeft(input, output, minus{}, T(123)); + } else { + adjacent_difference.SubtractLeft(input, output, minus{}); + } + + for (unsigned int i = 0; i < ItemsPerThread; ++i) { + input[i] += output[i]; + } + + __syncthreads(); } + + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); + } }; -struct subtract_left_partial_tile -{ - template - __device__ static void run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials) - { - const unsigned int lid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - - hipcub::BlockAdjacentDifference adjacent_difference; - - int tile_size = tile_sizes[blockIdx.x]; - - // Try to evenly distribute the length of tile_sizes between all the trials - const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; - - #pragma nounroll - for(unsigned int trial = 0; trial < trials; trial++) - { - T output[ItemsPerThread]; - - if(WithTile) - { - adjacent_difference.SubtractLeftPartialTile(input, - output, - minus{}, - tile_size, - T(123)); - } - else - { - adjacent_difference.SubtractLeftPartialTile(input, output, minus{}, tile_size); - } - - for(unsigned int i = 0; i < ItemsPerThread; ++i) - { - input[i] += output[i]; - } - - // Change the tile_size to even out the distribution - tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); - __syncthreads(); - } - - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); +struct subtract_left_partial_tile { + template + __device__ static void run(const T *d_input, const int *tile_sizes, + T *d_output, unsigned int trials) { + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + + hipcub::BlockAdjacentDifference adjacent_difference; + + int tile_size = tile_sizes[blockIdx.x]; + + // Try to evenly distribute the length of tile_sizes between all the trials + const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; + +#pragma nounroll + for (unsigned int trial = 0; trial < trials; trial++) { + T output[ItemsPerThread]; + + if (WithTile) { + adjacent_difference.SubtractLeftPartialTile(input, output, minus{}, + tile_size, T(123)); + } else { + adjacent_difference.SubtractLeftPartialTile(input, output, minus{}, + tile_size); + } + + for (unsigned int i = 0; i < ItemsPerThread; ++i) { + input[i] += output[i]; + } + + // Change the tile_size to even out the distribution + tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); + __syncthreads(); } + + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); + } }; -struct subtract_right -{ - template - __device__ static void run(const T* d_input, T* d_output, unsigned int trials) - { - const unsigned int lid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - - hipcub::BlockAdjacentDifference adjacent_difference; - - #pragma nounroll - for(unsigned int trial = 0; trial < trials; trial++) - { - T output[ItemsPerThread]; - if(WithTile) - { - adjacent_difference.SubtractRight(input, output, minus{}, T(123)); - } - else - { - adjacent_difference.SubtractRight(input, output, minus{}); - } - - for(unsigned int i = 0; i < ItemsPerThread; ++i) - { - input[i] += output[i]; - } - - __syncthreads(); - } - - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); +struct subtract_right { + template + __device__ static void run(const T *d_input, T *d_output, + unsigned int trials) { + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + + hipcub::BlockAdjacentDifference adjacent_difference; + +#pragma nounroll + for (unsigned int trial = 0; trial < trials; trial++) { + T output[ItemsPerThread]; + if (WithTile) { + adjacent_difference.SubtractRight(input, output, minus{}, T(123)); + } else { + adjacent_difference.SubtractRight(input, output, minus{}); + } + + for (unsigned int i = 0; i < ItemsPerThread; ++i) { + input[i] += output[i]; + } + + __syncthreads(); } + + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); + } }; -struct subtract_right_partial_tile -{ - template - __device__ static void run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials) - { - const unsigned int lid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; +struct subtract_right_partial_tile { + template + __device__ static void run(const T *d_input, const int *tile_sizes, + T *d_output, unsigned int trials) { + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - hipcub::BlockAdjacentDifference adjacent_difference; + hipcub::BlockAdjacentDifference adjacent_difference; - int tile_size = tile_sizes[blockIdx.x]; + int tile_size = tile_sizes[blockIdx.x]; - // Try to evenly distribute the length of tile_sizes between all the trials - const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; + // Try to evenly distribute the length of tile_sizes between all the trials + const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; - #pragma nounroll - for(unsigned int trial = 0; trial < trials; trial++) - { - T output[ItemsPerThread]; +#pragma nounroll + for (unsigned int trial = 0; trial < trials; trial++) { + T output[ItemsPerThread]; - adjacent_difference.SubtractRightPartialTile(input, output, minus{}, tile_size); + adjacent_difference.SubtractRightPartialTile(input, output, minus{}, + tile_size); - for(unsigned int i = 0; i < ItemsPerThread; ++i) - { - input[i] += output[i]; - } - - // Change the tile_size to even out the distribution - tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); - __syncthreads(); - } + for (unsigned int i = 0; i < ItemsPerThread; ++i) { + input[i] += output[i]; + } - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); + // Change the tile_size to even out the distribution + tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); + __syncthreads(); } -}; -template -auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) - -> std::enable_if_t::value - && !std::is_same::value> -{ - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto num_blocks = (N + items_per_block - 1) / items_per_block; - // Round up size to the next multiple of items_per_block - const auto size = num_blocks * items_per_block; - - const std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); - T* d_input; - T* d_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(input[0]), - hipMemcpyHostToDevice - ) - ); - - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(num_blocks), dim3(BlockSize), 0, stream, - d_input, d_output, Trials - ); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); + } +}; - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); +template +auto run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) + -> std::enable_if_t< + !std::is_same::value && + !std::is_same::value> { + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto num_blocks = (N + items_per_block - 1) / items_per_block; + // Round up size to the next multiple of items_per_block + const auto size = num_blocks * items_per_block; + + const std::vector input = + benchmark_utils::get_random_data(size, T(0), T(10)); + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), + hipMemcpyHostToDevice)); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel), + dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_output, + Trials); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -template -auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) - -> std::enable_if_t::value - || std::is_same::value> -{ - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto num_blocks = (N + items_per_block - 1) / items_per_block; - // Round up size to the next multiple of items_per_block - const auto size = num_blocks * items_per_block; - - const std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); - const std::vector tile_sizes - = benchmark_utils::get_random_data(num_blocks, 0, items_per_block); - - T* d_input; - int* d_tile_sizes; - T* d_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); - HIP_CHECK(hipMalloc(&d_tile_sizes, tile_sizes.size() * sizeof(tile_sizes[0]))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(input[0]), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_tile_sizes, tile_sizes.data(), - tile_sizes.size() * sizeof(tile_sizes[0]), - hipMemcpyHostToDevice - ) - ); - - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(num_blocks), dim3(BlockSize), 0, stream, - d_input, d_tile_sizes, d_output, Trials - ); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_tile_sizes)); - HIP_CHECK(hipFree(d_output)); +template +auto run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) + -> std::enable_if_t< + std::is_same::value || + std::is_same::value> { + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto num_blocks = (N + items_per_block - 1) / items_per_block; + // Round up size to the next multiple of items_per_block + const auto size = num_blocks * items_per_block; + + const std::vector input = + benchmark_utils::get_random_data(size, T(0), T(10)); + const std::vector tile_sizes = + benchmark_utils::get_random_data(num_blocks, 0, items_per_block); + + T *d_input; + int *d_tile_sizes; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); + HIP_CHECK( + hipMalloc(&d_tile_sizes, tile_sizes.size() * sizeof(tile_sizes[0]))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_tile_sizes, tile_sizes.data(), + tile_sizes.size() * sizeof(tile_sizes[0]), + hipMemcpyHostToDevice)); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel), + dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_tile_sizes, + d_output, Trials); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_tile_sizes)); + HIP_CHECK(hipFree(d_output)); } -# define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ - benchmark::RegisterBenchmark( \ - std::string("block_adjacent_difference.sub_algorithm_name:") + name \ - + std::string("" \ - ).c_str(), \ - &run_benchmark, \ - stream, \ - size \ - ) - - -#define BENCHMARK_TYPE(type, block, with_tile) \ - CREATE_BENCHMARK(type, block, 1, with_tile), \ - CREATE_BENCHMARK(type, block, 3, with_tile), \ - CREATE_BENCHMARK(type, block, 4, with_tile), \ - CREATE_BENCHMARK(type, block, 8, with_tile), \ - CREATE_BENCHMARK(type, block, 16, with_tile), \ - CREATE_BENCHMARK(type, block, 32, with_tile) - -template -void add_benchmarks(const std::string& name, - std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - std::vector bs = - { - BENCHMARK_TYPE(int, 256, false), - BENCHMARK_TYPE(float, 256, false), - BENCHMARK_TYPE(int8_t, 256, false), - BENCHMARK_TYPE(long long, 256, false), - BENCHMARK_TYPE(double, 256, false) - }; - - if(!std::is_same::value) - { - bs.insert(bs.end(), { - BENCHMARK_TYPE(int, 256, true), - BENCHMARK_TYPE(float, 256, true), - BENCHMARK_TYPE(int8_t, 256, true), - BENCHMARK_TYPE(long long, 256, true), - BENCHMARK_TYPE(double, 256, true) - }); - } - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ + benchmark::RegisterBenchmark( \ + std::string("block_adjacent_difference.sub_algorithm_name:") + \ + name + \ + std::string("") \ + .c_str(), \ + &run_benchmark, stream, size) + +#define BENCHMARK_TYPE(type, block, with_tile) \ + CREATE_BENCHMARK(type, block, 1, with_tile), \ + CREATE_BENCHMARK(type, block, 3, with_tile), \ + CREATE_BENCHMARK(type, block, 4, with_tile), \ + CREATE_BENCHMARK(type, block, 8, with_tile), \ + CREATE_BENCHMARK(type, block, 16, with_tile), \ + CREATE_BENCHMARK(type, block, 32, with_tile) + +template +void add_benchmarks(const std::string &name, + std::vector &benchmarks, + hipStream_t stream, size_t size) { + std::vector bs = { + BENCHMARK_TYPE(int, 256, false), BENCHMARK_TYPE(float, 256, false), + BENCHMARK_TYPE(int8_t, 256, false), BENCHMARK_TYPE(long long, 256, false), + BENCHMARK_TYPE(double, 256, false)}; + + if (!std::is_same::value) { + bs.insert(bs.end(), + {BENCHMARK_TYPE(int, 256, true), BENCHMARK_TYPE(float, 256, true), + BENCHMARK_TYPE(int8_t, 256, true), + BENCHMARK_TYPE(long long, 256, true), + BENCHMARK_TYPE(double, 256, true)}); + } + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_block_adjacent_difference" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks("SubtractLeft", benchmarks, stream, size); - add_benchmarks("SubtractRight", benchmarks, stream, size); - add_benchmarks("SubtractLeftPartialTile", benchmarks, stream, size); - add_benchmarks("SubtractRightPartialTile", benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_block_adjacent_difference" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks("SubtractLeft", benchmarks, stream, size); + add_benchmarks("SubtractRight", benchmarks, stream, size); + add_benchmarks("SubtractLeftPartialTile", + benchmarks, stream, size); + add_benchmarks("SubtractRightPartialTile", + benchmarks, stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } \ No newline at end of file diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp index 44babbe5..79a5aa33 100644 --- a/benchmark/benchmark_block_discontinuity.cpp +++ b/benchmark/benchmark_block_discontinuity.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,305 +25,236 @@ // HIP API #include "hipcub/block/block_discontinuity.hpp" -#include "hipcub/thread/thread_operators.hpp" //to use hipcub::Equality #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" - +#include "hipcub/thread/thread_operators.hpp" //to use hipcub::Equality #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -template -struct custom_flag_op1 -{ - HIPCUB_HOST_DEVICE - bool operator()(const T& a, const T& b) const - { - return (a == b); - } +template struct custom_flag_op1 { + HIPCUB_HOST_DEVICE + bool operator()(const T &a, const T &b) const { return (a == b); } }; -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials -> -__global__ -__launch_bounds__(BlockSize) -void kernel(const T * d_input, T * d_output) -{ - Runner::template run(d_input, d_output); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T *d_input, + T *d_output) { + Runner::template run( + d_input, d_output); } -struct flag_heads -{ - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) - { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - hipcub::BlockDiscontinuity bdiscontinuity; - bool head_flags[ItemsPerThread]; - if(WithTile) - { - bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality(), T(123)); - } - else - { - bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality()); - } - - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - input[i] += head_flags[i]; - } - __syncthreads(); - } - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); +struct flag_heads { + template + __device__ static void run(const T *d_input, T *d_output) { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = + hipBlockIdx_x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + hipcub::BlockDiscontinuity bdiscontinuity; + bool head_flags[ItemsPerThread]; + if (WithTile) { + bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality(), T(123)); + } else { + bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality()); + } + + for (unsigned int i = 0; i < ItemsPerThread; i++) { + input[i] += head_flags[i]; + } + __syncthreads(); } + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); + } }; -struct flag_tails -{ - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) - { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - hipcub::BlockDiscontinuity bdiscontinuity; - bool tail_flags[ItemsPerThread]; - if(WithTile) - { - bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality(), T(123)); - } - else - { - bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality()); - } - - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - input[i] += tail_flags[i]; - } - __syncthreads(); - } - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); +struct flag_tails { + template + __device__ static void run(const T *d_input, T *d_output) { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = + hipBlockIdx_x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + hipcub::BlockDiscontinuity bdiscontinuity; + bool tail_flags[ItemsPerThread]; + if (WithTile) { + bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality(), T(123)); + } else { + bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality()); + } + + for (unsigned int i = 0; i < ItemsPerThread; i++) { + input[i] += tail_flags[i]; + } + __syncthreads(); } + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); + } }; -struct flag_heads_and_tails -{ - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) - { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - hipcub::BlockDiscontinuity bdiscontinuity; - bool head_flags[ItemsPerThread]; - bool tail_flags[ItemsPerThread]; - if(WithTile) - { - bdiscontinuity.FlagHeadsAndTails(head_flags, T(123), tail_flags, T(234), input, hipcub::Equality()); - } - else - { - bdiscontinuity.FlagHeadsAndTails(head_flags, tail_flags, input, hipcub::Equality()); - } - - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - input[i] += head_flags[i]; - input[i] += tail_flags[i]; - } - __syncthreads(); - } - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); +struct flag_heads_and_tails { + template + __device__ static void run(const T *d_input, T *d_output) { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = + hipBlockIdx_x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + hipcub::BlockDiscontinuity bdiscontinuity; + bool head_flags[ItemsPerThread]; + bool tail_flags[ItemsPerThread]; + if (WithTile) { + bdiscontinuity.FlagHeadsAndTails(head_flags, T(123), tail_flags, T(234), + input, hipcub::Equality()); + } else { + bdiscontinuity.FlagHeadsAndTails(head_flags, tail_flags, input, + hipcub::Equality()); + } + + for (unsigned int i = 0; i < ItemsPerThread; i++) { + input[i] += head_flags[i]; + input[i] += tail_flags[i]; + } + __syncthreads(); } + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); + } }; -template< - class Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials = 100 -> -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -{ - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); - - std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); - T * d_input; - T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel), + dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, + d_output); + HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ -benchmark::RegisterBenchmark( \ - std::string("block_discontinuity.sub_algorithm_name:" \ - + name \ - + "." \ - ).c_str(), \ - &run_benchmark, \ - stream, size \ -) - -#define BENCHMARK_TYPE(type, block, bool) \ - CREATE_BENCHMARK(type, block, 1, bool), \ - CREATE_BENCHMARK(type, block, 2, bool), \ - CREATE_BENCHMARK(type, block, 3, bool), \ - CREATE_BENCHMARK(type, block, 4, bool), \ - CREATE_BENCHMARK(type, block, 8, bool) - - -template -void add_benchmarks(const std::string& name, - std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - std::vector bs = - { - BENCHMARK_TYPE(int, 256, false), - BENCHMARK_TYPE(int, 256, true), - BENCHMARK_TYPE(int8_t, 256, false), - BENCHMARK_TYPE(int8_t, 256, true), - BENCHMARK_TYPE(uint8_t, 256, false), - BENCHMARK_TYPE(uint8_t, 256, true), - BENCHMARK_TYPE(long long, 256, false), - BENCHMARK_TYPE(long long, 256, true), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ + benchmark::RegisterBenchmark( \ + std::string("block_discontinuity.sub_algorithm_name:" + \ + name + \ + ".") \ + .c_str(), \ + &run_benchmark, stream, size) + +#define BENCHMARK_TYPE(type, block, bool) \ + CREATE_BENCHMARK(type, block, 1, bool), \ + CREATE_BENCHMARK(type, block, 2, bool), \ + CREATE_BENCHMARK(type, block, 3, bool), \ + CREATE_BENCHMARK(type, block, 4, bool), \ + CREATE_BENCHMARK(type, block, 8, bool) + +template +void add_benchmarks(const std::string &name, + std::vector &benchmarks, + hipStream_t stream, size_t size) { + std::vector bs = { + BENCHMARK_TYPE(int, 256, false), + BENCHMARK_TYPE(int, 256, true), + BENCHMARK_TYPE(int8_t, 256, false), + BENCHMARK_TYPE(int8_t, 256, true), + BENCHMARK_TYPE(uint8_t, 256, false), + BENCHMARK_TYPE(uint8_t, 256, true), + BENCHMARK_TYPE(long long, 256, false), + BENCHMARK_TYPE(long long, 256, true), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_discontinuity" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks("flag_heads", benchmarks, stream, size); - add_benchmarks("flag_tails", benchmarks, stream, size); - add_benchmarks("flag_heads_and_tails", benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_discontinuity" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks("flag_heads", benchmarks, stream, size); + add_benchmarks("flag_tails", benchmarks, stream, size); + add_benchmarks("flag_heads_and_tails", benchmarks, + stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp index 952b9f92..d91a2297 100644 --- a/benchmark/benchmark_block_exchange.cpp +++ b/benchmark/benchmark_block_exchange.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -27,361 +27,304 @@ #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials -> -__global__ -__launch_bounds__(BlockSize) -void kernel(const T * d_input, const unsigned int * d_ranks, T * d_output) -{ - Runner::template run(d_input, d_ranks, d_output); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T *d_input, + const unsigned int *d_ranks, + T *d_output) { + Runner::template run(d_input, d_ranks, + d_output); } -struct blocked_to_striped -{ - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, const unsigned int *, T * d_output) - { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); - - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - hipcub::BlockExchange exchange; - exchange.BlockedToStriped(input, input); - __syncthreads(); // extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). - } - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); +struct blocked_to_striped { + template + __device__ static void run(const T *d_input, const unsigned int *, + T *d_output) { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = + hipBlockIdx_x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + hipcub::BlockExchange exchange; + exchange.BlockedToStriped(input, input); + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). } + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); + } }; -struct striped_to_blocked -{ - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, const unsigned int *, T * d_output) - { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - hipcub::BlockExchange exchange; - exchange.StripedToBlocked(input, input); - __syncthreads();// extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). - } - hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); +struct striped_to_blocked { + template + __device__ static void run(const T *d_input, const unsigned int *, + T *d_output) { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = + hipBlockIdx_x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + hipcub::BlockExchange exchange; + exchange.StripedToBlocked(input, input); + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). } + hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); + } }; -struct blocked_to_warp_striped -{ - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, const unsigned int *, T * d_output) - { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - hipcub::BlockExchange exchange; - exchange.BlockedToWarpStriped(input, input); - __syncthreads();// extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). - } - hipcub::StoreDirectWarpStriped(lid, d_output + block_offset, input); +struct blocked_to_warp_striped { + template + __device__ static void run(const T *d_input, const unsigned int *, + T *d_output) { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = + hipBlockIdx_x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + hipcub::BlockExchange exchange; + exchange.BlockedToWarpStriped(input, input); + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). } + hipcub::StoreDirectWarpStriped(lid, d_output + block_offset, input); + } }; -struct warp_striped_to_blocked -{ - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, const unsigned int *, T * d_output) - { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectWarpStriped(lid, d_input + block_offset, input); - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - hipcub::BlockExchange exchange; - exchange.WarpStripedToBlocked(input, input); - __syncthreads(); // extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). - } - hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); +struct warp_striped_to_blocked { + template + __device__ static void run(const T *d_input, const unsigned int *, + T *d_output) { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = + hipBlockIdx_x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectWarpStriped(lid, d_input + block_offset, input); + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + hipcub::BlockExchange exchange; + exchange.WarpStripedToBlocked(input, input); + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). } + hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); + } }; -struct scatter_to_blocked -{ - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) - { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - unsigned int ranks[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - hipcub::BlockExchange exchange; - exchange.ScatterToBlocked(input, input, ranks); - __syncthreads();// extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). - } - hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); +struct scatter_to_blocked { + template + __device__ static void run(const T *d_input, const unsigned int *d_ranks, + T *d_output) { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = + hipBlockIdx_x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + unsigned int ranks[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + hipcub::BlockExchange exchange; + exchange.ScatterToBlocked(input, input, ranks); + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). } + hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); + } }; -struct scatter_to_striped -{ - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) - { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - unsigned int ranks[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - hipcub::BlockExchange exchange; - exchange.ScatterToStriped(input, input, ranks); - __syncthreads(); // extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). - } - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); +struct scatter_to_striped { + template + __device__ static void run(const T *d_input, const unsigned int *d_ranks, + T *d_output) { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = + hipBlockIdx_x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + unsigned int ranks[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + hipcub::BlockExchange exchange; + exchange.ScatterToStriped(input, input, ranks); + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). } + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); + } }; -template< - class Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials = 100 -> -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -{ - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); - - std::vector input(size); - // Fill input - for(size_t i = 0; i < size; i++) - { - input[i] = T(i); - } - std::vector ranks(size); - // Fill ranks (for scatter operations) - std::mt19937 gen; - for(size_t bi = 0; bi < size / items_per_block; bi++) - { - auto block_ranks = ranks.begin() + bi * items_per_block; - std::iota(block_ranks, block_ranks + items_per_block, 0); - std::shuffle(block_ranks, block_ranks + items_per_block, gen); - } - T * d_input; - unsigned int * d_ranks; - T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_ranks, size * sizeof(unsigned int))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_ranks, ranks.data(), - size * sizeof(unsigned int), - hipMemcpyHostToDevice - ) - ); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input(size); + // Fill input + for (size_t i = 0; i < size; i++) { + input[i] = T(i); + } + std::vector ranks(size); + // Fill ranks (for scatter operations) + std::mt19937 gen; + for (size_t bi = 0; bi < size / items_per_block; bi++) { + auto block_ranks = ranks.begin() + bi * items_per_block; + std::iota(block_ranks, block_ranks + items_per_block, 0); + std::shuffle(block_ranks, block_ranks + items_per_block, gen); + } + T *d_input; + unsigned int *d_ranks; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_ranks, size * sizeof(unsigned int))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_ranks, ranks.data(), size * sizeof(unsigned int), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel), + dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, + d_ranks, d_output); + HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_ranks, d_output - ); - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_ranks)); - HIP_CHECK(hipFree(d_output)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_ranks)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT) \ -benchmark::RegisterBenchmark( \ - std::string("block_exchange.sub_algorithm_name:" \ - + name \ - ).c_str(), \ - &run_benchmark, \ - stream, size \ -) - -#define BENCHMARK_TYPE(type, block) \ - CREATE_BENCHMARK(type, block, 1), \ - CREATE_BENCHMARK(type, block, 2), \ - CREATE_BENCHMARK(type, block, 3), \ - CREATE_BENCHMARK(type, block, 4), \ - CREATE_BENCHMARK(type, block, 7), \ - CREATE_BENCHMARK(type, block, 8) - -template -void add_benchmarks(const std::string& name, - std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = - { - BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(int8_t, 256), - BENCHMARK_TYPE(long long, 256), - BENCHMARK_TYPE(custom_float2, 256), - BENCHMARK_TYPE(custom_double2, 256), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + std::string("block_exchange.sub_algorithm_name:" + \ + name) \ + .c_str(), \ + &run_benchmark, stream, size) + +#define BENCHMARK_TYPE(type, block) \ + CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ + CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ + CREATE_BENCHMARK(type, block, 7), CREATE_BENCHMARK(type, block, 8) + +template +void add_benchmarks(const std::string &name, + std::vector &benchmarks, + hipStream_t stream, size_t size) { + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(int8_t, 256), + BENCHMARK_TYPE(long long, 256), + BENCHMARK_TYPE(custom_float2, 256), + BENCHMARK_TYPE(custom_double2, 256), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_exchange" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks("blocked_to_striped", benchmarks, stream, size); - add_benchmarks("striped_to_blocked", benchmarks, stream, size); - add_benchmarks("blocked_to_warp_striped", benchmarks, stream, size); - add_benchmarks("warp_striped_to_blocked", benchmarks, stream, size); - add_benchmarks("scatter_to_blocked", benchmarks, stream, size); - add_benchmarks("scatter_to_striped", benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_exchange" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks("blocked_to_striped", benchmarks, stream, + size); + add_benchmarks("striped_to_blocked", benchmarks, stream, + size); + add_benchmarks("blocked_to_warp_striped", benchmarks, + stream, size); + add_benchmarks("warp_striped_to_blocked", benchmarks, + stream, size); + add_benchmarks("scatter_to_blocked", benchmarks, stream, + size); + add_benchmarks("scatter_to_striped", benchmarks, stream, + size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp index dd202e36..b9fe9f54 100644 --- a/benchmark/benchmark_block_histogram.cpp +++ b/benchmark/benchmark_block_histogram.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,217 +25,171 @@ // HIP API #include "hipcub/block/block_histogram.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int BinSize, - unsigned int Trials -> -__global__ -__launch_bounds__(BlockSize) -void kernel(const T* input, T* output) -{ - Runner::template run(input, output); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T *input, T *output) { + Runner::template run(input, + output); } -template -struct histogram -{ - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int BinSize, - unsigned int Trials - > - __device__ - static void run(const T* input, T* output) - { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - unsigned int global_offset = hipBlockIdx_x * BinSize; - - T values[ItemsPerThread]; - for(unsigned int k = 0; k < ItemsPerThread; k++) - { - values[k] = input[index + k]; - } - - using bhistogram_t = hipcub::BlockHistogram; - __shared__ T histogram[BinSize]; - __shared__ typename bhistogram_t::TempStorage storage; - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - bhistogram_t(storage).Histogram(values, histogram); - } - - #pragma unroll - for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) - { - if(offset + hipThreadIdx_x < BinSize) - { - output[global_offset + hipThreadIdx_x] = histogram[offset + hipThreadIdx_x]; - global_offset += BlockSize; - } - } +template struct histogram { + template + __device__ static void run(const T *input, T *output) { + const unsigned int index = + ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + unsigned int global_offset = hipBlockIdx_x * BinSize; + + T values[ItemsPerThread]; + for (unsigned int k = 0; k < ItemsPerThread; k++) { + values[k] = input[index + k]; + } + + using bhistogram_t = hipcub::BlockHistogram; + __shared__ T histogram[BinSize]; + __shared__ typename bhistogram_t::TempStorage storage; + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + bhistogram_t(storage).Histogram(values, histogram); + } + +#pragma unroll + for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) { + if (offset + hipThreadIdx_x < BinSize) { + output[global_offset + hipThreadIdx_x] = + histogram[offset + hipThreadIdx_x]; + global_offset += BlockSize; + } } + } }; -template< - class Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int BinSize = BlockSize, - unsigned int Trials = 100 -> -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -{ - // Make sure size is a multiple of BlockSize - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); - const auto bin_size = BinSize * ((N + items_per_block - 1)/items_per_block); - // Allocate and fill memory - std::vector input(size, 0.0f); - T * d_input; - T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, bin_size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { + // Make sure size is a multiple of BlockSize + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + const auto bin_size = BinSize * ((N + items_per_block - 1) / items_per_block); + // Allocate and fill memory + std::vector input(size, 0.0f); + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, bin_size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel), + dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, + d_output); + HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); - state.SetItemsProcessed(state.iterations() * size * Trials); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); + state.SetItemsProcessed(state.iterations() * size * Trials); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - std::string("block_histogram.method_name:" \ - + method_name \ - ).c_str(), \ - &run_benchmark, \ - stream, size \ - ) - -#define BENCHMARK_TYPE(type, block) \ - CREATE_BENCHMARK(type, block, 1), \ - CREATE_BENCHMARK(type, block, 2), \ - CREATE_BENCHMARK(type, block, 3), \ - CREATE_BENCHMARK(type, block, 4), \ - CREATE_BENCHMARK(type, block, 8), \ - CREATE_BENCHMARK(type, block, 16) - -template -void add_benchmarks(std::vector& benchmarks, - const std::string& method_name, - const std::string& algorithm_name, - hipStream_t stream, - size_t size) -{ - std::vector new_benchmarks = - { - BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(int, 320), - BENCHMARK_TYPE(int, 512), - - BENCHMARK_TYPE(unsigned long long, 256), - BENCHMARK_TYPE(unsigned long long, 320) - }; - benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + std::string("block_histogram.method_name:" + method_name) \ + .c_str(), \ + &run_benchmark, stream, size) + +#define BENCHMARK_TYPE(type, block) \ + CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ + CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ + CREATE_BENCHMARK(type, block, 8), CREATE_BENCHMARK(type, block, 16) + +template +void add_benchmarks(std::vector &benchmarks, + const std::string &method_name, + const std::string &algorithm_name, hipStream_t stream, + size_t size) { + std::vector new_benchmarks = { + BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 320), + BENCHMARK_TYPE(int, 512), + + BENCHMARK_TYPE(unsigned long long, 256), + BENCHMARK_TYPE(unsigned long long, 320)}; + benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), + new_benchmarks.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_histogram" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - // using_atomic - using histogram_a_t = histogram; - add_benchmarks( - benchmarks, "histogram", "using_atomic", stream, size - ); - // using_sort - using histogram_s_t = histogram; - add_benchmarks( - benchmarks, "histogram", "using_sort", stream, size - ); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_histogram" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + // using_atomic + using histogram_a_t = + histogram; + add_benchmarks(benchmarks, "histogram", "using_atomic", stream, + size); + // using_sort + using histogram_s_t = + histogram; + add_benchmarks(benchmarks, "histogram", "using_sort", stream, + size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_merge_sort.cpp b/benchmark/benchmark_block_merge_sort.cpp index 0d502390..2c6b62aa 100644 --- a/benchmark/benchmark_block_merge_sort.cpp +++ b/benchmark/benchmark_block_merge_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -24,254 +24,192 @@ #include "../test/hipcub/test_utils_sort_comparator.hpp" // HIP API -#include "hipcub/block/block_merge_sort.hpp" #include "hipcub/block/block_load.hpp" +#include "hipcub/block/block_merge_sort.hpp" #include "hipcub/block/block_store.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -enum class benchmark_kinds -{ - sort_keys, - sort_pairs -}; - -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - class CompareOp, - unsigned int Trials -> +enum class benchmark_kinds { sort_keys, sort_pairs }; + +template __global__ -__launch_bounds__(BlockSize) -void sort_keys_kernel(const T * input, T * output, CompareOp compare_op) -{ - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T keys[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, input + block_offset, keys); - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - hipcub::BlockMergeSort sort; - sort.Sort(keys, compare_op); - } +__launch_bounds__(BlockSize) void sort_keys_kernel(const T *input, T *output, + CompareOp compare_op) { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + + T keys[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, input + block_offset, keys); - hipcub::StoreDirectStriped(lid, output + block_offset, keys); +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + hipcub::BlockMergeSort sort; + sort.Sort(keys, compare_op); + } + + hipcub::StoreDirectStriped(lid, output + block_offset, keys); } -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - class CompareOp, - unsigned int Trials -> +template __global__ -__launch_bounds__(BlockSize) -void sort_pairs_kernel(const T * input, T * output, CompareOp compare_op) -{ - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T keys[ItemsPerThread]; - T values[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, input + block_offset, keys); - - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - values[i] = keys[i] + T(1); - } +__launch_bounds__(BlockSize) void sort_pairs_kernel(const T *input, T *output, + CompareOp compare_op) { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + + T keys[ItemsPerThread]; + T values[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, input + block_offset, keys); + + for (unsigned int i = 0; i < ItemsPerThread; i++) { + values[i] = keys[i] + T(1); + } + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + hipcub::BlockMergeSort sort; + sort.Sort(keys, values, compare_op); + } + + for (unsigned int i = 0; i < ItemsPerThread; i++) { + keys[i] += values[i]; + } + hipcub::StoreDirectStriped(lid, output + block_offset, keys); +} - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - hipcub::BlockMergeSort sort; - sort.Sort(keys, values, compare_op); +template +void run_benchmark(benchmark::State &state, benchmark_kinds benchmark_kind, + hipStream_t stream, size_t N) { + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input; + if (std::is_floating_point::value) { + input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); + } else { + input = benchmark_utils::get_random_data( + size, std::numeric_limits::min(), std::numeric_limits::max()); + } + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + if (benchmark_kind == benchmark_kinds::sort_keys) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sort_keys_kernel), + dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, + d_output, CompareOp()); + } else if (benchmark_kind == benchmark_kinds::sort_pairs) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sort_pairs_kernel), + dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, + d_output, CompareOp()); } + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - keys[i] += values[i]; - } - hipcub::StoreDirectStriped(lid, output + block_offset, keys); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - class CompareOp = test_utils::less, - unsigned int Trials = 10 -> -void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipStream_t stream, size_t N) -{ - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); - - std::vector input; - if(std::is_floating_point::value) - { - input = benchmark_utils::get_random_data(size, (T)-1000, (T)+1000); - } - else - { - input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); - } - T * d_input; - T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK(hipDeviceSynchronize()); +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + std::string("block_merge_sort.sub_algorithm_name:" + \ + name) \ + .c_str(), \ + &run_benchmark, benchmark_kind, stream, size) - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - if(benchmark_kind == benchmark_kinds::sort_keys) - { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_keys_kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output, CompareOp() - ); - } - else if(benchmark_kind == benchmark_kinds::sort_pairs) - { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_pairs_kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output, CompareOp() - ); - } - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); +#define BENCHMARK_TYPE(type, block) \ + CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ + CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ + CREATE_BENCHMARK(type, block, 8) - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); -} +void add_benchmarks(benchmark_kinds benchmark_kind, const std::string &name, + std::vector &benchmarks, + hipStream_t stream, size_t size) { + std::vector bs = { + BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(int, 128), + BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 512), -#define CREATE_BENCHMARK(T, BS, IPT) \ -benchmark::RegisterBenchmark( \ - std::string("block_merge_sort.sub_algorithm_name:" \ - + name \ - ).c_str(), \ - &run_benchmark, \ - benchmark_kind, stream, size \ -) - -#define BENCHMARK_TYPE(type, block) \ - CREATE_BENCHMARK(type, block, 1), \ - CREATE_BENCHMARK(type, block, 2), \ - CREATE_BENCHMARK(type, block, 3), \ - CREATE_BENCHMARK(type, block, 4), \ - CREATE_BENCHMARK(type, block, 8) - -void add_benchmarks(benchmark_kinds benchmark_kind, - const std::string& name, - std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - std::vector bs = - { - BENCHMARK_TYPE(int, 64), - BENCHMARK_TYPE(int, 128), - BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(int, 512), - - BENCHMARK_TYPE(int8_t, 64), - BENCHMARK_TYPE(int8_t, 128), - BENCHMARK_TYPE(int8_t, 256), - BENCHMARK_TYPE(int8_t, 512), - - BENCHMARK_TYPE(uint8_t, 64), - BENCHMARK_TYPE(uint8_t, 128), - BENCHMARK_TYPE(uint8_t, 256), - BENCHMARK_TYPE(uint8_t, 512), - - BENCHMARK_TYPE(long long, 64), - BENCHMARK_TYPE(long long, 128), - BENCHMARK_TYPE(long long, 256), - BENCHMARK_TYPE(long long, 512) - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); -} + BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(int8_t, 128), + BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(int8_t, 512), -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_merge_sort" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, size); - add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)", benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } + BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(uint8_t, 128), + BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(uint8_t, 512), + + BENCHMARK_TYPE(long long, 64), BENCHMARK_TYPE(long long, 128), + BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(long long, 512)}; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +} - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_merge_sort" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, + size); + add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)", benchmarks, + stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_radix_rank.cpp b/benchmark/benchmark_block_radix_rank.cpp index e351b6c2..e9d1f474 100644 --- a/benchmark/benchmark_block_radix_rank.cpp +++ b/benchmark/benchmark_block_radix_rank.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -33,144 +33,116 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -enum class RadixRankAlgorithm -{ - RADIX_RANK_BASIC, - RADIX_RANK_MEMOIZE, - RADIX_RANK_MATCH, +enum class RadixRankAlgorithm { + RADIX_RANK_BASIC, + RADIX_RANK_MEMOIZE, + RADIX_RANK_MATCH, }; -template -__global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_input, int* ranks_output) -{ - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T keys[ItemsPerThread]; - hipcub::LoadDirectBlocked(lid, keys_input + block_offset, keys); - - using KeyTraits = hipcub::Traits; - using UnsignedBits = typename KeyTraits::UnsignedBits; - using DigitExtractor = hipcub::BFEDigitExtractor; - - UnsignedBits(&unsigned_keys)[ItemsPerThread] - = reinterpret_cast(keys); - - using RankType = std::conditional_t< - BenchmarkKind == RadixRankAlgorithm::RADIX_RANK_MATCH, - hipcub::BlockRadixRankMatch, - hipcub::BlockRadixRank>; +template +__global__ __launch_bounds__(BlockSize) void rank_kernel(const T *keys_input, + int *ranks_output) { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + + T keys[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, keys_input + block_offset, keys); + + using KeyTraits = hipcub::Traits; + using UnsignedBits = typename KeyTraits::UnsignedBits; + using DigitExtractor = hipcub::BFEDigitExtractor; + + UnsignedBits(&unsigned_keys)[ItemsPerThread] = + reinterpret_cast(keys); + + using RankType = std::conditional_t< + BenchmarkKind == RadixRankAlgorithm::RADIX_RANK_MATCH, + hipcub::BlockRadixRankMatch, + hipcub::BlockRadixRank>; #pragma unroll - for(unsigned int key = 0; key < ItemsPerThread; key++) - { - unsigned_keys[key] = KeyTraits::TwiddleIn(unsigned_keys[key]); - } + for (unsigned int key = 0; key < ItemsPerThread; key++) { + unsigned_keys[key] = KeyTraits::TwiddleIn(unsigned_keys[key]); + } - int ranks[ItemsPerThread]; + int ranks[ItemsPerThread]; #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - __shared__ typename RankType::TempStorage storage; - RankType rank(storage); - unsigned begin_bit = 0; - const unsigned end_bit = sizeof(T) * 8; - - while(begin_bit < end_bit) - { - const unsigned pass_bits = min(RadixBits, end_bit - begin_bit); - DigitExtractor digit_extractor(begin_bit, pass_bits); - - rank.RankKeys(unsigned_keys, ranks, digit_extractor); - begin_bit += RadixBits; - } + for (unsigned int trial = 0; trial < Trials; trial++) { + __shared__ typename RankType::TempStorage storage; + RankType rank(storage); + unsigned begin_bit = 0; + const unsigned end_bit = sizeof(T) * 8; + + while (begin_bit < end_bit) { + const unsigned pass_bits = min(RadixBits, end_bit - begin_bit); + DigitExtractor digit_extractor(begin_bit, pass_bits); + + rank.RankKeys(unsigned_keys, ranks, digit_extractor); + begin_bit += RadixBits; } + } - hipcub::StoreDirectBlocked(lid, ranks_output + block_offset, ranks); + hipcub::StoreDirectBlocked(lid, ranks_output + block_offset, ranks); } -template -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -{ - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int size = items_per_block * ((N + items_per_block - 1) / items_per_block); - - std::vector input; - if(std::is_floating_point::value) - { - input = benchmark_utils::get_random_data(size, - static_cast(-1000), - static_cast(1000)); - } - else - { - input = benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - } - T* d_input; - int* d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(int))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + const unsigned int size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input; + if (std::is_floating_point::value) { + input = benchmark_utils::get_random_data(size, static_cast(-1000), + static_cast(1000)); + } else { + input = benchmark_utils::get_random_data( + size, std::numeric_limits::min(), std::numeric_limits::max()); + } + T *d_input; + int *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(int))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(rank_kernel), + dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, + d_output); + HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - rank_kernel), - dim3(size / items_per_block), - dim3(BlockSize), - 0, - stream, - d_input, - d_output); - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds - = std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, KIND, BS, IPT) \ - benchmark::RegisterBenchmark( \ - std::string("block_radix_rank." \ - + name \ - ).c_str(), \ - &run_benchmark, \ - stream, \ - size \ - ) - +#define CREATE_BENCHMARK(T, KIND, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_radix_rank." + \ + name) \ + .c_str(), \ + &run_benchmark, stream, size) // clang-format off #define CREATE_BENCHMARK_KINDS(type, block, ipt) \ @@ -186,71 +158,62 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) CREATE_BENCHMARK_KINDS(type, block, 32) // clang-format on -void add_benchmarks(const std::string& name, - std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - std::vector bs = { - BENCHMARK_TYPE(int, 128), - BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(int, 512), - - BENCHMARK_TYPE(uint8_t, 128), - BENCHMARK_TYPE(uint8_t, 256), - BENCHMARK_TYPE(uint8_t, 512), - - BENCHMARK_TYPE(long long, 128), - BENCHMARK_TYPE(long long, 256), - BENCHMARK_TYPE(long long, 512), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); -} +void add_benchmarks(const std::string &name, + std::vector &benchmarks, + hipStream_t stream, size_t size) { + std::vector bs = { + BENCHMARK_TYPE(int, 128), BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(int, 512), -int main(int argc, char* argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_block_radix_rank" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks("rank", benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } + BENCHMARK_TYPE(uint8_t, 128), BENCHMARK_TYPE(uint8_t, 256), + BENCHMARK_TYPE(uint8_t, 512), + + BENCHMARK_TYPE(long long, 128), BENCHMARK_TYPE(long long, 256), + BENCHMARK_TYPE(long long, 512), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +} - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_block_radix_rank" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks("rank", benchmarks, stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp index c63a566b..d31fbbac 100644 --- a/benchmark/benchmark_block_radix_sort.cpp +++ b/benchmark/benchmark_block_radix_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -23,241 +23,199 @@ #include "common_benchmark_header.hpp" // HIP API -#include "hipcub/block/block_radix_sort.hpp" #include "hipcub/block/block_load.hpp" +#include "hipcub/block/block_radix_sort.hpp" #include "hipcub/block/block_store.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -enum class benchmark_kinds -{ - sort_keys, - sort_pairs +enum class benchmark_kinds { sort_keys, sort_pairs }; + +struct helper_blocked_blocked { + template + HIPCUB_DEVICE static void load(int linear_id, InputIteratorT block_iter, + T (&items)[ItemsPerThread]) { + hipcub::LoadDirectStriped(linear_id, block_iter, items); + } + + template + HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread]) { + hipcub::BlockRadixSort sort; + sort.Sort(keys); + } + + template + HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread], + T (&values)[ItemsPerThread]) { + hipcub::BlockRadixSort sort; + sort.Sort(keys, values); + } + + template + HIPCUB_DEVICE static void + sort(benchmark_utils::custom_type (&keys)[ItemsPerThread]) { + using custom_t = benchmark_utils::custom_type; + hipcub::BlockRadixSort sort; + sort.Sort(keys, benchmark_utils::custom_type_decomposer{}); + } + + template + HIPCUB_DEVICE static void + sort(benchmark_utils::custom_type (&keys)[ItemsPerThread], + benchmark_utils::custom_type (&values)[ItemsPerThread]) { + using custom_t = benchmark_utils::custom_type; + hipcub::BlockRadixSort sort; + sort.Sort(keys, values, + benchmark_utils::custom_type_decomposer{}); + } }; -struct helper_blocked_blocked -{ - template - HIPCUB_DEVICE static void - load(int linear_id, InputIteratorT block_iter, T (&items)[ItemsPerThread]) - { - hipcub::LoadDirectStriped(linear_id, block_iter, items); - } - - template - HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread]) - { - hipcub::BlockRadixSort sort; - sort.Sort(keys); - } - - template - HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread], T (&values)[ItemsPerThread]) - { - hipcub::BlockRadixSort sort; - sort.Sort(keys, values); - } - - template - HIPCUB_DEVICE static void sort(benchmark_utils::custom_type (&keys)[ItemsPerThread]) - { - using custom_t = benchmark_utils::custom_type; - hipcub::BlockRadixSort sort; - sort.Sort(keys, benchmark_utils::custom_type_decomposer{}); - } - - template - HIPCUB_DEVICE static void sort(benchmark_utils::custom_type (&keys)[ItemsPerThread], - benchmark_utils::custom_type (&values)[ItemsPerThread]) - { - using custom_t = benchmark_utils::custom_type; - hipcub::BlockRadixSort sort; - sort.Sort(keys, values, benchmark_utils::custom_type_decomposer{}); - } +struct helper_blocked_striped { + template + HIPCUB_DEVICE static void load(int linear_id, InputIteratorT block_iter, + T (&items)[ItemsPerThread]) { + hipcub::LoadDirectBlocked(linear_id, block_iter, items); + } + + template + HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread]) { + hipcub::BlockRadixSort sort; + sort.SortBlockedToStriped(keys); + } + + template + HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread], + T (&values)[ItemsPerThread]) { + hipcub::BlockRadixSort sort; + sort.SortBlockedToStriped(keys, values); + } + + template + HIPCUB_DEVICE static void + sort(benchmark_utils::custom_type (&keys)[ItemsPerThread]) { + using custom_t = benchmark_utils::custom_type; + hipcub::BlockRadixSort sort; + sort.SortBlockedToStriped( + keys, benchmark_utils::custom_type_decomposer{}); + } + + template + HIPCUB_DEVICE static void + sort(benchmark_utils::custom_type (&keys)[ItemsPerThread], + benchmark_utils::custom_type (&values)[ItemsPerThread]) { + using custom_t = benchmark_utils::custom_type; + hipcub::BlockRadixSort sort; + sort.SortBlockedToStriped( + keys, values, benchmark_utils::custom_type_decomposer{}); + } }; -struct helper_blocked_striped -{ - template - HIPCUB_DEVICE static void - load(int linear_id, InputIteratorT block_iter, T (&items)[ItemsPerThread]) - { - hipcub::LoadDirectBlocked(linear_id, block_iter, items); - } +template +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T *input, + T *output) { + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - template - HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread]) - { - hipcub::BlockRadixSort sort; - sort.SortBlockedToStriped(keys); - } + T keys[ItemsPerThread]; + Helper::template load(lid, input + block_offset, keys); - template - HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread], T (&values)[ItemsPerThread]) - { - hipcub::BlockRadixSort sort; - sort.SortBlockedToStriped(keys, values); - } - - template - HIPCUB_DEVICE static void sort(benchmark_utils::custom_type (&keys)[ItemsPerThread]) - { - using custom_t = benchmark_utils::custom_type; - hipcub::BlockRadixSort sort; - sort.SortBlockedToStriped(keys, benchmark_utils::custom_type_decomposer{}); - } +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + Helper::template sort(keys); + } - template - HIPCUB_DEVICE static void sort(benchmark_utils::custom_type (&keys)[ItemsPerThread], - benchmark_utils::custom_type (&values)[ItemsPerThread]) - { - using custom_t = benchmark_utils::custom_type; - hipcub::BlockRadixSort sort; - sort.SortBlockedToStriped(keys, - values, - benchmark_utils::custom_type_decomposer{}); - } -}; + hipcub::StoreDirectStriped(lid, output + block_offset, keys); +} -template -__global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, T* output) -{ - const unsigned int lid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - - T keys[ItemsPerThread]; - Helper::template load(lid, input + block_offset, keys); - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - Helper::template sort(keys); - } +template +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T *input, + T *output) { + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - hipcub::StoreDirectStriped(lid, output + block_offset, keys); -} + T keys[ItemsPerThread]; + T values[ItemsPerThread]; + Helper::template load(lid, input + block_offset, keys); -template -__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, T* output) -{ - const unsigned int lid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - - T keys[ItemsPerThread]; - T values[ItemsPerThread]; - Helper::template load(lid, input + block_offset, keys); - - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - values[i] = keys[i] + T(1); - } + for (unsigned int i = 0; i < ItemsPerThread; i++) { + values[i] = keys[i] + T(1); + } - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - Helper::template sort(keys, values); - } +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + Helper::template sort(keys, values); + } - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - keys[i] += values[i]; - } + for (unsigned int i = 0; i < ItemsPerThread; i++) { + keys[i] += values[i]; + } - hipcub::StoreDirectStriped(lid, output + block_offset, keys); + hipcub::StoreDirectStriped(lid, output + block_offset, keys); } -template -void run_benchmark(benchmark::State& state, - benchmark_kinds benchmark_kind, - hipStream_t stream, - size_t N) -{ - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); - - std::vector input; - if(std::is_floating_point::value) - { - input = benchmark_utils::get_random_data(size, (T)-1000, (T)+1000); +template +void run_benchmark(benchmark::State &state, benchmark_kinds benchmark_kind, + hipStream_t stream, size_t N) { + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input; + if (std::is_floating_point::value) { + input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); + } else { + input = benchmark_utils::get_random_data( + size, std::numeric_limits::min(), std::numeric_limits::max()); + } + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + if (benchmark_kind == benchmark_kinds::sort_keys) { + sort_keys_kernel + <<>>( + d_input, d_output); + } else if (benchmark_kind == benchmark_kinds::sort_pairs) { + sort_pairs_kernel + <<>>( + d_input, d_output); } - else - { - input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); - } - T * d_input; - T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - if(benchmark_kind == benchmark_kinds::sort_keys) - { - sort_keys_kernel - <<>>(d_input, d_output); - } - else if(benchmark_kind == benchmark_kinds::sort_pairs) - { - sort_pairs_kernel - <<>>(d_input, d_output); - } - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - std::string("block_radix_sort.sub_algorithm_name:" \ - + name \ - ).c_str(), \ - &run_benchmark, \ - benchmark_kind, \ - stream, \ - size \ - ) +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + std::string("block_radix_sort.sub_algorithm_name:" + \ + name) \ + .c_str(), \ + &run_benchmark, benchmark_kind, stream, size) // clang-format off #define BENCHMARK_TYPE(type, block) \ @@ -267,61 +225,57 @@ void run_benchmark(benchmark::State& state, CREATE_BENCHMARK(type, block, 8) // clang-format on -template -void add_benchmarks(benchmark_kinds benchmark_kind, - const std::string& name, - std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_int_t = benchmark_utils::custom_type; - - std::vector bs = { - BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(int, 128), - BENCHMARK_TYPE(int, 192), BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(int, 320), BENCHMARK_TYPE(int, 512), - - BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(int8_t, 128), - BENCHMARK_TYPE(int8_t, 192), BENCHMARK_TYPE(int8_t, 256), - BENCHMARK_TYPE(int8_t, 320), BENCHMARK_TYPE(int8_t, 512), - - BENCHMARK_TYPE(long long, 64), BENCHMARK_TYPE(long long, 128), - BENCHMARK_TYPE(long long, 192), BENCHMARK_TYPE(long long, 256), - BENCHMARK_TYPE(long long, 320), BENCHMARK_TYPE(long long, 512), - - BENCHMARK_TYPE(custom_int_t, 64), BENCHMARK_TYPE(custom_int_t, 128), - BENCHMARK_TYPE(custom_int_t, 192), BENCHMARK_TYPE(custom_int_t, 256), - BENCHMARK_TYPE(custom_int_t, 320), BENCHMARK_TYPE(custom_int_t, 512), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +template +void add_benchmarks(benchmark_kinds benchmark_kind, const std::string &name, + std::vector &benchmarks, + hipStream_t stream, size_t size) { + using custom_int_t = benchmark_utils::custom_type; + + std::vector bs = { + BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(int, 128), + BENCHMARK_TYPE(int, 192), BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(int, 320), BENCHMARK_TYPE(int, 512), + + BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(int8_t, 128), + BENCHMARK_TYPE(int8_t, 192), BENCHMARK_TYPE(int8_t, 256), + BENCHMARK_TYPE(int8_t, 320), BENCHMARK_TYPE(int8_t, 512), + + BENCHMARK_TYPE(long long, 64), BENCHMARK_TYPE(long long, 128), + BENCHMARK_TYPE(long long, 192), BENCHMARK_TYPE(long long, 256), + BENCHMARK_TYPE(long long, 320), BENCHMARK_TYPE(long long, 512), + + BENCHMARK_TYPE(custom_int_t, 64), BENCHMARK_TYPE(custom_int_t, 128), + BENCHMARK_TYPE(custom_int_t, 192), BENCHMARK_TYPE(custom_int_t, 256), + BENCHMARK_TYPE(custom_int_t, 320), BENCHMARK_TYPE(custom_int_t, 512), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_radix_sort" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - // clang-format off +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_radix_sort" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + // clang-format off add_benchmarks( benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, size); add_benchmarks( @@ -330,25 +284,22 @@ int main(int argc, char *argv[]) benchmark_kinds::sort_keys, "sort_to_striped(keys)", benchmarks, stream, size); add_benchmarks( benchmark_kinds::sort_pairs, "sort_to_striped(keys, values)", benchmarks, stream, size); - // clang-format on - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } + // clang-format on + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp index c3364a20..72b9ed7c 100644 --- a/benchmark/benchmark_block_reduce.cpp +++ b/benchmark/benchmark_block_reduce.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -26,218 +26,171 @@ #include "hipcub/block/block_reduce.hpp" #include "hipcub/thread/thread_operators.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials -> -__global__ -__launch_bounds__(BlockSize) -void kernel(const T* input, T* output) -{ - Runner::template run(input, output); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T *input, T *output) { + Runner::template run(input, output); } -template -struct reduce -{ - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T* input, T* output) - { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - T values[ItemsPerThread]; - T reduced_value; - for(unsigned int k = 0; k < ItemsPerThread; k++) - { - values[k] = input[i * ItemsPerThread + k]; - } - - using breduce_t = hipcub::BlockReduce; - __shared__ typename breduce_t::TempStorage storage; - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - reduced_value = breduce_t(storage).Reduce(values, hipcub::Sum()); - values[0] = reduced_value; - } - - if(hipThreadIdx_x == 0) - { - output[hipBlockIdx_x] = reduced_value; - } +template struct reduce { + template + __device__ static void run(const T *input, T *output) { + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + T values[ItemsPerThread]; + T reduced_value; + for (unsigned int k = 0; k < ItemsPerThread; k++) { + values[k] = input[i * ItemsPerThread + k]; + } + + using breduce_t = hipcub::BlockReduce; + __shared__ typename breduce_t::TempStorage storage; + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + reduced_value = breduce_t(storage).Reduce(values, hipcub::Sum()); + values[0] = reduced_value; } + + if (hipThreadIdx_x == 0) { + output[hipBlockIdx_x] = reduced_value; + } + } }; -template< - class Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials = 100 -> -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -{ - // Make sure size is a multiple of BlockSize - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); - // Allocate and fill memory - std::vector input(size, T(1)); - T * d_input; - T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { + // Make sure size is a multiple of BlockSize + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + // Allocate and fill memory + std::vector input(size, T(1)); + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel), + dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, + d_output); + HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); - state.SetItemsProcessed(state.iterations() * size * Trials); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); + state.SetItemsProcessed(state.iterations() * size * Trials); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - std::string("block_reduce.method_name:" + method_name \ - ).c_str(), \ - &run_benchmark, \ - stream, size \ - ) - -#define BENCHMARK_TYPE(type, block) \ - CREATE_BENCHMARK(type, block, 1), \ - CREATE_BENCHMARK(type, block, 2), \ - CREATE_BENCHMARK(type, block, 3), \ - CREATE_BENCHMARK(type, block, 4), \ - CREATE_BENCHMARK(type, block, 8), \ - CREATE_BENCHMARK(type, block, 11), \ - CREATE_BENCHMARK(type, block, 16) - -template -void add_benchmarks(std::vector& benchmarks, - const std::string& method_name, - const std::string& algorithm_name, - hipStream_t stream, - size_t size) -{ - - std::vector new_benchmarks = - { - // When block size is less than or equal to warp size - BENCHMARK_TYPE(int, 64), - BENCHMARK_TYPE(float, 64), - BENCHMARK_TYPE(double, 64), - BENCHMARK_TYPE(int8_t, 64), - BENCHMARK_TYPE(uint8_t, 64), - - BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(float, 256), - BENCHMARK_TYPE(double, 256), - BENCHMARK_TYPE(int8_t, 256), - BENCHMARK_TYPE(uint8_t, 256), - }; - benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + std::string("block_reduce.method_name:" + method_name) \ + .c_str(), \ + &run_benchmark, stream, size) + +#define BENCHMARK_TYPE(type, block) \ + CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ + CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ + CREATE_BENCHMARK(type, block, 8), CREATE_BENCHMARK(type, block, 11), \ + CREATE_BENCHMARK(type, block, 16) + +template +void add_benchmarks(std::vector &benchmarks, + const std::string &method_name, + const std::string &algorithm_name, hipStream_t stream, + size_t size) { + + std::vector new_benchmarks = { + // When block size is less than or equal to warp size + BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(float, 64), + BENCHMARK_TYPE(double, 64), BENCHMARK_TYPE(int8_t, 64), + BENCHMARK_TYPE(uint8_t, 64), + + BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(float, 256), + BENCHMARK_TYPE(double, 256), BENCHMARK_TYPE(int8_t, 256), + BENCHMARK_TYPE(uint8_t, 256), + }; + benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), + new_benchmarks.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_reduce" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - // using_warp_scan - using reduce_uwr_t = reduce; - add_benchmarks( - benchmarks, "reduce", "BLOCK_REDUCE_WARP_REDUCTIONS", stream, size - ); - // raking reduce - using reduce_rr_t = reduce; - add_benchmarks( - benchmarks, "reduce", "BLOCK_REDUCE_RAKING", stream, size - ); - // raking reduce commutative only - using reduce_rrco_t = reduce; - add_benchmarks( - benchmarks, "reduce", "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY", stream, size - ); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_reduce" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + // using_warp_scan + using reduce_uwr_t = + reduce; + add_benchmarks(benchmarks, "reduce", + "BLOCK_REDUCE_WARP_REDUCTIONS", stream, size); + // raking reduce + using reduce_rr_t = reduce; + add_benchmarks(benchmarks, "reduce", "BLOCK_REDUCE_RAKING", + stream, size); + // raking reduce commutative only + using reduce_rrco_t = reduce< + hipcub::BlockReduceAlgorithm::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY>; + add_benchmarks(benchmarks, "reduce", + "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY", stream, + size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp index 7a854fe4..9f60a522 100644 --- a/benchmark/benchmark_block_run_length_decode.cpp +++ b/benchmark/benchmark_block_run_length_decode.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -30,233 +30,180 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template< - class ItemT, - class OffsetT, - unsigned BlockSize, - unsigned RunsPerThread, - unsigned DecodedItemsPerThread, - unsigned Trials -> -__global__ -__launch_bounds__(BlockSize) -void block_run_length_decode_kernel( - const ItemT * d_run_items, - const OffsetT * d_run_offsets, - ItemT * d_decoded_items, - bool enable_store = false) -{ - using BlockRunLengthDecodeT = hipcub::BlockRunLengthDecode< - ItemT, - BlockSize, - RunsPerThread, - DecodedItemsPerThread - >; - - ItemT run_items[RunsPerThread]; - OffsetT run_offsets[RunsPerThread]; - - const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; - hipcub::LoadDirectBlocked(global_thread_idx, d_run_items, run_items); - hipcub::LoadDirectBlocked(global_thread_idx, d_run_offsets, run_offsets); - - BlockRunLengthDecodeT block_run_length_decode( - run_items, - run_offsets - ); - - const OffsetT total_decoded_size = - d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] - - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; - - #pragma nounroll - for (unsigned i = 0; i < Trials; ++i) - { - OffsetT decoded_window_offset = 0; - while (decoded_window_offset < total_decoded_size) - { - ItemT decoded_items[DecodedItemsPerThread]; - block_run_length_decode.RunLengthDecode(decoded_items, decoded_window_offset); - - if (enable_store) - { - hipcub::StoreDirectBlocked(global_thread_idx, d_decoded_items + decoded_window_offset, decoded_items); - } - - decoded_window_offset += BlockSize * DecodedItemsPerThread; - } +template +__global__ __launch_bounds__(BlockSize) void block_run_length_decode_kernel( + const ItemT *d_run_items, const OffsetT *d_run_offsets, + ItemT *d_decoded_items, bool enable_store = false) { + using BlockRunLengthDecodeT = + hipcub::BlockRunLengthDecode; + + ItemT run_items[RunsPerThread]; + OffsetT run_offsets[RunsPerThread]; + + const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; + hipcub::LoadDirectBlocked(global_thread_idx, d_run_items, run_items); + hipcub::LoadDirectBlocked(global_thread_idx, d_run_offsets, run_offsets); + + BlockRunLengthDecodeT block_run_length_decode(run_items, run_offsets); + + const OffsetT total_decoded_size = + d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] - + d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; + +#pragma nounroll + for (unsigned i = 0; i < Trials; ++i) { + OffsetT decoded_window_offset = 0; + while (decoded_window_offset < total_decoded_size) { + ItemT decoded_items[DecodedItemsPerThread]; + block_run_length_decode.RunLengthDecode(decoded_items, + decoded_window_offset); + + if (enable_store) { + hipcub::StoreDirectBlocked(global_thread_idx, + d_decoded_items + decoded_window_offset, + decoded_items); + } + + decoded_window_offset += BlockSize * DecodedItemsPerThread; } + } } -template< - class ItemT, - class OffsetT, - unsigned MinRunLength, - unsigned MaxRunLength, - unsigned BlockSize, - unsigned RunsPerThread, - unsigned DecodedItemsPerThread, - unsigned Trials = 100 -> -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -{ - constexpr auto runs_per_block = BlockSize * RunsPerThread; - const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength); - const auto num_runs = runs_per_block * ((target_num_runs + runs_per_block - 1)/runs_per_block); - - std::vector run_items(num_runs); - std::vector run_offsets(num_runs + 1); - - std::default_random_engine prng(std::random_device{}()); - using ItemDistribution = std::conditional_t< - std::is_integral::value, - std::uniform_int_distribution, - std::uniform_real_distribution - >; - ItemDistribution run_item_dist(0, 100); - std::uniform_int_distribution run_length_dist(MinRunLength, MaxRunLength); - - for (size_t i = 0; i < num_runs; ++i) - { - run_items[i] = run_item_dist(prng); - } - for (size_t i = 1; i < num_runs + 1; ++i) - { - const OffsetT next_run_length = run_length_dist(prng); - run_offsets[i] = run_offsets[i - 1] + next_run_length; - } - const OffsetT output_length = run_offsets.back(); - - ItemT * d_run_items{}; - HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT))); - HIP_CHECK( - hipMemcpy( - d_run_items, run_items.data(), - run_items.size() * sizeof(ItemT), - hipMemcpyHostToDevice - ) - ); - - OffsetT * d_run_offsets{}; - HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT))); - HIP_CHECK( - hipMemcpy( - d_run_offsets, run_offsets.data(), - run_offsets.size() * sizeof(OffsetT), - hipMemcpyHostToDevice - ) - ); - - ItemT * d_output{}; - HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT))); - - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - block_run_length_decode_kernel< - ItemT, - OffsetT, - BlockSize, - RunsPerThread, - DecodedItemsPerThread, - Trials - > - ), - dim3(num_runs/runs_per_block), dim3(BlockSize), 0, stream, - d_run_items, d_run_offsets, d_output - ); - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * output_length * sizeof(ItemT) * Trials); - state.SetItemsProcessed(state.iterations() * output_length * Trials); - - HIP_CHECK(hipFree(d_run_items)); - HIP_CHECK(hipFree(d_run_offsets)); - HIP_CHECK(hipFree(d_output)); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { + constexpr auto runs_per_block = BlockSize * RunsPerThread; + const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength); + const auto num_runs = + runs_per_block * + ((target_num_runs + runs_per_block - 1) / runs_per_block); + + std::vector run_items(num_runs); + std::vector run_offsets(num_runs + 1); + + std::default_random_engine prng(std::random_device{}()); + using ItemDistribution = + std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution>; + ItemDistribution run_item_dist(0, 100); + std::uniform_int_distribution run_length_dist(MinRunLength, + MaxRunLength); + + for (size_t i = 0; i < num_runs; ++i) { + run_items[i] = run_item_dist(prng); + } + for (size_t i = 1; i < num_runs + 1; ++i) { + const OffsetT next_run_length = run_length_dist(prng); + run_offsets[i] = run_offsets[i - 1] + next_run_length; + } + const OffsetT output_length = run_offsets.back(); + + ItemT *d_run_items{}; + HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT))); + HIP_CHECK(hipMemcpy(d_run_items, run_items.data(), + run_items.size() * sizeof(ItemT), hipMemcpyHostToDevice)); + + OffsetT *d_run_offsets{}; + HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT))); + HIP_CHECK(hipMemcpy(d_run_offsets, run_offsets.data(), + run_offsets.size() * sizeof(OffsetT), + hipMemcpyHostToDevice)); + + ItemT *d_output{}; + HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT))); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(block_run_length_decode_kernel< + ItemT, OffsetT, BlockSize, RunsPerThread, + DecodedItemsPerThread, Trials>), + dim3(num_runs / runs_per_block), dim3(BlockSize), 0, + stream, d_run_items, d_run_offsets, d_output); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * output_length * sizeof(ItemT) * + Trials); + state.SetItemsProcessed(state.iterations() * output_length * Trials); + + HIP_CHECK(hipFree(d_run_items)); + HIP_CHECK(hipFree(d_run_offsets)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ - benchmark::RegisterBenchmark( \ - std::string("block_run_length_decode." \ - ).c_str(), \ - &run_benchmark, \ - stream, size \ - ) - -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_run_length_decode" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks - { - CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4), - CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4), - CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4), - CREATE_BENCHMARK(int, int, 1, 100, 128, 2, 4), - CREATE_BENCHMARK(int, int, 1, 500, 128, 2, 4), - CREATE_BENCHMARK(int, int, 1, 1000, 128, 2, 4), - CREATE_BENCHMARK(int, int, 1, 5000, 128, 2, 4), - - CREATE_BENCHMARK(double, long long, 1, 5, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 10, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 50, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4) - }; - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ + benchmark::RegisterBenchmark( \ + std::string("block_run_length_decode.") \ + .c_str(), \ + &run_benchmark, stream, size) + +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_run_length_decode" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks{ + CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 100, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 500, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 1000, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 5000, 128, 2, 4), + + CREATE_BENCHMARK(double, long long, 1, 5, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 10, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 50, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4)}; + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp index 38097f17..28c0264c 100644 --- a/benchmark/benchmark_block_scan.cpp +++ b/benchmark/benchmark_block_scan.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,146 +25,118 @@ // hipCUB API #include "hipcub/block/block_scan.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output, const T init) -{ - Runner::template run(input, output, init); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T *input, T *output, + const T init) { + Runner::template run(input, output, + init); } -template -struct inclusive_scan -{ - template - __device__ static void run(const T* input, T* output, const T init) - { - (void)init; - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - T values[ItemsPerThread]; - for(unsigned int k = 0; k < ItemsPerThread; k++) - { - values[k] = input[i * ItemsPerThread + k]; - } - - using bscan_t = hipcub::BlockScan; - __shared__ typename bscan_t::TempStorage storage; - - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - bscan_t(storage).InclusiveScan(values, values, hipcub::Sum()); - } - - for(unsigned int k = 0; k < ItemsPerThread; k++) - { - output[i * ItemsPerThread + k] = values[k]; - } +template struct inclusive_scan { + template + __device__ static void run(const T *input, T *output, const T init) { + (void)init; + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + T values[ItemsPerThread]; + for (unsigned int k = 0; k < ItemsPerThread; k++) { + values[k] = input[i * ItemsPerThread + k]; + } + + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage storage; + +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + bscan_t(storage).InclusiveScan(values, values, hipcub::Sum()); + } + + for (unsigned int k = 0; k < ItemsPerThread; k++) { + output[i * ItemsPerThread + k] = values[k]; } + } }; -template -struct exclusive_scan -{ - template - __device__ static void run(const T* input, T* output, const T init) - { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; +template struct exclusive_scan { + template + __device__ static void run(const T *input, T *output, const T init) { + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - T values[ItemsPerThread]; - for(unsigned int k = 0; k < ItemsPerThread; k++) - { - values[k] = input[i * ItemsPerThread + k]; - } + T values[ItemsPerThread]; + for (unsigned int k = 0; k < ItemsPerThread; k++) { + values[k] = input[i * ItemsPerThread + k]; + } - using bscan_t = hipcub::BlockScan; - __shared__ typename bscan_t::TempStorage storage; + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage storage; #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - bscan_t(storage).ExclusiveScan(values, values, init, hipcub::Sum()); - } - - for(unsigned int k = 0; k < ItemsPerThread; k++) - { - output[i * ItemsPerThread + k] = values[k]; - } + for (unsigned int trial = 0; trial < Trials; trial++) { + bscan_t(storage).ExclusiveScan(values, values, init, hipcub::Sum()); + } + + for (unsigned int k = 0; k < ItemsPerThread; k++) { + output[i * ItemsPerThread + k] = values[k]; } + } }; -template -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -{ - // Make sure size is a multiple of BlockSize - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); - // Allocate and fill memory - std::vector input(size, T(1)); - T * d_input; - T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { + // Make sure size is a multiple of BlockSize + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + // Allocate and fill memory + std::vector input(size, T(1)); + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel), + dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, + d_output, input[0]); + HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), - dim3(size / items_per_block), - dim3(BlockSize), - 0, - stream, - d_input, - d_output, - input[0]); - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); - state.SetItemsProcessed(state.iterations() * size * Trials); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); + state.SetItemsProcessed(state.iterations() * size * Trials); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_scan.method_name:") + method_name \ - ).c_str(), \ - &run_benchmark, \ - stream, size \ - ) +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + (std::string("block_scan.method_name:") + \ + method_name) \ + .c_str(), \ + &run_benchmark, stream, size) // clang-format off #define BENCHMARK_TYPE(type, block) \ @@ -176,64 +148,62 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) CREATE_BENCHMARK(type, block, 16) // clang-format on -template -void add_benchmarks(std::vector& benchmarks, - const std::string& method_name, - const std::string& algorithm_name, - hipStream_t stream, - size_t size) -{ - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - std::vector new_benchmarks = { - // When block size is less than or equal to warp size - BENCHMARK_TYPE(int, 64), - BENCHMARK_TYPE(float, 64), - BENCHMARK_TYPE(double, 64), - BENCHMARK_TYPE(uint8_t, 64), - - BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(float, 256), - BENCHMARK_TYPE(double, 256), - BENCHMARK_TYPE(uint8_t, 256), - - CREATE_BENCHMARK(custom_float2, 256, 1), - CREATE_BENCHMARK(custom_float2, 256, 4), - CREATE_BENCHMARK(custom_float2, 256, 8), - - CREATE_BENCHMARK(custom_double2, 256, 1), - CREATE_BENCHMARK(custom_double2, 256, 4), - CREATE_BENCHMARK(custom_double2, 256, 8), - }; - benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); +template +void add_benchmarks(std::vector &benchmarks, + const std::string &method_name, + const std::string &algorithm_name, hipStream_t stream, + size_t size) { + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + std::vector new_benchmarks = { + // When block size is less than or equal to warp size + BENCHMARK_TYPE(int, 64), + BENCHMARK_TYPE(float, 64), + BENCHMARK_TYPE(double, 64), + BENCHMARK_TYPE(uint8_t, 64), + + BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(float, 256), + BENCHMARK_TYPE(double, 256), + BENCHMARK_TYPE(uint8_t, 256), + + CREATE_BENCHMARK(custom_float2, 256, 1), + CREATE_BENCHMARK(custom_float2, 256, 4), + CREATE_BENCHMARK(custom_float2, 256, 8), + + CREATE_BENCHMARK(custom_double2, 256, 1), + CREATE_BENCHMARK(custom_double2, 256, 4), + CREATE_BENCHMARK(custom_double2, 256, 8), + }; + benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), + new_benchmarks.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_scan" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - // clang-format off +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_scan" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + // clang-format off add_benchmarks>( benchmarks, "inclusive_scan", "BLOCK_SCAN_RAKING", stream, size); add_benchmarks>( @@ -246,25 +216,22 @@ int main(int argc, char *argv[]) benchmarks, "exclusive_scan", "BLOCK_SCAN_RAKING_MEMOIZE", stream, size); add_benchmarks>( benchmarks, "exclusive_scan", "BLOCK_SCAN_WARP_SCANS", stream, size); - // clang-format on - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } + // clang-format on + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_shuffle.cpp b/benchmark/benchmark_block_shuffle.cpp index 0d7289fb..e7f4fd13 100644 --- a/benchmark/benchmark_block_shuffle.cpp +++ b/benchmark/benchmark_block_shuffle.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -28,320 +28,270 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) -{ - Runner::template run(input, output); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T *input, T *output) { + Runner::template run(input, output); } -struct offset -{ - template - __device__ static void run(const T* input, T* output) - { - const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; +struct offset { + template + __device__ static void run(const T *input, T *output) { + const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; - T value = input[tid]; + T value = input[tid]; - using bshuffle_t = hipcub::BlockShuffle; - __shared__ typename bshuffle_t::TempStorage storage; + using bshuffle_t = hipcub::BlockShuffle; + __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - bshuffle_t(storage).Offset(value, value, 1); + for (unsigned int trial = 0; trial < Trials; trial++) { + bshuffle_t(storage).Offset(value, value, 1); - // sync is required because of loop since - // temporary storage is accessed next iteration - __syncthreads(); - } - - output[tid] = value; + // sync is required because of loop since + // temporary storage is accessed next iteration + __syncthreads(); } - static constexpr bool uses_ipt = false; + output[tid] = value; + } + + static constexpr bool uses_ipt = false; }; -struct rotate -{ - template - __device__ static void run(const T* input, T* output) - { - const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; +struct rotate { + template + __device__ static void run(const T *input, T *output) { + const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; - T value = input[tid]; + T value = input[tid]; - using bshuffle_t = hipcub::BlockShuffle; - __shared__ typename bshuffle_t::TempStorage storage; + using bshuffle_t = hipcub::BlockShuffle; + __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - bshuffle_t(storage).Rotate(value, value, 1); + for (unsigned int trial = 0; trial < Trials; trial++) { + bshuffle_t(storage).Rotate(value, value, 1); - // sync is required because of loop since - // temporary storage is accessed next iteration - __syncthreads(); - } - - output[tid] = value; + // sync is required because of loop since + // temporary storage is accessed next iteration + __syncthreads(); } - static constexpr bool uses_ipt = false; + output[tid] = value; + } + + static constexpr bool uses_ipt = false; }; -struct up -{ - template - __device__ static void run(const T* input, T* output) - { - const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; +struct up { + template + __device__ static void run(const T *input, T *output) { + const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; - T values[ItemsPerThread]; - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - values[i] = input[ItemsPerThread * tid + i]; - } + T values[ItemsPerThread]; + for (unsigned int i = 0; i < ItemsPerThread; i++) { + values[i] = input[ItemsPerThread * tid + i]; + } - using bshuffle_t = hipcub::BlockShuffle; - __shared__ typename bshuffle_t::TempStorage storage; + using bshuffle_t = hipcub::BlockShuffle; + __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - bshuffle_t(storage).Up(values, values); - - // sync is required because of loop since - // temporary storage is accessed next iteration - __syncthreads(); - } - - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - output[ItemsPerThread * tid + i] = values[i]; - } + for (unsigned int trial = 0; trial < Trials; trial++) { + bshuffle_t(storage).Up(values, values); + + // sync is required because of loop since + // temporary storage is accessed next iteration + __syncthreads(); + } + + for (unsigned int i = 0; i < ItemsPerThread; i++) { + output[ItemsPerThread * tid + i] = values[i]; } + } - static constexpr bool uses_ipt = true; + static constexpr bool uses_ipt = true; }; -struct down -{ - template - __device__ static void run(const T* input, T* output) - { - const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; +struct down { + template + __device__ static void run(const T *input, T *output) { + const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; - T values[ItemsPerThread]; - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - values[i] = input[ItemsPerThread * tid + i]; - } + T values[ItemsPerThread]; + for (unsigned int i = 0; i < ItemsPerThread; i++) { + values[i] = input[ItemsPerThread * tid + i]; + } - using bshuffle_t = hipcub::BlockShuffle; - __shared__ typename bshuffle_t::TempStorage storage; + using bshuffle_t = hipcub::BlockShuffle; + __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - bshuffle_t(storage).Down(values, values); - - // sync is required because of loop since - // temporary storage is accessed next iteration - __syncthreads(); - } - - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - output[ItemsPerThread * tid + i] = values[i]; - } + for (unsigned int trial = 0; trial < Trials; trial++) { + bshuffle_t(storage).Down(values, values); + + // sync is required because of loop since + // temporary storage is accessed next iteration + __syncthreads(); + } + + for (unsigned int i = 0; i < ItemsPerThread; i++) { + output[ItemsPerThread * tid + i] = values[i]; } + } - static constexpr bool uses_ipt = true; + static constexpr bool uses_ipt = true; }; -template -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -{ - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); - - std::vector input(size, T(1)); - T* d_input; - T* d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input(size, T(1)); + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel), + dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, + d_output); + HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), - dim3(size / items_per_block), - dim3(BlockSize), - 0, - stream, - d_input, - d_output); - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds - = std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK_IPT(BS, IPT) \ - benchmark::RegisterBenchmark( \ - ("block_shuffle.sub_algorithm_name:" \ - + name \ - ).c_str(), \ - &run_benchmark, \ - stream, \ - size \ - ) - -#define CREATE_BENCHMARK(BS) \ - benchmark::RegisterBenchmark( \ - ("block_shuffle.sub_algorithm_name:" \ - + name \ - ).c_str(), \ - &run_benchmark, \ - stream, \ - size \ - ) - -template = true> -void add_benchmarks_type(const std::string& name, - std::vector& benchmarks, - hipStream_t stream, - size_t size, - const std::string& type_name) -{ - std::vector bs = { - CREATE_BENCHMARK_IPT(256, 1), - CREATE_BENCHMARK_IPT(256, 3), - CREATE_BENCHMARK_IPT(256, 4), - CREATE_BENCHMARK_IPT(256, 8), - CREATE_BENCHMARK_IPT(256, 16), - CREATE_BENCHMARK_IPT(256, 32), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_BENCHMARK_IPT(BS, IPT) \ + benchmark::RegisterBenchmark( \ + ("block_shuffle.sub_algorithm_name:" + \ + name) \ + .c_str(), \ + &run_benchmark, stream, size) + +#define CREATE_BENCHMARK(BS) \ + benchmark::RegisterBenchmark( \ + ("block_shuffle.sub_algorithm_name:" + name) \ + .c_str(), \ + &run_benchmark, stream, size) + +template = true> +void add_benchmarks_type( + const std::string &name, + std::vector &benchmarks, + hipStream_t stream, size_t size, const std::string &type_name) { + std::vector bs = { + CREATE_BENCHMARK_IPT(256, 1), CREATE_BENCHMARK_IPT(256, 3), + CREATE_BENCHMARK_IPT(256, 4), CREATE_BENCHMARK_IPT(256, 8), + CREATE_BENCHMARK_IPT(256, 16), CREATE_BENCHMARK_IPT(256, 32), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -template = true> -void add_benchmarks_type(const std::string& name, - std::vector& benchmarks, - hipStream_t stream, - size_t size, - const std::string& type_name) -{ - std::vector bs = { - CREATE_BENCHMARK(256), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +template = true> +void add_benchmarks_type( + const std::string &name, + std::vector &benchmarks, + hipStream_t stream, size_t size, const std::string &type_name) { + std::vector bs = { + CREATE_BENCHMARK(256), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_BENCHMARKS(T) add_benchmarks_type(name, benchmarks, stream, size, #T) - -template -void add_benchmarks(const std::string& name, - std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - CREATE_BENCHMARKS(int); - CREATE_BENCHMARKS(float); - CREATE_BENCHMARKS(double); - CREATE_BENCHMARKS(int8_t); - CREATE_BENCHMARKS(long long); - CREATE_BENCHMARKS(custom_float2); - CREATE_BENCHMARKS(custom_double2); +#define CREATE_BENCHMARKS(T) \ + add_benchmarks_type(name, benchmarks, stream, size, #T) + +template +void add_benchmarks(const std::string &name, + std::vector &benchmarks, + hipStream_t stream, size_t size) { + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + CREATE_BENCHMARKS(int); + CREATE_BENCHMARKS(float); + CREATE_BENCHMARKS(double); + CREATE_BENCHMARKS(int8_t); + CREATE_BENCHMARKS(long long); + CREATE_BENCHMARKS(custom_float2); + CREATE_BENCHMARKS(custom_double2); } -int main(int argc, char* argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_shuffle" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks("offset", benchmarks, stream, size); - add_benchmarks("rotate", benchmarks, stream, size); - add_benchmarks("up", benchmarks, stream, size); - add_benchmarks("down", benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_shuffle" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks("offset", benchmarks, stream, size); + add_benchmarks("rotate", benchmarks, stream, size); + add_benchmarks("up", benchmarks, stream, size); + add_benchmarks("down", benchmarks, stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_adjacent_difference.cpp b/benchmark/benchmark_device_adjacent_difference.cpp index 7ac57c0b..9d1a087f 100644 --- a/benchmark/benchmark_device_adjacent_difference.cpp +++ b/benchmark/benchmark_device_adjacent_difference.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -39,159 +39,138 @@ #include #include -namespace -{ +namespace { #ifndef DEFAULT_N constexpr std::size_t DEFAULT_N = 1024 * 1024 * 128; #endif -constexpr unsigned int batch_size = 10; +constexpr unsigned int batch_size = 10; constexpr unsigned int warmup_size = 5; template auto dispatch_adjacent_difference(std::true_type /*left*/, std::true_type /*copy*/, - void* const temporary_storage, - std::size_t& storage_size, - const InputIt input, - const OutputIt output, - Args&&... args) -{ - return ::hipcub::DeviceAdjacentDifference::SubtractLeftCopy( - temporary_storage, storage_size, input, output, std::forward(args)...); + void *const temporary_storage, + std::size_t &storage_size, + const InputIt input, const OutputIt output, + Args &&... args) { + return ::hipcub::DeviceAdjacentDifference::SubtractLeftCopy( + temporary_storage, storage_size, input, output, + std::forward(args)...); } template auto dispatch_adjacent_difference(std::false_type /*left*/, std::true_type /*copy*/, - void* const temporary_storage, - std::size_t& storage_size, - const InputIt input, - const OutputIt output, - Args&&... args) -{ - return ::hipcub::DeviceAdjacentDifference::SubtractRightCopy( - temporary_storage, storage_size, input, output, std::forward(args)...); + void *const temporary_storage, + std::size_t &storage_size, + const InputIt input, const OutputIt output, + Args &&... args) { + return ::hipcub::DeviceAdjacentDifference::SubtractRightCopy( + temporary_storage, storage_size, input, output, + std::forward(args)...); } template auto dispatch_adjacent_difference(std::true_type /*left*/, std::false_type /*copy*/, - void* const temporary_storage, - std::size_t& storage_size, + void *const temporary_storage, + std::size_t &storage_size, const InputIt input, - const OutputIt /*output*/, - Args&&... args) -{ - return ::hipcub::DeviceAdjacentDifference::SubtractLeft( - temporary_storage, storage_size, input, std::forward(args)...); + const OutputIt /*output*/, Args &&... args) { + return ::hipcub::DeviceAdjacentDifference::SubtractLeft( + temporary_storage, storage_size, input, std::forward(args)...); } template auto dispatch_adjacent_difference(std::false_type /*left*/, std::false_type /*copy*/, - void* const temporary_storage, - std::size_t& storage_size, + void *const temporary_storage, + std::size_t &storage_size, const InputIt input, - const OutputIt /*output*/, - Args&&... args) -{ - return ::hipcub::DeviceAdjacentDifference::SubtractRight( - temporary_storage, storage_size, input, std::forward(args)...); + const OutputIt /*output*/, Args &&... args) { + return ::hipcub::DeviceAdjacentDifference::SubtractRight( + temporary_storage, storage_size, input, std::forward(args)...); } template -void run_benchmark(benchmark::State& state, const std::size_t size, const hipStream_t stream) -{ - using output_type = T; - - // Generate data - const std::vector input = benchmark_utils::get_random_data(size, 1, 100); - - T* d_input; - output_type* d_output = nullptr; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); - HIP_CHECK( - hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); - - if(copy) - { - HIP_CHECK(hipMalloc(&d_output, size * sizeof(output_type))); - } - - static constexpr std::integral_constant left_tag; - static constexpr std::integral_constant copy_tag; - - // Allocate temporary storage - std::size_t temp_storage_size{}; - void* d_temp_storage = nullptr; - - const auto launch = [&] - { - return dispatch_adjacent_difference(left_tag, - copy_tag, - d_temp_storage, - temp_storage_size, - d_input, - d_output, - size, - hipcub::Sum{}, - stream); - }; +void run_benchmark(benchmark::State &state, const std::size_t size, + const hipStream_t stream) { + using output_type = T; + + // Generate data + const std::vector input = + benchmark_utils::get_random_data(size, 1, 100); + + T *d_input; + output_type *d_output = nullptr; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), + hipMemcpyHostToDevice)); + + if (copy) { + HIP_CHECK(hipMalloc(&d_output, size * sizeof(output_type))); + } + + static constexpr std::integral_constant left_tag; + static constexpr std::integral_constant copy_tag; + + // Allocate temporary storage + std::size_t temp_storage_size{}; + void *d_temp_storage = nullptr; + + const auto launch = [&] { + return dispatch_adjacent_difference(left_tag, copy_tag, d_temp_storage, + temp_storage_size, d_input, d_output, + size, hipcub::Sum{}, stream); + }; + HIP_CHECK(launch()); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size)); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { HIP_CHECK(launch()); - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size)); + } + HIP_CHECK(hipDeviceSynchronize()); - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(launch()); - } - HIP_CHECK(hipDeviceSynchronize()); - - // Run - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(launch()); - } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds - = std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); + // Run + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); - hipFree(d_input); - if(copy) - { - hipFree(d_output); + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(launch()); } - hipFree(d_temp_storage); + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + hipFree(d_input); + if (copy) { + hipFree(d_output); + } + hipFree(d_temp_storage); } } // namespace using namespace std::string_literals; -#define CREATE_BENCHMARK(T, left, copy) \ - benchmark::RegisterBenchmark( \ - std::string("device_adjacent_difference" \ - "." \ - "sub_algorithm_name:Subtract" \ - + std::string(left ? "Left" : "Right") \ - + std::string(copy ? "Copy" : "") \ - ).c_str(), \ - &run_benchmark, \ - size, \ - stream \ - ) +#define CREATE_BENCHMARK(T, left, copy) \ + benchmark::RegisterBenchmark( \ + std::string("device_adjacent_difference" \ + "." \ + "sub_algorithm_name:Subtract" + \ + std::string(left ? "Left" : "Right") + \ + std::string(copy ? "Copy" : "")) \ + .c_str(), \ + &run_benchmark, size, stream) // clang-format off #define CREATE_BENCHMARKS(T) \ @@ -201,63 +180,56 @@ using namespace std::string_literals; CREATE_BENCHMARK(T, false, true) // clang-format on -int main(int argc, char* argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP - const hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_device_adjacent_difference" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - // Add benchmarks - const std::vector benchmarks = { - CREATE_BENCHMARKS(int), - CREATE_BENCHMARKS(std::int64_t), - - CREATE_BENCHMARKS(uint8_t), - - CREATE_BENCHMARKS(float), - CREATE_BENCHMARKS(double), - - CREATE_BENCHMARKS(custom_float2), - CREATE_BENCHMARKS(custom_double2), - }; - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + // HIP + const hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_adjacent_difference" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + // Add benchmarks + const std::vector benchmarks = { + CREATE_BENCHMARKS(int), CREATE_BENCHMARKS(std::int64_t), + + CREATE_BENCHMARKS(uint8_t), + + CREATE_BENCHMARKS(float), CREATE_BENCHMARKS(double), + + CREATE_BENCHMARKS(custom_float2), CREATE_BENCHMARKS(custom_double2), + }; + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); - return 0; + return 0; } diff --git a/benchmark/benchmark_device_batch_copy.cpp b/benchmark/benchmark_device_batch_copy.cpp index 06d2e94e..a3c2bffa 100644 --- a/benchmark/benchmark_device_batch_copy.cpp +++ b/benchmark/benchmark_device_batch_copy.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -42,7 +42,7 @@ #include constexpr uint32_t warmup_size = 5; -constexpr int32_t max_size = 1024 * 1024; +constexpr int32_t max_size = 1024 * 1024; constexpr int32_t wlev_min_size = 128; constexpr int32_t blev_min_size = 1024; @@ -53,7 +53,8 @@ constexpr int32_t blev_min_size = 1024; // have source and destinations mappings not be the identity function: // // batch_copy( -// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, +// d!) // [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) // [3 , 2 , 1 , 2 ]) // size // @@ -68,338 +69,296 @@ constexpr int32_t blev_min_size = 1024; // ┌───┬───┬───┬───┬───┬───┬───┬───┐ // │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' // └───┴───┴───┴───┴───┴───┴───┴───┘ -template -std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) -{ - const auto n = input.size(); - assert(n > 0); - - std::vector result(n); - std::vector permute(n); - - std::iota(permute.begin(), permute.end(), 0); - std::shuffle(permute.begin(), permute.end(), rng); - - for(T i = 0, sum = 0; i < n; ++i) - { - result[permute[i]] = sum; - sum += input[permute[i]]; - } +template +std::vector shuffled_exclusive_scan(const std::vector &input, + RandomGenerator &rng) { + const auto n = input.size(); + assert(n > 0); - return result; -} + std::vector result(n); + std::vector permute(n); -using offset_type = size_t; + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); -template -struct BatchCopyData -{ - size_t total_num_elements = 0; - ValueType* d_input = nullptr; - ValueType* d_output = nullptr; - ValueType** d_buffer_srcs = nullptr; - ValueType** d_buffer_dsts = nullptr; - BufferSizeType* d_buffer_sizes = nullptr; - - BatchCopyData() = default; - BatchCopyData(const BatchCopyData&) = delete; - - BatchCopyData(BatchCopyData&& other) - : total_num_elements{std::exchange(other.total_num_elements, 0)} - , d_input{std::exchange(other.d_input, nullptr)} - , d_output{std::exchange(other.d_output, nullptr)} - , d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)} - , d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)} - , d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} - {} - - BatchCopyData& operator=(BatchCopyData&& other) - { - total_num_elements = std::exchange(other.total_num_elements, 0); - d_input = std::exchange(other.d_input, nullptr); - d_output = std::exchange(other.d_output, nullptr); - d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); - d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); - d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); - return *this; - }; - - BatchCopyData& operator=(const BatchCopyData&) = delete; - - size_t total_num_bytes() const - { - return total_num_elements * sizeof(ValueType); - } + for (T i = 0, sum = 0; i < n; ++i) { + result[permute[i]] = sum; + sum += input[permute[i]]; + } - ~BatchCopyData() - { - HIP_CHECK(hipFree(d_buffer_sizes)); - HIP_CHECK(hipFree(d_buffer_srcs)); - HIP_CHECK(hipFree(d_buffer_dsts)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_input)); - } -}; - -template -BatchCopyData prepare_data(const int32_t num_tlev_buffers = 1024, - const int32_t num_wlev_buffers = 1024, - const int32_t num_blev_buffers = 1024) -{ - const bool shuffle_buffers = false; - - BatchCopyData result; - const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; - - constexpr int32_t wlev_min_elems - = benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); - constexpr int32_t blev_min_elems - = benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); - constexpr int32_t max_elems = max_size / sizeof(ValueType); - - // Generate data - std::mt19937_64 rng(std::random_device{}()); - - // Number of elements in each buffer. - std::vector h_buffer_num_elements(num_buffers); - - auto iter = h_buffer_num_elements.begin(); - - iter = benchmark_utils::generate_random_data_n(iter, - num_tlev_buffers, - 1, - wlev_min_elems - 1, - rng); - iter = benchmark_utils::generate_random_data_n(iter, - num_wlev_buffers, - wlev_min_elems, - blev_min_elems - 1, - rng); - iter = benchmark_utils::generate_random_data_n(iter, - num_blev_buffers, - blev_min_elems, - max_elems, - rng); - - // Shuffle the sizes so that size classes aren't clustered - std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); - - result.total_num_elements - = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); - - // Generate data. - std::independent_bits_engine bits_engine{rng}; - - const size_t num_ints - = benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); - auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); - - std::for_each(reinterpret_cast(h_input.get()), - reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), - [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); - - HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); - HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); - - HIP_CHECK(hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType*))); - HIP_CHECK(hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType*))); - HIP_CHECK(hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); - - // Generate the source and shuffled destination offsets. - std::vector src_offsets; - std::vector dst_offsets; - - if(shuffle_buffers) - { - src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); - dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); - } else - { - src_offsets = std::vector(num_buffers); - dst_offsets = std::vector(num_buffers); - - // Consecutive offsets (no shuffling). - // src/dst offsets first element is 0, so skip that! - std::partial_sum(h_buffer_num_elements.begin(), - h_buffer_num_elements.end() - 1, - src_offsets.begin() + 1); - std::partial_sum(h_buffer_num_elements.begin(), - h_buffer_num_elements.end() - 1, - dst_offsets.begin() + 1); - } + return result; +} - // Generate the source and destination pointers. - std::vector h_buffer_srcs(num_buffers); - std::vector h_buffer_dsts(num_buffers); +using offset_type = size_t; - for(size_t i = 0; i < num_buffers; ++i) - { - h_buffer_srcs[i] = result.d_input + src_offsets[i]; - h_buffer_dsts[i] = result.d_output + dst_offsets[i]; - } +template struct BatchCopyData { + size_t total_num_elements = 0; + ValueType *d_input = nullptr; + ValueType *d_output = nullptr; + ValueType **d_buffer_srcs = nullptr; + ValueType **d_buffer_dsts = nullptr; + BufferSizeType *d_buffer_sizes = nullptr; + + BatchCopyData() = default; + BatchCopyData(const BatchCopyData &) = delete; + + BatchCopyData(BatchCopyData &&other) + : total_num_elements{std::exchange(other.total_num_elements, 0)}, + d_input{std::exchange(other.d_input, nullptr)}, d_output{std::exchange( + other.d_output, + nullptr)}, + d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)}, + d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)}, + d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} {} + + BatchCopyData &operator=(BatchCopyData &&other) { + total_num_elements = std::exchange(other.total_num_elements, 0); + d_input = std::exchange(other.d_input, nullptr); + d_output = std::exchange(other.d_output, nullptr); + d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); + d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); + d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); + return *this; + }; + + BatchCopyData &operator=(const BatchCopyData &) = delete; + + size_t total_num_bytes() const { + return total_num_elements * sizeof(ValueType); + } + + ~BatchCopyData() { + HIP_CHECK(hipFree(d_buffer_sizes)); + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + } +}; - // Prepare the batch copy. - HIP_CHECK( - hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(result.d_buffer_srcs, - h_buffer_srcs.data(), - h_buffer_srcs.size() * sizeof(ValueType*), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(result.d_buffer_dsts, - h_buffer_dsts.data(), - h_buffer_dsts.size() * sizeof(ValueType*), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(result.d_buffer_sizes, - h_buffer_num_elements.data(), - h_buffer_num_elements.size() * sizeof(BufferSizeType), - hipMemcpyHostToDevice)); - - return result; +template +BatchCopyData +prepare_data(const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) { + const bool shuffle_buffers = false; + + BatchCopyData result; + const size_t num_buffers = + num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + constexpr int32_t wlev_min_elems = + benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); + constexpr int32_t blev_min_elems = + benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); + constexpr int32_t max_elems = max_size / sizeof(ValueType); + + // Generate data + std::mt19937_64 rng(std::random_device{}()); + + // Number of elements in each buffer. + std::vector h_buffer_num_elements(num_buffers); + + auto iter = h_buffer_num_elements.begin(); + + iter = benchmark_utils::generate_random_data_n(iter, num_tlev_buffers, 1, + wlev_min_elems - 1, rng); + iter = benchmark_utils::generate_random_data_n( + iter, num_wlev_buffers, wlev_min_elems, blev_min_elems - 1, rng); + iter = benchmark_utils::generate_random_data_n( + iter, num_blev_buffers, blev_min_elems, max_elems, rng); + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + result.total_num_elements = std::accumulate( + h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); + + // Generate data. + std::independent_bits_engine bits_engine{rng}; + + const size_t num_ints = + benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); + auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); + + std::for_each( + reinterpret_cast(h_input.get()), + reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), + [&bits_engine](uint64_t &elem) { + ::new (&elem) uint64_t{bits_engine()}; + }); + + HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); + HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); + + HIP_CHECK( + hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType *))); + HIP_CHECK( + hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType *))); + HIP_CHECK( + hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); + + // Generate the source and shuffled destination offsets. + std::vector src_offsets; + std::vector dst_offsets; + + if (shuffle_buffers) { + src_offsets = + shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = + shuffled_exclusive_scan(h_buffer_num_elements, rng); + } else { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + // Consecutive offsets (no shuffling). + // src/dst offsets first element is 0, so skip that! + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, src_offsets.begin() + 1); + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, dst_offsets.begin() + 1); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for (size_t i = 0; i < num_buffers; ++i) { + h_buffer_srcs[i] = result.d_input + src_offsets[i]; + h_buffer_dsts[i] = result.d_output + dst_offsets[i]; + } + + // Prepare the batch copy. + HIP_CHECK(hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_srcs, h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(ValueType *), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_dsts, h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(ValueType *), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_sizes, h_buffer_num_elements.data(), + h_buffer_num_elements.size() * sizeof(BufferSizeType), + hipMemcpyHostToDevice)); + + return result; } -template -void run_benchmark(benchmark::State& state, - hipStream_t stream, - const int32_t num_tlev_buffers = 1024, - const int32_t num_wlev_buffers = 1024, - const int32_t num_blev_buffers = 1024) -{ - const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; - - size_t temp_storage_bytes = 0; - BatchCopyData data; - HIP_CHECK(hipcub::DeviceCopy::Batched(nullptr, - temp_storage_bytes, - data.d_buffer_srcs, - data.d_buffer_dsts, - data.d_buffer_sizes, - num_buffers)); - - void* d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); - - data = prepare_data(num_tlev_buffers, - num_wlev_buffers, - num_blev_buffers); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(hipcub::DeviceCopy::Batched(d_temp_storage, - temp_storage_bytes, - data.d_buffer_srcs, - data.d_buffer_dsts, - data.d_buffer_sizes, - num_buffers, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - // HIP events creation - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - - for(auto _ : state) - { - // Record start event - HIP_CHECK(hipEventRecord(start, stream)); - - HIP_CHECK(hipcub::DeviceCopy::Batched(d_temp_storage, - temp_storage_bytes, - data.d_buffer_srcs, - data.d_buffer_dsts, - data.d_buffer_sizes, - num_buffers, - stream)); - - // Record stop event and wait until it completes - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed_mseconds; - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); - state.SetIterationTime(elapsed_mseconds / 1000); - } - state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); - state.SetItemsProcessed(state.iterations() * data.total_num_elements); - - HIP_CHECK(hipFree(d_temp_storage)); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, + const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) { + const size_t num_buffers = + num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + size_t temp_storage_bytes = 0; + BatchCopyData data; + HIP_CHECK(hipcub::DeviceCopy::Batched(nullptr, temp_storage_bytes, + data.d_buffer_srcs, data.d_buffer_dsts, + data.d_buffer_sizes, num_buffers)); + + void *d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + data = prepare_data( + num_tlev_buffers, num_wlev_buffers, num_blev_buffers); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(hipcub::DeviceCopy::Batched( + d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, + data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for (auto _ : state) { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + HIP_CHECK(hipcub::DeviceCopy::Batched( + d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, + data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream)); + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); + state.SetItemsProcessed(state.iterations() * data.total_num_elements); + + HIP_CHECK(hipFree(d_temp_storage)); } #define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ - benchmark::RegisterBenchmark( \ - std::string("device_batch_copy" \ - "." \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_benchmark, \ - T>(state, stream, num_tlev, num_wlev, num_blev); \ - } \ - ) - -#define BENCHMARK_TYPE(item_size, item_alignment) \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) - -int32_t main(int32_t argc, char* argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", 1024, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.set_optional("name_format", - "name_format", - "human", - "either: json,human,txt"); - - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int32_t trials = parser.get("trials"); - - // HIP - hipStream_t stream = hipStreamDefault; // default - - // Benchmark info - benchmark::AddCustomContext("size", std::to_string(size)); - - // Add benchmarks - std::vector benchmarks; - - benchmarks = {BENCHMARK_TYPE(1, 1), - BENCHMARK_TYPE(1, 2), - BENCHMARK_TYPE(1, 4), - BENCHMARK_TYPE(1, 8), - BENCHMARK_TYPE(2, 2), - BENCHMARK_TYPE(4, 4), - BENCHMARK_TYPE(8, 8)}; - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } + benchmark::RegisterBenchmark( \ + std::string("device_batch_copy" \ + ".") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_benchmark, T>( \ + state, stream, num_tlev, num_wlev, num_blev); \ + }) + +#define BENCHMARK_TYPE(item_size, item_alignment) \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) + +int32_t main(int32_t argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", 1024, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", "name_format", "human", + "either: json,human,txt"); + + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int32_t trials = parser.get("trials"); + + // HIP + hipStream_t stream = hipStreamDefault; // default + + // Benchmark info + benchmark::AddCustomContext("size", std::to_string(size)); + + // Add benchmarks + std::vector benchmarks; + + benchmarks = {BENCHMARK_TYPE(1, 1), BENCHMARK_TYPE(1, 2), + BENCHMARK_TYPE(1, 4), BENCHMARK_TYPE(1, 8), + BENCHMARK_TYPE(2, 2), BENCHMARK_TYPE(4, 4), + BENCHMARK_TYPE(8, 8)}; + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp index 56e639b4..beb98c99 100644 --- a/benchmark/benchmark_device_batch_memcpy.cpp +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -30,8 +30,8 @@ #include "hipcub/hipcub.hpp" #ifdef __HIP_PLATFORM_AMD__ - // Only include this on AMD as it contains specialized config information - #include +// Only include this on AMD as it contains specialized config information +#include #endif #include @@ -45,7 +45,7 @@ #include constexpr uint32_t warmup_size = 5; -constexpr int32_t max_size = 1024 * 1024; +constexpr int32_t max_size = 1024 * 1024; constexpr int32_t wlev_min_size = 128; constexpr int32_t blev_min_size = 1024; @@ -56,7 +56,8 @@ constexpr int32_t blev_min_size = 1024; // have source and destinations mappings not be the identity function: // // batch_memcpy( -// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, +// d!) // [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) // [3 , 2 , 1 , 2 ]) // size // @@ -71,352 +72,310 @@ constexpr int32_t blev_min_size = 1024; // ┌───┬───┬───┬───┬───┬───┬───┬───┐ // │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' // └───┴───┴───┴───┴───┴───┴───┴───┘ -template -std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) -{ - const auto n = input.size(); - assert(n > 0); - - std::vector result(n); - std::vector permute(n); - - std::iota(permute.begin(), permute.end(), 0); - std::shuffle(permute.begin(), permute.end(), rng); - - for(T i = 0, sum = 0; i < n; ++i) - { - result[permute[i]] = sum; - sum += input[permute[i]]; - } - - return result; -} - -using offset_type = size_t; +template +std::vector shuffled_exclusive_scan(const std::vector &input, + RandomGenerator &rng) { + const auto n = input.size(); + assert(n > 0); -template -struct BatchMemcpyData -{ - size_t total_num_elements = 0; - ValueType* d_input = nullptr; - ValueType* d_output = nullptr; - ValueType** d_buffer_srcs = nullptr; - ValueType** d_buffer_dsts = nullptr; - BufferSizeType* d_buffer_sizes = nullptr; - - BatchMemcpyData() = default; - BatchMemcpyData(const BatchMemcpyData&) = delete; - - BatchMemcpyData(BatchMemcpyData&& other) - : total_num_elements{std::exchange(other.total_num_elements, 0)} - , d_input{std::exchange(other.d_input, nullptr)} - , d_output{std::exchange(other.d_output, nullptr)} - , d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)} - , d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)} - , d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} - {} - - BatchMemcpyData& operator=(BatchMemcpyData&& other) - { - total_num_elements = std::exchange(other.total_num_elements, 0); - d_input = std::exchange(other.d_input, nullptr); - d_output = std::exchange(other.d_output, nullptr); - d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); - d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); - d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); - return *this; - }; - - BatchMemcpyData& operator=(const BatchMemcpyData&) = delete; - - size_t total_num_bytes() const - { - return total_num_elements * sizeof(ValueType); - } + std::vector result(n); + std::vector permute(n); - ~BatchMemcpyData() - { - HIP_CHECK(hipFree(d_buffer_sizes)); - HIP_CHECK(hipFree(d_buffer_srcs)); - HIP_CHECK(hipFree(d_buffer_dsts)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_input)); - } -}; + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); -template -BatchMemcpyData prepare_data(const int32_t num_tlev_buffers = 1024, - const int32_t num_wlev_buffers = 1024, - const int32_t num_blev_buffers = 1024) -{ - const bool shuffle_buffers = false; - - BatchMemcpyData result; - const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; - - constexpr int32_t wlev_min_elems - = benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); - constexpr int32_t blev_min_elems - = benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); - constexpr int32_t max_elems = max_size / sizeof(ValueType); - - // Generate data - std::mt19937_64 rng(std::random_device{}()); - - // Number of elements in each buffer. - std::vector h_buffer_num_elements(num_buffers); - - auto iter = h_buffer_num_elements.begin(); - - iter = benchmark_utils::generate_random_data_n(iter, - num_tlev_buffers, - 1, - wlev_min_elems - 1, - rng); - iter = benchmark_utils::generate_random_data_n(iter, - num_wlev_buffers, - wlev_min_elems, - blev_min_elems - 1, - rng); - iter = benchmark_utils::generate_random_data_n(iter, - num_blev_buffers, - blev_min_elems, - max_elems, - rng); - - // Shuffle the sizes so that size classes aren't clustered - std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); - - // Get the byte size of each buffer - std::vector h_buffer_num_bytes(num_buffers); - for(size_t i = 0; i < num_buffers; ++i) - { - h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(ValueType); - } + for (T i = 0, sum = 0; i < n; ++i) { + result[permute[i]] = sum; + sum += input[permute[i]]; + } - result.total_num_elements - = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); - - // Generate data. - std::independent_bits_engine bits_engine{rng}; - - const size_t num_ints - = benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); - auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); - - std::for_each(reinterpret_cast(h_input.get()), - reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), - [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); - - HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); - HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); - - HIP_CHECK(hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType*))); - HIP_CHECK(hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType*))); - HIP_CHECK(hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); - - // Generate the source and shuffled destination offsets. - std::vector src_offsets; - std::vector dst_offsets; - - if(shuffle_buffers) - { - src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); - dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); - } else - { - src_offsets = std::vector(num_buffers); - dst_offsets = std::vector(num_buffers); - - // Consecutive offsets (no shuffling). - // src/dst offsets first element is 0, so skip that! - std::partial_sum(h_buffer_num_elements.begin(), - h_buffer_num_elements.end() - 1, - src_offsets.begin() + 1); - std::partial_sum(h_buffer_num_elements.begin(), - h_buffer_num_elements.end() - 1, - dst_offsets.begin() + 1); - } + return result; +} - // Generate the source and destination pointers. - std::vector h_buffer_srcs(num_buffers); - std::vector h_buffer_dsts(num_buffers); +using offset_type = size_t; - for(size_t i = 0; i < num_buffers; ++i) - { - h_buffer_srcs[i] = result.d_input + src_offsets[i]; - h_buffer_dsts[i] = result.d_output + dst_offsets[i]; - } +template struct BatchMemcpyData { + size_t total_num_elements = 0; + ValueType *d_input = nullptr; + ValueType *d_output = nullptr; + ValueType **d_buffer_srcs = nullptr; + ValueType **d_buffer_dsts = nullptr; + BufferSizeType *d_buffer_sizes = nullptr; + + BatchMemcpyData() = default; + BatchMemcpyData(const BatchMemcpyData &) = delete; + + BatchMemcpyData(BatchMemcpyData &&other) + : total_num_elements{std::exchange(other.total_num_elements, 0)}, + d_input{std::exchange(other.d_input, nullptr)}, d_output{std::exchange( + other.d_output, + nullptr)}, + d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)}, + d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)}, + d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} {} + + BatchMemcpyData &operator=(BatchMemcpyData &&other) { + total_num_elements = std::exchange(other.total_num_elements, 0); + d_input = std::exchange(other.d_input, nullptr); + d_output = std::exchange(other.d_output, nullptr); + d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); + d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); + d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); + return *this; + }; + + BatchMemcpyData &operator=(const BatchMemcpyData &) = delete; + + size_t total_num_bytes() const { + return total_num_elements * sizeof(ValueType); + } + + ~BatchMemcpyData() { + HIP_CHECK(hipFree(d_buffer_sizes)); + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + } +}; - // Prepare the batch memcpy. - HIP_CHECK( - hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(result.d_buffer_srcs, - h_buffer_srcs.data(), - h_buffer_srcs.size() * sizeof(ValueType*), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(result.d_buffer_dsts, - h_buffer_dsts.data(), - h_buffer_dsts.size() * sizeof(ValueType*), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(result.d_buffer_sizes, - h_buffer_num_bytes.data(), - h_buffer_num_bytes.size() * sizeof(BufferSizeType), - hipMemcpyHostToDevice)); - - return result; +template +BatchMemcpyData +prepare_data(const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) { + const bool shuffle_buffers = false; + + BatchMemcpyData result; + const size_t num_buffers = + num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + constexpr int32_t wlev_min_elems = + benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); + constexpr int32_t blev_min_elems = + benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); + constexpr int32_t max_elems = max_size / sizeof(ValueType); + + // Generate data + std::mt19937_64 rng(std::random_device{}()); + + // Number of elements in each buffer. + std::vector h_buffer_num_elements(num_buffers); + + auto iter = h_buffer_num_elements.begin(); + + iter = benchmark_utils::generate_random_data_n(iter, num_tlev_buffers, 1, + wlev_min_elems - 1, rng); + iter = benchmark_utils::generate_random_data_n( + iter, num_wlev_buffers, wlev_min_elems, blev_min_elems - 1, rng); + iter = benchmark_utils::generate_random_data_n( + iter, num_blev_buffers, blev_min_elems, max_elems, rng); + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + // Get the byte size of each buffer + std::vector h_buffer_num_bytes(num_buffers); + for (size_t i = 0; i < num_buffers; ++i) { + h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(ValueType); + } + + result.total_num_elements = std::accumulate( + h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); + + // Generate data. + std::independent_bits_engine bits_engine{rng}; + + const size_t num_ints = + benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); + auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); + + std::for_each( + reinterpret_cast(h_input.get()), + reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), + [&bits_engine](uint64_t &elem) { + ::new (&elem) uint64_t{bits_engine()}; + }); + + HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); + HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); + + HIP_CHECK( + hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType *))); + HIP_CHECK( + hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType *))); + HIP_CHECK( + hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); + + // Generate the source and shuffled destination offsets. + std::vector src_offsets; + std::vector dst_offsets; + + if (shuffle_buffers) { + src_offsets = + shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = + shuffled_exclusive_scan(h_buffer_num_elements, rng); + } else { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + // Consecutive offsets (no shuffling). + // src/dst offsets first element is 0, so skip that! + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, src_offsets.begin() + 1); + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, dst_offsets.begin() + 1); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for (size_t i = 0; i < num_buffers; ++i) { + h_buffer_srcs[i] = result.d_input + src_offsets[i]; + h_buffer_dsts[i] = result.d_output + dst_offsets[i]; + } + + // Prepare the batch memcpy. + HIP_CHECK(hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_srcs, h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(ValueType *), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_dsts, h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(ValueType *), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_sizes, h_buffer_num_bytes.data(), + h_buffer_num_bytes.size() * sizeof(BufferSizeType), + hipMemcpyHostToDevice)); + + return result; } -template -void run_benchmark(benchmark::State& state, - hipStream_t stream, - const int32_t num_tlev_buffers = 1024, - const int32_t num_wlev_buffers = 1024, - const int32_t num_blev_buffers = 1024) -{ - const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; - - size_t temp_storage_bytes = 0; - BatchMemcpyData data; - HIP_CHECK(hipcub::DeviceMemcpy::Batched(nullptr, - temp_storage_bytes, - data.d_buffer_srcs, - data.d_buffer_dsts, - data.d_buffer_sizes, - num_buffers)); - - void* d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); - - data = prepare_data(num_tlev_buffers, - num_wlev_buffers, - num_blev_buffers); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, - temp_storage_bytes, - data.d_buffer_srcs, - data.d_buffer_dsts, - data.d_buffer_sizes, - num_buffers, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - // HIP events creation - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - - for(auto _ : state) - { - // Record start event - HIP_CHECK(hipEventRecord(start, stream)); - - HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, - temp_storage_bytes, - data.d_buffer_srcs, - data.d_buffer_dsts, - data.d_buffer_sizes, - num_buffers, - stream)); - - // Record stop event and wait until it completes - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed_mseconds; - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); - state.SetIterationTime(elapsed_mseconds / 1000); - } - state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); - state.SetItemsProcessed(state.iterations() * data.total_num_elements); - - HIP_CHECK(hipFree(d_temp_storage)); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, + const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) { + const size_t num_buffers = + num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + size_t temp_storage_bytes = 0; + BatchMemcpyData data; + HIP_CHECK(hipcub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, data.d_buffer_srcs, data.d_buffer_dsts, + data.d_buffer_sizes, num_buffers)); + + void *d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + data = prepare_data( + num_tlev_buffers, num_wlev_buffers, num_blev_buffers); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(hipcub::DeviceMemcpy::Batched( + d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, + data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for (auto _ : state) { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + HIP_CHECK(hipcub::DeviceMemcpy::Batched( + d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, + data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream)); + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); + state.SetItemsProcessed(state.iterations() * data.total_num_elements); + + HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ - benchmark::RegisterBenchmark( \ - std::string("device_batch_memcpy." \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_benchmark, \ - T>(state, stream, num_tlev, num_wlev, num_blev); \ - } \ - ) - -#define BENCHMARK_TYPE(item_size, item_alignment) \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) - -int32_t main(int32_t argc, char* argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", 1024, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.set_optional("name_format", - "name_format", - "human", - "either: json,human,txt"); - - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int32_t trials = parser.get("trials"); - - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_device_adjacent_difference" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // HIP - hipStream_t stream = hipStreamDefault; // default - - // Benchmark info - benchmark::AddCustomContext("size", std::to_string(size)); - - // Add benchmarks - std::vector benchmarks; - - benchmarks = {BENCHMARK_TYPE(1, 1), - BENCHMARK_TYPE(1, 2), - BENCHMARK_TYPE(1, 4), - BENCHMARK_TYPE(1, 8), - BENCHMARK_TYPE(2, 2), - BENCHMARK_TYPE(4, 4), - BENCHMARK_TYPE(8, 8)}; - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ + benchmark::RegisterBenchmark( \ + std::string("device_batch_memcpy.") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_benchmark, T>( \ + state, stream, num_tlev, num_wlev, num_blev); \ + }) + +#define BENCHMARK_TYPE(item_size, item_alignment) \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) + +int32_t main(int32_t argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", 1024, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", "name_format", "human", + "either: json,human,txt"); + + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int32_t trials = parser.get("trials"); + + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_adjacent_difference" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // HIP + hipStream_t stream = hipStreamDefault; // default + + // Benchmark info + benchmark::AddCustomContext("size", std::to_string(size)); + + // Add benchmarks + std::vector benchmarks; + + benchmarks = {BENCHMARK_TYPE(1, 1), BENCHMARK_TYPE(1, 2), + BENCHMARK_TYPE(1, 4), BENCHMARK_TYPE(1, 8), + BENCHMARK_TYPE(2, 2), BENCHMARK_TYPE(4, 4), + BENCHMARK_TYPE(8, 8)}; + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_histogram.cpp b/benchmark/benchmark_device_histogram.cpp index 663ddd94..c8cb32b7 100644 --- a/benchmark/benchmark_device_histogram.cpp +++ b/benchmark/benchmark_device_histogram.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -23,7 +23,7 @@ // CUB's implementation of DeviceRunLengthEncode has unused parameters, // disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ - #pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wunused-parameter" #endif #include "common_benchmark_header.hpp" @@ -39,667 +39,512 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template -std::vector - generate(size_t size, int entropy_reduction, long long lower_level, long long upper_level) -{ - if(entropy_reduction >= 5) - { - return std::vector(size, (lower_level + upper_level) / 2); - } - - const size_t max_random_size = 1024 * 1024; - - std::random_device rd; - std::default_random_engine gen(rd()); - std::vector data(size); - std::generate(data.begin(), - data.begin() + std::min(size, max_random_size), - [&]() - { - // Reduce entropy by applying bitwise AND to random bits - // "An Improved Supercomputer Sorting Benchmark", 1992 - // Kurt Thearling & Stephen Smith - auto v = gen(); - for(int e = 0; e < entropy_reduction; e++) - { - v &= gen(); - } - return T(lower_level + v % (upper_level - lower_level)); - }); - for(size_t i = max_random_size; i < size; i += max_random_size) - { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); - } - return data; +template +std::vector generate(size_t size, int entropy_reduction, + long long lower_level, long long upper_level) { + if (entropy_reduction >= 5) { + return std::vector(size, (lower_level + upper_level) / 2); + } + + const size_t max_random_size = 1024 * 1024; + + std::random_device rd; + std::default_random_engine gen(rd()); + std::vector data(size); + std::generate(data.begin(), data.begin() + std::min(size, max_random_size), + [&]() { + // Reduce entropy by applying bitwise AND to random bits + // "An Improved Supercomputer Sorting Benchmark", 1992 + // Kurt Thearling & Stephen Smith + auto v = gen(); + for (int e = 0; e < entropy_reduction; e++) { + v &= gen(); + } + return T(lower_level + v % (upper_level - lower_level)); + }); + for (size_t i = max_random_size; i < size; i += max_random_size) { + std::copy_n(data.begin(), std::min(size - i, max_random_size), + data.begin() + i); + } + return data; } -int get_entropy_percents(int entropy_reduction) -{ - switch(entropy_reduction) - { - case 0: return 100; - case 1: return 81; - case 2: return 54; - case 3: return 33; - case 4: return 20; - default: return 0; - } +int get_entropy_percents(int entropy_reduction) { + switch (entropy_reduction) { + case 0: + return 100; + case 1: + return 81; + case 2: + return 54; + case 3: + return 33; + case 4: + return 20; + default: + return 0; + } } -const int entropy_reductions[] = { 0, 2, 4, 6 }; - -template -void run_even_benchmark(benchmark::State& state, - size_t bins, - size_t scale, - int entropy_reduction, - hipStream_t stream, - size_t size) -{ - using counter_type = unsigned int; - - const T lower_level = 0; - // casting for compilation with CUB backend because - // there is no casting from size_t (aka unsigned long) to __half - const T upper_level = static_cast(bins * scale); - - // Generate data - std::vector input = generate(size, entropy_reduction, lower_level, upper_level); - - T * d_input; - counter_type * d_histogram; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(hipcub::DeviceHistogram::HistogramEven(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_histogram, - bins + 1, - lower_level, - upper_level, - int(size), - stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(hipcub::DeviceHistogram::HistogramEven(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_histogram, - bins + 1, - lower_level, - upper_level, - int(size), - stream)); +const int entropy_reductions[] = {0, 2, 4, 6}; + +template +void run_even_benchmark(benchmark::State &state, size_t bins, size_t scale, + int entropy_reduction, hipStream_t stream, + size_t size) { + using counter_type = unsigned int; + + const T lower_level = 0; + // casting for compilation with CUB backend because + // there is no casting from size_t (aka unsigned long) to __half + const T upper_level = static_cast(bins * scale); + + // Generate data + std::vector input = + generate(size, entropy_reduction, lower_level, upper_level); + + T *d_input; + counter_type *d_histogram; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(hipcub::DeviceHistogram::HistogramEven( + d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, + bins + 1, lower_level, upper_level, int(size), stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(hipcub::DeviceHistogram::HistogramEven( + d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, + bins + 1, lower_level, upper_level, int(size), stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(hipcub::DeviceHistogram::HistogramEven( + d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, + bins + 1, lower_level, upper_level, int(size), stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(hipcub::DeviceHistogram::HistogramEven(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_histogram, - bins + 1, - lower_level, - upper_level, - int(size), - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_histogram)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_histogram)); } -template -void run_multi_even_benchmark(benchmark::State& state, - size_t bins, - size_t scale, - int entropy_reduction, - hipStream_t stream, - size_t size) -{ - using counter_type = unsigned int; - - int num_levels[ActiveChannels]; - int lower_level[ActiveChannels]; - int upper_level[ActiveChannels]; - for(unsigned int channel = 0; channel < ActiveChannels; channel++) - { - lower_level[channel] = 0; - upper_level[channel] = bins * scale; - num_levels[channel] = bins + 1; - } - - // Generate data - std::vector input = generate(size * Channels, entropy_reduction, lower_level[0], upper_level[0]); - - T * d_input; - counter_type * d_histogram[ActiveChannels]; - HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); - for(unsigned int channel = 0; channel < ActiveChannels; channel++) - { - HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type))); - } +template +void run_multi_even_benchmark(benchmark::State &state, size_t bins, + size_t scale, int entropy_reduction, + hipStream_t stream, size_t size) { + using counter_type = unsigned int; + + int num_levels[ActiveChannels]; + int lower_level[ActiveChannels]; + int upper_level[ActiveChannels]; + for (unsigned int channel = 0; channel < ActiveChannels; channel++) { + lower_level[channel] = 0; + upper_level[channel] = bins * scale; + num_levels[channel] = bins + 1; + } + + // Generate data + std::vector input = generate(size * Channels, entropy_reduction, + lower_level[0], upper_level[0]); + + T *d_input; + counter_type *d_histogram[ActiveChannels]; + HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); + for (unsigned int channel = 0; channel < ActiveChannels; channel++) { + HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type))); + } + HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), + hipMemcpyHostToDevice)); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK( + (hipcub::DeviceHistogram::MultiHistogramEven( + d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, + num_levels, lower_level, upper_level, int(size), stream))); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * Channels * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramEven( - d_temporary_storage, - temporary_storage_bytes, - d_input, - d_histogram, - num_levels, - lower_level, - upper_level, - int(size), - stream))); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramEven( - d_temporary_storage, - temporary_storage_bytes, - d_input, - d_histogram, - num_levels, - lower_level, - upper_level, - int(size), - stream))); + (hipcub::DeviceHistogram::MultiHistogramEven( + d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, + num_levels, lower_level, upper_level, int(size), stream))); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(( + hipcub::DeviceHistogram::MultiHistogramEven( + d_temporary_storage, temporary_storage_bytes, d_input, + d_histogram, num_levels, lower_level, upper_level, int(size), + stream))); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramEven( - d_temporary_storage, - temporary_storage_bytes, - d_input, - d_histogram, - num_levels, - lower_level, - upper_level, - int(size), - stream))); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_input)); - for(unsigned int channel = 0; channel < ActiveChannels; channel++) - { - HIP_CHECK(hipFree(d_histogram[channel])); - } + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * + sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_input)); + for (unsigned int channel = 0; channel < ActiveChannels; channel++) { + HIP_CHECK(hipFree(d_histogram[channel])); + } } -template -void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t stream, size_t size) -{ - using counter_type = unsigned int; - - // Generate data - std::vector input = benchmark_utils::get_random_data(size, 0, bins); - - std::vector levels(bins + 1); - std::iota(levels.begin(), levels.end(), static_cast(0)); - - T * d_input; - T * d_levels; - counter_type * d_histogram; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(T))); - HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_levels, levels.data(), - (bins + 1) * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(hipcub::DeviceHistogram::HistogramRange(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_histogram, - bins + 1, - d_levels, - int(size), - stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(hipcub::DeviceHistogram::HistogramRange(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_histogram, - bins + 1, - d_levels, - int(size), - stream)); +template +void run_range_benchmark(benchmark::State &state, size_t bins, + hipStream_t stream, size_t size) { + using counter_type = unsigned int; + + // Generate data + std::vector input = benchmark_utils::get_random_data(size, 0, bins); + + std::vector levels(bins + 1); + std::iota(levels.begin(), levels.end(), static_cast(0)); + + T *d_input; + T *d_levels; + counter_type *d_histogram; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(T))); + HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_levels, levels.data(), (bins + 1) * sizeof(T), + hipMemcpyHostToDevice)); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(hipcub::DeviceHistogram::HistogramRange( + d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, + bins + 1, d_levels, int(size), stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(hipcub::DeviceHistogram::HistogramRange( + d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, + bins + 1, d_levels, int(size), stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(hipcub::DeviceHistogram::HistogramRange( + d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, + bins + 1, d_levels, int(size), stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(hipcub::DeviceHistogram::HistogramRange(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_histogram, - bins + 1, - d_levels, - int(size), - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_levels)); - HIP_CHECK(hipFree(d_histogram)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_levels)); + HIP_CHECK(hipFree(d_histogram)); } -template -void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t stream, size_t size) -{ - using counter_type = unsigned int; - - // Number of levels for a single channel - const int num_levels_channel = bins + 1; - int num_levels[ActiveChannels]; - std::vector levels[ActiveChannels]; - for (unsigned int channel = 0; channel < ActiveChannels; channel++) - { - levels[channel].resize(num_levels_channel); - std::iota(levels[channel].begin(), levels[channel].end(), static_cast(0)); - num_levels[channel] = num_levels_channel; - } - - // Generate data - std::vector input = benchmark_utils::get_random_data(size * Channels, 0, bins); - - T * d_input; - T * d_levels[ActiveChannels]; - counter_type * d_histogram[ActiveChannels]; - HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); - for(unsigned int channel = 0; channel < ActiveChannels; channel++) - { - HIP_CHECK(hipMalloc(&d_levels[channel], num_levels_channel * sizeof(T))); - HIP_CHECK(hipMalloc(&d_histogram[channel], size * sizeof(counter_type))); - } - +template +void run_multi_range_benchmark(benchmark::State &state, size_t bins, + hipStream_t stream, size_t size) { + using counter_type = unsigned int; + + // Number of levels for a single channel + const int num_levels_channel = bins + 1; + int num_levels[ActiveChannels]; + std::vector levels[ActiveChannels]; + for (unsigned int channel = 0; channel < ActiveChannels; channel++) { + levels[channel].resize(num_levels_channel); + std::iota(levels[channel].begin(), levels[channel].end(), + static_cast(0)); + num_levels[channel] = num_levels_channel; + } + + // Generate data + std::vector input = + benchmark_utils::get_random_data(size * Channels, 0, bins); + + T *d_input; + T *d_levels[ActiveChannels]; + counter_type *d_histogram[ActiveChannels]; + HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); + for (unsigned int channel = 0; channel < ActiveChannels; channel++) { + HIP_CHECK(hipMalloc(&d_levels[channel], num_levels_channel * sizeof(T))); + HIP_CHECK(hipMalloc(&d_histogram[channel], size * sizeof(counter_type))); + } + + HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), + hipMemcpyHostToDevice)); + for (unsigned int channel = 0; channel < ActiveChannels; channel++) { + HIP_CHECK(hipMemcpy(d_levels[channel], levels[channel].data(), + num_levels_channel * sizeof(T), hipMemcpyHostToDevice)); + } + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK( + (hipcub::DeviceHistogram::MultiHistogramRange( + d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, + num_levels, d_levels, int(size), stream))); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * Channels * sizeof(T), - hipMemcpyHostToDevice - ) - ); - for(unsigned int channel = 0; channel < ActiveChannels; channel++) - { - HIP_CHECK( - hipMemcpy( - d_levels[channel], levels[channel].data(), - num_levels_channel * sizeof(T), - hipMemcpyHostToDevice - ) - ); - } - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange( - d_temporary_storage, - temporary_storage_bytes, - d_input, - d_histogram, - num_levels, - d_levels, - int(size), - stream))); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange( - d_temporary_storage, - temporary_storage_bytes, - d_input, - d_histogram, - num_levels, - d_levels, - int(size), - stream))); + (hipcub::DeviceHistogram::MultiHistogramRange( + d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, + num_levels, d_levels, int(size), stream))); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange( + d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, + num_levels, d_levels, int(size), stream))); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange( - d_temporary_storage, - temporary_storage_bytes, - d_input, - d_histogram, - num_levels, - d_levels, - int(size), - stream))); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_input)); - for(unsigned int channel = 0; channel < ActiveChannels; channel++) - { - HIP_CHECK(hipFree(d_levels[channel])); - HIP_CHECK(hipFree(d_histogram[channel])); - } + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * + sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_input)); + for (unsigned int channel = 0; channel < ActiveChannels; channel++) { + HIP_CHECK(hipFree(d_levels[channel])); + HIP_CHECK(hipFree(d_histogram[channel])); + } } -template -struct num_limits -{ - static constexpr T max() - { - return std::numeric_limits::max(); - }; +template struct num_limits { + static constexpr T max() { return std::numeric_limits::max(); }; }; -template<> -struct num_limits<__half> -{ - static constexpr double max() - { - return 65504.0; - }; +template <> struct num_limits<__half> { + static constexpr double max() { return 65504.0; }; }; -#define CREATE_EVEN_BENCHMARK(VECTOR, T, BINS, SCALE) \ - if(num_limits::max() > BINS * SCALE){ \ - VECTOR.push_back( \ - benchmark::RegisterBenchmark( \ - std::string("device_histogram_even" \ - "." \ - "(entropy_percent:" \ - + std::to_string(get_entropy_percents(entropy_reduction)) \ - + "%,bin_count:" \ - + std::to_string(BINS) \ - + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_even_benchmark(state, BINS, SCALE, entropy_reduction, stream, size); \ - } \ - ) \ - ); \ - } - -#define BENCHMARK_TYPE(VECTOR, T) \ - CREATE_EVEN_BENCHMARK(VECTOR, T, 10, 1234); \ - CREATE_EVEN_BENCHMARK(VECTOR, T, 100, 1234); \ - CREATE_EVEN_BENCHMARK(VECTOR, T, 1000, 1234); \ - CREATE_EVEN_BENCHMARK(VECTOR, T, 16, 10); \ - CREATE_EVEN_BENCHMARK(VECTOR, T, 256, 10); \ - CREATE_EVEN_BENCHMARK(VECTOR, T, 65536, 1) - -void add_even_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - for(int entropy_reduction : entropy_reductions) - { - BENCHMARK_TYPE(benchmarks, long long); - BENCHMARK_TYPE(benchmarks, int); - BENCHMARK_TYPE(benchmarks, unsigned short); - BENCHMARK_TYPE(benchmarks, uint8_t); - BENCHMARK_TYPE(benchmarks, double); - BENCHMARK_TYPE(benchmarks, float); - //this limitation can be removed once https://github.com/NVIDIA/cub/issues/484 is fixed +#define CREATE_EVEN_BENCHMARK(VECTOR, T, BINS, SCALE) \ + if (num_limits::max() > BINS * SCALE) { \ + VECTOR.push_back(benchmark::RegisterBenchmark( \ + std::string("device_histogram_even" \ + "." \ + "(entropy_percent:" + \ + std::to_string(get_entropy_percents(entropy_reduction)) + \ + "%,bin_count:" + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_even_benchmark(state, BINS, SCALE, entropy_reduction, stream, \ + size); \ + })); \ + } + +#define BENCHMARK_TYPE(VECTOR, T) \ + CREATE_EVEN_BENCHMARK(VECTOR, T, 10, 1234); \ + CREATE_EVEN_BENCHMARK(VECTOR, T, 100, 1234); \ + CREATE_EVEN_BENCHMARK(VECTOR, T, 1000, 1234); \ + CREATE_EVEN_BENCHMARK(VECTOR, T, 16, 10); \ + CREATE_EVEN_BENCHMARK(VECTOR, T, 256, 10); \ + CREATE_EVEN_BENCHMARK(VECTOR, T, 65536, 1) + +void add_even_benchmarks( + std::vector &benchmarks, + hipStream_t stream, size_t size) { + for (int entropy_reduction : entropy_reductions) { + BENCHMARK_TYPE(benchmarks, long long); + BENCHMARK_TYPE(benchmarks, int); + BENCHMARK_TYPE(benchmarks, unsigned short); + BENCHMARK_TYPE(benchmarks, uint8_t); + BENCHMARK_TYPE(benchmarks, double); + BENCHMARK_TYPE(benchmarks, float); + // this limitation can be removed once + // https://github.com/NVIDIA/cub/issues/484 is fixed #ifdef __HIP_PLATFORM_AMD__ - BENCHMARK_TYPE(benchmarks, __half); + BENCHMARK_TYPE(benchmarks, __half); #endif - }; + }; } -#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ - benchmark::RegisterBenchmark( \ - std::string("device_multi_histogram_even" \ - "." \ - "(entropy_percent:" \ - + std::to_string(get_entropy_percents(entropy_reduction)) \ - + "%,bin_count:" + \ - std::to_string(BINS) \ - + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_multi_even_benchmark( \ - state, BINS, SCALE, entropy_reduction, stream, size \ - ); \ - } \ - ) - -void add_multi_even_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - for(int entropy_reduction : entropy_reductions) - { - std::vector bs = - { - CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 10, 1234), - CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 100, 1234), - - CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 16, 10), - CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 256, 1), - - CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 16, 10), - CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 256, 10), - CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 65536, 1), - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ + benchmark::RegisterBenchmark( \ + std::string("device_multi_histogram_even" \ + "." \ + "(entropy_percent:" + \ + std::to_string(get_entropy_percents(entropy_reduction)) + \ + "%,bin_count:" + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_multi_even_benchmark( \ + state, BINS, SCALE, entropy_reduction, stream, size); \ + }) + +void add_multi_even_benchmarks( + std::vector &benchmarks, + hipStream_t stream, size_t size) { + for (int entropy_reduction : entropy_reductions) { + std::vector bs = { + CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 10, 1234), + CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 100, 1234), + + CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 16, 10), + CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 256, 1), + + CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 16, 10), + CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 256, 10), + CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 65536, 1), }; -} - -#define CREATE_RANGE_BENCHMARK(T, BINS) \ - benchmark::RegisterBenchmark( \ - std::string("device_histogram_range" \ - "." \ - "(bin_count:" \ - + std::to_string(BINS) \ - + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_range_benchmark(state, BINS, stream, size); \ - } \ - ) - -#define BENCHMARK_RANGE_TYPE(T) \ - CREATE_RANGE_BENCHMARK(T, 10), CREATE_RANGE_BENCHMARK(T, 100), \ - CREATE_RANGE_BENCHMARK(T, 1000), CREATE_RANGE_BENCHMARK(T, 10000), \ - CREATE_RANGE_BENCHMARK(T, 100000), CREATE_RANGE_BENCHMARK(T, 1000000) - -void add_range_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - std::vector bs - = {BENCHMARK_RANGE_TYPE(float), BENCHMARK_RANGE_TYPE(double)}; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); + }; } -#define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ - benchmark::RegisterBenchmark( \ - std::string("device_multi_histogram_range" \ - ".(bin_count:" \ - + std::to_string(BINS) \ - + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_multi_range_benchmark( \ - state, BINS, stream, size \ - ); \ - } \ - ) - -void add_multi_range_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - std::vector bs = - { - CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10), - CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100), - CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000), - CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10000), - CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100000), - CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000000), - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_RANGE_BENCHMARK(T, BINS) \ + benchmark::RegisterBenchmark(std::string("device_histogram_range" \ + "." \ + "(bin_count:" + \ + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_range_benchmark(state, BINS, stream, \ + size); \ + }) + +#define BENCHMARK_RANGE_TYPE(T) \ + CREATE_RANGE_BENCHMARK(T, 10), CREATE_RANGE_BENCHMARK(T, 100), \ + CREATE_RANGE_BENCHMARK(T, 1000), CREATE_RANGE_BENCHMARK(T, 10000), \ + CREATE_RANGE_BENCHMARK(T, 100000), CREATE_RANGE_BENCHMARK(T, 1000000) + +void add_range_benchmarks( + std::vector &benchmarks, + hipStream_t stream, size_t size) { + std::vector bs = { + BENCHMARK_RANGE_TYPE(float), BENCHMARK_RANGE_TYPE(double)}; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_histogram" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_even_benchmarks(benchmarks, stream, size); - add_multi_even_benchmarks(benchmarks, stream, size); - add_range_benchmarks(benchmarks, stream, size); - add_multi_range_benchmarks(benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } +#define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ + benchmark::RegisterBenchmark( \ + std::string("device_multi_histogram_range" \ + ".(bin_count:" + \ + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_multi_range_benchmark(state, BINS, \ + stream, size); \ + }) + +void add_multi_range_benchmarks( + std::vector &benchmarks, + hipStream_t stream, size_t size) { + std::vector bs = { + CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10), + CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100), + CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000), + CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10000), + CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100000), + CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000000), + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +} - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_histogram" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_even_benchmarks(benchmarks, stream, size); + add_multi_even_benchmarks(benchmarks, stream, size); + add_range_benchmarks(benchmarks, stream, size); + add_multi_range_benchmarks(benchmarks, stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index 027f591f..0c16d653 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -26,408 +26,347 @@ #include "hipcub/block/block_scan.hpp" #include "hipcub/block/block_store.hpp" -enum memory_operation_method -{ - direct, - striped, - vectorize, - transpose, - warp_transpose +enum memory_operation_method { + direct, + striped, + vectorize, + transpose, + warp_transpose }; -enum kernel_operation -{ - no_operation, - block_scan, - custom_operation, - atomics_no_collision, - atomics_inter_block_collision, - atomics_inter_warp_collision, +enum kernel_operation { + no_operation, + block_scan, + custom_operation, + atomics_no_collision, + atomics_inter_block_collision, + atomics_inter_warp_collision, }; -struct empty_storage_type -{}; +struct empty_storage_type {}; -template +template struct operation; // no operation -template -struct operation -{ - typedef empty_storage_type storage_type; - - HIPCUB_DEVICE inline void - operator()(storage_type& /*storage*/, T (&)[ItemsPerThread], T* = nullptr) const - {} +template +struct operation { + typedef empty_storage_type storage_type; + + HIPCUB_DEVICE inline void operator()(storage_type & /*storage*/, + T (&)[ItemsPerThread], + T * = nullptr) const {} }; // custom operation -template -struct operation -{ - typedef empty_storage_type storage_type; - - HIPCUB_DEVICE inline void operator()(storage_type& storage, - T (&input)[ItemsPerThread], - T* global_mem_output = nullptr) const - { - (void)storage; - (void)global_mem_output; +template +struct operation { + typedef empty_storage_type storage_type; + + HIPCUB_DEVICE inline void operator()(storage_type &storage, + T (&input)[ItemsPerThread], + T *global_mem_output = nullptr) const { + (void)storage; + (void)global_mem_output; #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - input[i] = input[i] + 666; - constexpr unsigned int repeats = 30; + for (unsigned int i = 0; i < ItemsPerThread; i++) { + input[i] = input[i] + 666; + constexpr unsigned int repeats = 30; #pragma unroll - for(unsigned int j = 0; j < repeats; j++) - { - input[i] = input[i] * (input[j % ItemsPerThread]); - } - } + for (unsigned int j = 0; j < repeats; j++) { + input[i] = input[i] * (input[j % ItemsPerThread]); + } } + } }; // block scan -template -struct operation -{ - typedef - typename hipcub::BlockScan - block_scan_type; - typedef typename block_scan_type::TempStorage storage_type; - - HIPCUB_DEVICE inline void operator()(storage_type& storage, - T (&input)[ItemsPerThread], - T* global_mem_output = nullptr) - { - (void)global_mem_output; - - // sync before re-using shared memory from load - __syncthreads(); - block_scan_type(storage).InclusiveScan(input, input, hipcub::Sum()); - } +template +struct operation { + typedef typename hipcub::BlockScan< + T, BlockSize, hipcub::BlockScanAlgorithm::BLOCK_SCAN_WARP_SCANS> + block_scan_type; + typedef typename block_scan_type::TempStorage storage_type; + + HIPCUB_DEVICE inline void operator()(storage_type &storage, + T (&input)[ItemsPerThread], + T *global_mem_output = nullptr) { + (void)global_mem_output; + + // sync before re-using shared memory from load + __syncthreads(); + block_scan_type(storage).InclusiveScan(input, input, hipcub::Sum()); + } }; // atomics_no_collision -template -struct operation -{ - typedef empty_storage_type storage_type; - - HIPCUB_DEVICE inline void operator()(storage_type& storage, - T (&input)[ItemsPerThread], - T* global_mem_output = nullptr) - { - (void)storage; - (void)input; - - const unsigned int index - = threadIdx.x * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; +template +struct operation { + typedef empty_storage_type storage_type; + + HIPCUB_DEVICE inline void operator()(storage_type &storage, + T (&input)[ItemsPerThread], + T *global_mem_output = nullptr) { + (void)storage; + (void)input; + + const unsigned int index = + threadIdx.x * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - atomicAdd(&global_mem_output[index + i], T(666)); - } + for (unsigned int i = 0; i < ItemsPerThread; i++) { + atomicAdd(&global_mem_output[index + i], T(666)); } + } }; // atomics_inter_block_collision -template -struct operation -{ - typedef empty_storage_type storage_type; - - HIPCUB_DEVICE inline void operator()(storage_type& storage, - T (&input)[ItemsPerThread], - T* global_mem_output = nullptr) - { - (void)storage; - (void)input; - - const unsigned int index - = (threadIdx.x % warpSize) * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; +template +struct operation { + typedef empty_storage_type storage_type; + + HIPCUB_DEVICE inline void operator()(storage_type &storage, + T (&input)[ItemsPerThread], + T *global_mem_output = nullptr) { + (void)storage; + (void)input; + + const unsigned int index = (threadIdx.x % warpSize) * ItemsPerThread + + blockIdx.x * blockDim.x * ItemsPerThread; #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - atomicAdd(&global_mem_output[index + i], T(666)); - } + for (unsigned int i = 0; i < ItemsPerThread; i++) { + atomicAdd(&global_mem_output[index + i], T(666)); } + } }; // atomics_inter_block_collision -template -struct operation -{ - typedef empty_storage_type storage_type; - - HIPCUB_DEVICE inline void operator()(storage_type& storage, - T (&input)[ItemsPerThread], - T* global_mem_output = nullptr) - { - (void)storage; - (void)input; - - const unsigned int index = threadIdx.x * ItemsPerThread; +template +struct operation { + typedef empty_storage_type storage_type; + + HIPCUB_DEVICE inline void operator()(storage_type &storage, + T (&input)[ItemsPerThread], + T *global_mem_output = nullptr) { + (void)storage; + (void)input; + + const unsigned int index = threadIdx.x * ItemsPerThread; #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - atomicAdd(&global_mem_output[index + i], T(666)); - } + for (unsigned int i = 0; i < ItemsPerThread; i++) { + atomicAdd(&global_mem_output[index + i], T(666)); } + } }; -template -struct memory_operation -{}; - -template<> -struct memory_operation -{ - static constexpr hipcub::BlockLoadAlgorithm load_type - = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT; - static constexpr hipcub::BlockStoreAlgorithm store_type - = hipcub::BlockStoreAlgorithm::BLOCK_STORE_DIRECT; +template struct memory_operation {}; + +template <> struct memory_operation { + static constexpr hipcub::BlockLoadAlgorithm load_type = + hipcub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT; + static constexpr hipcub::BlockStoreAlgorithm store_type = + hipcub::BlockStoreAlgorithm::BLOCK_STORE_DIRECT; }; -template<> -struct memory_operation -{ - static constexpr hipcub::BlockLoadAlgorithm load_type - = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED; - static constexpr hipcub::BlockStoreAlgorithm store_type - = hipcub::BlockStoreAlgorithm::BLOCK_STORE_STRIPED; +template <> struct memory_operation { + static constexpr hipcub::BlockLoadAlgorithm load_type = + hipcub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED; + static constexpr hipcub::BlockStoreAlgorithm store_type = + hipcub::BlockStoreAlgorithm::BLOCK_STORE_STRIPED; }; -template<> -struct memory_operation -{ - static constexpr hipcub::BlockLoadAlgorithm load_type - = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE; - static constexpr hipcub::BlockStoreAlgorithm store_type - = hipcub::BlockStoreAlgorithm::BLOCK_STORE_VECTORIZE; +template <> struct memory_operation { + static constexpr hipcub::BlockLoadAlgorithm load_type = + hipcub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE; + static constexpr hipcub::BlockStoreAlgorithm store_type = + hipcub::BlockStoreAlgorithm::BLOCK_STORE_VECTORIZE; }; -template<> -struct memory_operation -{ - static constexpr hipcub::BlockLoadAlgorithm load_type - = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE; - static constexpr hipcub::BlockStoreAlgorithm store_type - = hipcub::BlockStoreAlgorithm::BLOCK_STORE_TRANSPOSE; +template <> struct memory_operation { + static constexpr hipcub::BlockLoadAlgorithm load_type = + hipcub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE; + static constexpr hipcub::BlockStoreAlgorithm store_type = + hipcub::BlockStoreAlgorithm::BLOCK_STORE_TRANSPOSE; }; -template<> -struct memory_operation -{ - static constexpr hipcub::BlockLoadAlgorithm load_type - = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr hipcub::BlockStoreAlgorithm store_type - = hipcub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE; +template <> struct memory_operation { + static constexpr hipcub::BlockLoadAlgorithm load_type = + hipcub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE; + static constexpr hipcub::BlockStoreAlgorithm store_type = + hipcub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE; }; -template -__global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) -{ - typedef memory_operation mem_op; - typedef hipcub::BlockLoad load_type; - typedef hipcub::BlockStore store_type; - - __shared__ union - { - typename load_type::TempStorage load; - typename store_type::TempStorage store; - typename CustomOp::storage_type operand; - } storage; - - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int offset = blockIdx.x * items_per_block; - - T items[ItemsPerThread]; - load_type(storage.load).Load(input + offset, items); - - op(storage.operand, items, output); - // sync before re-using shared memory from load or from operand - __syncthreads(); - store_type(storage.store).Store(output + offset, items); +template +__global__ __launch_bounds__(BlockSize) void operation_kernel(T *input, + T *output, + CustomOp op) { + typedef memory_operation mem_op; + typedef hipcub::BlockLoad + load_type; + typedef hipcub::BlockStore + store_type; + + __shared__ union { + typename load_type::TempStorage load; + typename store_type::TempStorage store; + typename CustomOp::storage_type operand; + } storage; + + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + const unsigned int offset = blockIdx.x * items_per_block; + + T items[ItemsPerThread]; + load_type(storage.load).Load(input + offset, items); + + op(storage.operand, items, output); + // sync before re-using shared memory from load or from operand + __syncthreads(); + store_type(storage.store).Store(output + offset, items); } -template -void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream) -{ - const size_t grid_size = size / (BlockSize * ItemsPerThread); - std::vector input; - if(std::is_floating_point::value) - { - input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); - } - else - { - input = benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - } - T* d_input; - T* d_output; - HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); - HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - operation selected_operation; - - // Warm-up - for(size_t i = 0; i < 10; i++) - { - hipLaunchKernelGGL(HIP_KERNEL_NAME(operation_kernel), - dim3(grid_size), - dim3(BlockSize), - 0, - stream, - d_input, - d_output, - selected_operation); - } - HIP_CHECK(hipDeviceSynchronize()); - - // HIP events creation - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - - const unsigned int batch_size = 10; - for(auto _ : state) - { - // Record start event - HIP_CHECK(hipEventRecord(start, stream)); - - for(size_t i = 0; i < batch_size; i++) - { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(operation_kernel), - dim3(grid_size), - dim3(BlockSize), - 0, - stream, - d_input, - d_output, - selected_operation); - } - - // Record stop event and wait until it completes - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed_mseconds; - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); - state.SetIterationTime(elapsed_mseconds / 1000); +template +void run_benchmark(benchmark::State &state, size_t size, + const hipStream_t stream) { + const size_t grid_size = size / (BlockSize * ItemsPerThread); + std::vector input; + if (std::is_floating_point::value) { + input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); + } else { + input = benchmark_utils::get_random_data( + size, std::numeric_limits::min(), std::numeric_limits::max()); + } + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + operation selected_operation; + + // Warm-up + for (size_t i = 0; i < 10; i++) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(operation_kernel), + dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output, + selected_operation); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + const unsigned int batch_size = 10; + for (auto _ : state) { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + for (size_t i = 0; i < batch_size; i++) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + operation_kernel), + dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output, + selected_operation); } - // Destroy HIP events - HIP_CHECK(hipEventDestroy(start)); - HIP_CHECK(hipEventDestroy(stop)); + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + + // Destroy HIP events + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -template -void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_t stream) -{ - // Allocate device buffers - // Note: since this benchmark only tests memcpy performance between device buffers, - // we don't really need to copy data into these from the host - whatever happens - // to be in memory will suffice. - T* d_input; - T* d_output; - HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); - HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); - - // Warm-up - for(size_t i = 0; i < 10; i++) - { - HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), hipMemcpyDeviceToDevice)); - } - HIP_CHECK(hipDeviceSynchronize()); - - // HIP events creation - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - - const unsigned int batch_size = 10; - for(auto _ : state) - { - // Record start event - HIP_CHECK(hipEventRecord(start, stream)); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), hipMemcpyDeviceToDevice)); - } - - // Record stop event and wait until it completes - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed_mseconds; - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); - state.SetIterationTime(elapsed_mseconds / 1000); +template +void run_benchmark_memcpy(benchmark::State &state, size_t size, + const hipStream_t stream) { + // Allocate device buffers + // Note: since this benchmark only tests memcpy performance between device + // buffers, we don't really need to copy data into these from the host - + // whatever happens to be in memory will suffice. + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); + + // Warm-up + for (size_t i = 0; i < 10; i++) { + HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), + hipMemcpyDeviceToDevice)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + const unsigned int batch_size = 10; + for (auto _ : state) { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), + hipMemcpyDeviceToDevice)); } - // Destroy HIP events - HIP_CHECK(hipEventDestroy(start)); - HIP_CHECK(hipEventDestroy(stop)); + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); + // Destroy HIP events + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BS, IPT) \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - std::string("device_memory." \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_benchmark(state, SIZE, stream); \ - } \ - ) \ - ); \ - -#define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - std::string("device_memory_memcpy." \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_benchmark_memcpy(state, SIZE, stream); \ - } \ - ) \ - ); \ +#define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BS, IPT) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_memory.") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_benchmark(state, SIZE, stream); \ + })); + +#define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_memory_memcpy.") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_benchmark_memcpy(state, SIZE, stream); \ + })); // clang-format off #define CREATE_BENCHMARK_BLOCK_SIZE(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE) \ @@ -447,65 +386,59 @@ void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_ CREATE_BENCHMARK_MEM_OP(warp_transpose, OP, TYPE, SIZE) // clang-format on -template -constexpr unsigned int megabytes(unsigned int size) -{ - return (size * (1024 * 1024 / sizeof(T))); +template constexpr unsigned int megabytes(unsigned int size) { + return (size * (1024 * 1024 / sizeof(T))); } -int main(int argc, char* argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); - // Parse argv - benchmark::Initialize(&argc, argv); - const int trials = parser.get("trials"); + // Parse argv + benchmark::Initialize(&argc, argv); + const int trials = parser.get("trials"); - std::cout << "benchmark_device_memory" << std::endl; + std::cout << "benchmark_device_memory" << std::endl; - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; - // Add benchmarks - std::vector benchmarks; + // Add benchmarks + std::vector benchmarks; - // Simple memory copy from device to device, not running a kernel - CREATE_BENCHMARK_MEMCPY(int, megabytes(128)) + // Simple memory copy from device to device, not running a kernel + CREATE_BENCHMARK_MEMCPY(int, megabytes(128)) - // clang-format off + // clang-format off CREATE_BENCHMARK(no_operation, int, megabytes(128)) CREATE_BENCHMARK(block_scan, int, megabytes(128)) CREATE_BENCHMARK(custom_operation, int, megabytes(128)) CREATE_BENCHMARK(atomics_no_collision, int, megabytes(128)) CREATE_BENCHMARK(atomics_inter_block_collision, int, megabytes(128)) CREATE_BENCHMARK(atomics_inter_warp_collision, int, megabytes(128)) - // clang-format on - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } + // clang-format on + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); - return 0; + return 0; } diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp index 38baada6..4cebcbd1 100644 --- a/benchmark/benchmark_device_merge_sort.cpp +++ b/benchmark/benchmark_device_merge_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -23,8 +23,8 @@ #include "common_benchmark_header.hpp" // HIP API -#include "hipcub/hipcub.hpp" #include "hipcub/device/device_merge_sort.hpp" +#include "hipcub/hipcub.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 32 << 20; @@ -33,311 +33,246 @@ const size_t DEFAULT_N = 32 << 20; const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template -std::vector generate_keys(size_t size) -{ - using key_type = Key; - - if(std::is_floating_point::value) - { - return benchmark_utils::get_random_data(size, static_cast(-1000), static_cast(1000), size); - } - else - { - return benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max(), - size - ); - } +template std::vector generate_keys(size_t size) { + using key_type = Key; + + if (std::is_floating_point::value) { + return benchmark_utils::get_random_data( + size, static_cast(-1000), static_cast(1000), size); + } else { + return benchmark_utils::get_random_data( + size, std::numeric_limits::min(), + std::numeric_limits::max(), size); + } } -template -void run_sort_keys_benchmark(benchmark::State& state, - hipStream_t stream, - size_t size) -{ - using key_type = Key; - auto compare_function = [] __device__ (const key_type & a, const key_type & b) { return a < b; }; - - auto keys_input = generate_keys(size); - - key_type * d_keys_input; - key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK( - hipcub::DeviceMergeSort::SortKeysCopy( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - compare_function, stream - ) - ); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK( - hipcub::DeviceMergeSort::SortKeysCopy( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - compare_function, stream - ) - ); +template +void run_sort_keys_benchmark(benchmark::State &state, hipStream_t stream, + size_t size) { + using key_type = Key; + auto compare_function = [] __device__(const key_type &a, const key_type &b) { + return a < b; + }; + + auto keys_input = generate_keys(size); + + key_type *d_keys_input; + key_type *d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), + hipMemcpyHostToDevice)); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy( + d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, + size, compare_function, stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy( + d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_keys_output, size, compare_function, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy( + d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_keys_output, size, compare_function, stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK( - hipcub::DeviceMergeSort::SortKeysCopy( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - compare_function, stream - ) - ); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + sizeof(key_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); } -template -void run_sort_pairs_benchmark(benchmark::State& state, - hipStream_t stream, - size_t size) -{ - using key_type = Key; - using value_type = Value; - auto compare_function = [] __device__ (const key_type & a, const key_type & b) { return a < b; }; - - auto keys_input = generate_keys(size); - std::vector values_input(size); - for(size_t i = 0; i < size; i++) - { - values_input[i] = value_type(i); - } - - key_type * d_keys_input; - key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK( - hipcub::DeviceMergeSort::SortPairsCopy( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_values_input, d_keys_output, d_values_output, size, - compare_function, stream - ) - ); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK( - hipcub::DeviceMergeSort::SortPairsCopy( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_values_input, d_keys_output, d_values_output, size, - compare_function, stream - ) - ); +template +void run_sort_pairs_benchmark(benchmark::State &state, hipStream_t stream, + size_t size) { + using key_type = Key; + using value_type = Value; + auto compare_function = [] __device__(const key_type &a, const key_type &b) { + return a < b; + }; + + auto keys_input = generate_keys(size); + std::vector values_input(size); + for (size_t i = 0; i < size; i++) { + values_input[i] = value_type(i); + } + + key_type *d_keys_input; + key_type *d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), + hipMemcpyHostToDevice)); + + value_type *d_values_input; + value_type *d_values_output; + HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), + size * sizeof(value_type), hipMemcpyHostToDevice)); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy( + d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_values_input, d_keys_output, d_values_output, size, compare_function, + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy( + d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_values_input, d_keys_output, d_values_output, size, compare_function, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy( + d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_values_input, d_keys_output, d_values_output, size, + compare_function, stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK( - hipcub::DeviceMergeSort::SortPairsCopy( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_values_input, d_keys_output, d_values_output, size, - compare_function, stream - ) - ); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed( - state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) - ); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); - HIP_CHECK(hipFree(d_values_input)); - HIP_CHECK(hipFree(d_values_output)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + (sizeof(key_type) + sizeof(value_type))); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_values_output)); } - -#define CREATE_SORT_KEYS_BENCHMARK(T) \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - std::string("device_merge_sort_sort_keys" \ - "." \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_sort_keys_benchmark(state, stream, size); \ - } \ - ) \ - ); \ - -#define CREATE_SORT_PAIRS_BENCHMARK(T, V) \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - std::string("device_merge_sort_sort_pairs<" \ - ",key_data_type:" #T \ - ",value_data_type:" #V ">." \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_sort_pairs_benchmark(state, stream, size); \ - } \ - ) \ - ); \ - - -void add_sort_keys_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - CREATE_SORT_KEYS_BENCHMARK(int) - CREATE_SORT_KEYS_BENCHMARK(long long) - CREATE_SORT_KEYS_BENCHMARK(int8_t) - CREATE_SORT_KEYS_BENCHMARK(uint8_t) - CREATE_SORT_KEYS_BENCHMARK(short) +#define CREATE_SORT_KEYS_BENCHMARK(T) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_merge_sort_sort_keys" \ + ".") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_keys_benchmark(state, stream, size); \ + })); + +#define CREATE_SORT_PAIRS_BENCHMARK(T, V) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_merge_sort_sort_pairs<" \ + ",key_data_type:" #T ",value_data_type:" #V ">.") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_pairs_benchmark(state, stream, size); \ + })); + +void add_sort_keys_benchmarks( + std::vector &benchmarks, + hipStream_t stream, size_t size) { + CREATE_SORT_KEYS_BENCHMARK(int) + CREATE_SORT_KEYS_BENCHMARK(long long) + CREATE_SORT_KEYS_BENCHMARK(int8_t) + CREATE_SORT_KEYS_BENCHMARK(uint8_t) + CREATE_SORT_KEYS_BENCHMARK(short) } -void add_sort_pairs_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - using custom_char_double = benchmark_utils::custom_type; - using custom_double_char = benchmark_utils::custom_type; - - CREATE_SORT_PAIRS_BENCHMARK(int, float) - CREATE_SORT_PAIRS_BENCHMARK(int, double) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_char_double) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_double_char) - - CREATE_SORT_PAIRS_BENCHMARK(long long, float) - CREATE_SORT_PAIRS_BENCHMARK(long long, double) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_char_double) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double_char) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) - - CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) - CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) +void add_sort_pairs_benchmarks( + std::vector &benchmarks, + hipStream_t stream, size_t size) { + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + using custom_char_double = benchmark_utils::custom_type; + using custom_double_char = benchmark_utils::custom_type; + + CREATE_SORT_PAIRS_BENCHMARK(int, float) + CREATE_SORT_PAIRS_BENCHMARK(int, double) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_char_double) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_double_char) + + CREATE_SORT_PAIRS_BENCHMARK(long long, float) + CREATE_SORT_PAIRS_BENCHMARK(long long, double) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_char_double) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double_char) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) + + CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) + CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_device_merge_sort" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_sort_keys_benchmarks(benchmarks, stream, size); - add_sort_pairs_benchmarks(benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_merge_sort" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_sort_keys_benchmarks(benchmarks, stream, size); + add_sort_pairs_benchmarks(benchmarks, stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_partition.cpp b/benchmark/benchmark_device_partition.cpp index f87d4181..5f40b712 100644 --- a/benchmark/benchmark_device_partition.cpp +++ b/benchmark/benchmark_device_partition.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -36,449 +36,339 @@ constexpr unsigned int batch_size = 10; constexpr unsigned int warmup_size = 5; namespace { -template -struct LessOp { - HIPCUB_HOST_DEVICE LessOp(const T& pivot) - : pivot_{pivot} - { - } +template struct LessOp { + HIPCUB_HOST_DEVICE LessOp(const T &pivot) : pivot_{pivot} {} + + HIPCUB_HOST_DEVICE bool operator()(const T &val) const { + return val < pivot_; + } - HIPCUB_HOST_DEVICE bool operator()(const T& val) const { - return val < pivot_; - } private: - T pivot_; + T pivot_; }; -} +} // namespace template -void run_flagged(benchmark::State& state, - const hipStream_t stream, - const T threshold, - const size_t size) -{ - const auto select_op = LessOp{threshold}; - const auto input = - benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); - - std::vector flags(size); - for(unsigned int i = 0; i < size; i++) { - flags[i] = static_cast(select_op(input[i])); - } - - T* d_input = nullptr; - F* d_flags = nullptr; - T* d_output = nullptr; - unsigned int* d_num_selected_output = nullptr; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flags, input.size() * sizeof(F))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); - - // Allocate temporary storage - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - HIP_CHECK( - hipcub::DevicePartition::Flagged( - nullptr, - temp_storage_bytes, - d_input, - d_flags, - d_output, - d_num_selected_output, - static_cast(input.size()), - stream - ) - ); - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); - - // Warm-up - HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(F), hipMemcpyHostToDevice)); - for(unsigned int i = 0; i < warmup_size; ++i) { - HIP_CHECK( - hipcub::DevicePartition::Flagged( - d_temp_storage, - temp_storage_bytes, - d_input, - d_flags, - d_output, - d_num_selected_output, - static_cast(input.size()), - stream - ) - ); +void run_flagged(benchmark::State &state, const hipStream_t stream, + const T threshold, const size_t size) { + const auto select_op = LessOp{threshold}; + const auto input = benchmark_utils::get_random_data( + size, static_cast(0), static_cast(100)); + + std::vector flags(size); + for (unsigned int i = 0; i < size; i++) { + flags[i] = static_cast(select_op(input[i])); + } + + T *d_input = nullptr; + F *d_flags = nullptr; + T *d_output = nullptr; + unsigned int *d_num_selected_output = nullptr; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_flags, input.size() * sizeof(F))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); + + // Allocate temporary storage + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + HIP_CHECK(hipcub::DevicePartition::Flagged( + nullptr, temp_storage_bytes, d_input, d_flags, d_output, + d_num_selected_output, static_cast(input.size()), stream)); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Warm-up + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(F), + hipMemcpyHostToDevice)); + for (unsigned int i = 0; i < warmup_size; ++i) { + HIP_CHECK(hipcub::DevicePartition::Flagged( + d_temp_storage, temp_storage_bytes, d_input, d_flags, d_output, + d_num_selected_output, static_cast(input.size()), stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // Run benchmark + for (auto _ : state) { + namespace chrono = std::chrono; + using clock = chrono::high_resolution_clock; + + const auto start = clock::now(); + for (unsigned int i = 0; i < batch_size; ++i) { + HIP_CHECK(hipcub::DevicePartition::Flagged( + d_temp_storage, temp_storage_bytes, d_input, d_flags, d_output, + d_num_selected_output, static_cast(input.size()), stream)); } HIP_CHECK(hipDeviceSynchronize()); - // Run benchmark - for(auto _ : state) { - namespace chrono = std::chrono; - using clock = chrono::high_resolution_clock; - - const auto start = clock::now(); - for (unsigned int i = 0; i < batch_size; ++i) { - HIP_CHECK( - hipcub::DevicePartition::Flagged( - d_temp_storage, - temp_storage_bytes, - d_input, - d_flags, - d_output, - d_num_selected_output, - static_cast(input.size()), - stream - ) - ); - } - HIP_CHECK(hipDeviceSynchronize()); - - const auto end = clock::now(); - using seconds_d = chrono::duration; - const auto elapsed_seconds = chrono::duration_cast(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } + const auto end = clock::now(); + using seconds_d = chrono::duration; + const auto elapsed_seconds = chrono::duration_cast(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } - state.SetItemsProcessed(state.iterations() * batch_size * input.size()); - state.SetBytesProcessed( - static_cast(state.iterations() * batch_size * input.size() * sizeof(input[0]))); + state.SetItemsProcessed(state.iterations() * batch_size * input.size()); + state.SetBytesProcessed(static_cast( + state.iterations() * batch_size * input.size() * sizeof(input[0]))); - HIP_CHECK(hipFree(d_temp_storage)); - HIP_CHECK(hipFree(d_num_selected_output)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_flags)); - HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_num_selected_output)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_flags)); + HIP_CHECK(hipFree(d_input)); } template -void run_predicate(benchmark::State& state, - const hipStream_t stream, - const T threshold, - const size_t size) -{ - const auto input = - benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); - - T* d_input = nullptr; - T* d_output = nullptr; - unsigned int* d_num_selected_output = nullptr; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); - - const auto select_op = LessOp{threshold}; - - // Allocate temporary storage - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - HIP_CHECK( - hipcub::DevicePartition::If( - nullptr, - temp_storage_bytes, - d_input, - d_output, - d_num_selected_output, - static_cast(input.size()), - select_op, - stream - ) - ); - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); - - // Warm-up - HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); - for(unsigned int i = 0; i < warmup_size; ++i) { - HIP_CHECK( - hipcub::DevicePartition::If( - d_temp_storage, - temp_storage_bytes, - d_input, - d_output, - d_num_selected_output, - static_cast(input.size()), - select_op, - stream - ) - ); +void run_predicate(benchmark::State &state, const hipStream_t stream, + const T threshold, const size_t size) { + const auto input = benchmark_utils::get_random_data( + size, static_cast(0), static_cast(100)); + + T *d_input = nullptr; + T *d_output = nullptr; + unsigned int *d_num_selected_output = nullptr; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); + + const auto select_op = LessOp{threshold}; + + // Allocate temporary storage + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + HIP_CHECK(hipcub::DevicePartition::If( + nullptr, temp_storage_bytes, d_input, d_output, d_num_selected_output, + static_cast(input.size()), select_op, stream)); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Warm-up + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), + hipMemcpyHostToDevice)); + for (unsigned int i = 0; i < warmup_size; ++i) { + HIP_CHECK(hipcub::DevicePartition::If( + d_temp_storage, temp_storage_bytes, d_input, d_output, + d_num_selected_output, static_cast(input.size()), select_op, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // Run benchmark + for (auto _ : state) { + namespace chrono = std::chrono; + using clock = chrono::high_resolution_clock; + + const auto start = clock::now(); + for (unsigned int i = 0; i < batch_size; ++i) { + HIP_CHECK(hipcub::DevicePartition::If( + d_temp_storage, temp_storage_bytes, d_input, d_output, + d_num_selected_output, static_cast(input.size()), select_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - // Run benchmark - for(auto _ : state) { - namespace chrono = std::chrono; - using clock = chrono::high_resolution_clock; - - const auto start = clock::now(); - for (unsigned int i = 0; i < batch_size; ++i) { - HIP_CHECK( - hipcub::DevicePartition::If( - d_temp_storage, - temp_storage_bytes, - d_input, - d_output, - d_num_selected_output, - static_cast(input.size()), - select_op, - stream - ) - ); - } - HIP_CHECK(hipDeviceSynchronize()); - - const auto end = clock::now(); - using seconds_d = chrono::duration; - const auto elapsed_seconds = chrono::duration_cast(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } + const auto end = clock::now(); + using seconds_d = chrono::duration; + const auto elapsed_seconds = chrono::duration_cast(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } - state.SetItemsProcessed(state.iterations() * batch_size * input.size()); - state.SetBytesProcessed( - static_cast(state.iterations() * batch_size * input.size() * sizeof(input[0]))); + state.SetItemsProcessed(state.iterations() * batch_size * input.size()); + state.SetBytesProcessed(static_cast( + state.iterations() * batch_size * input.size() * sizeof(input[0]))); - HIP_CHECK(hipFree(d_temp_storage)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_num_selected_output)); + HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_num_selected_output)); } template -void run_threeway(benchmark::State& state, - const hipStream_t stream, - const T small_threshold, - const T large_threshold, - const size_t size) -{ - const auto input = - benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); - - T* d_input = nullptr; - T* d_first_output = nullptr; - T* d_second_output = nullptr; - T* d_unselected_output = nullptr; - unsigned int* d_num_selected_output = nullptr; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_first_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_second_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_unselected_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_num_selected_output, 2 * sizeof(unsigned int))); - - const auto select_first_part_op = LessOp{small_threshold}; - const auto select_second_part_op = LessOp{large_threshold}; - - // Allocate temporary storage - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - HIP_CHECK( - hipcub::DevicePartition::If( - nullptr, - temp_storage_bytes, - d_input, - d_first_output, - d_second_output, - d_unselected_output, - d_num_selected_output, - static_cast(input.size()), - select_first_part_op, - select_second_part_op, - stream - ) - ); - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); - - // Warm-up - HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); - for(unsigned int i = 0; i < warmup_size; ++i) { - HIP_CHECK( - hipcub::DevicePartition::If( - d_temp_storage, - temp_storage_bytes, - d_input, - d_first_output, - d_second_output, - d_unselected_output, - d_num_selected_output, - static_cast(input.size()), - select_first_part_op, - select_second_part_op, - stream - ) - ); +void run_threeway(benchmark::State &state, const hipStream_t stream, + const T small_threshold, const T large_threshold, + const size_t size) { + const auto input = benchmark_utils::get_random_data( + size, static_cast(0), static_cast(100)); + + T *d_input = nullptr; + T *d_first_output = nullptr; + T *d_second_output = nullptr; + T *d_unselected_output = nullptr; + unsigned int *d_num_selected_output = nullptr; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_first_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_second_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_unselected_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_num_selected_output, 2 * sizeof(unsigned int))); + + const auto select_first_part_op = LessOp{small_threshold}; + const auto select_second_part_op = LessOp{large_threshold}; + + // Allocate temporary storage + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + HIP_CHECK(hipcub::DevicePartition::If( + nullptr, temp_storage_bytes, d_input, d_first_output, d_second_output, + d_unselected_output, d_num_selected_output, + static_cast(input.size()), select_first_part_op, + select_second_part_op, stream)); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Warm-up + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), + hipMemcpyHostToDevice)); + for (unsigned int i = 0; i < warmup_size; ++i) { + HIP_CHECK(hipcub::DevicePartition::If( + d_temp_storage, temp_storage_bytes, d_input, d_first_output, + d_second_output, d_unselected_output, d_num_selected_output, + static_cast(input.size()), select_first_part_op, + select_second_part_op, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // Run benchmark + for (auto _ : state) { + namespace chrono = std::chrono; + using clock = chrono::high_resolution_clock; + + const auto start = clock::now(); + for (unsigned int i = 0; i < batch_size; ++i) { + HIP_CHECK(hipcub::DevicePartition::If( + d_temp_storage, temp_storage_bytes, d_input, d_first_output, + d_second_output, d_unselected_output, d_num_selected_output, + static_cast(input.size()), select_first_part_op, + select_second_part_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); - // Run benchmark - for(auto _ : state) { - namespace chrono = std::chrono; - using clock = chrono::high_resolution_clock; - - const auto start = clock::now(); - for (unsigned int i = 0; i < batch_size; ++i) { - HIP_CHECK( - hipcub::DevicePartition::If( - d_temp_storage, - temp_storage_bytes, - d_input, - d_first_output, - d_second_output, - d_unselected_output, - d_num_selected_output, - static_cast(input.size()), - select_first_part_op, - select_second_part_op, - stream - ) - ); - } - HIP_CHECK(hipDeviceSynchronize()); - - const auto end = clock::now(); - using seconds_d = chrono::duration; - const auto elapsed_seconds = chrono::duration_cast(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } - - state.SetItemsProcessed(state.iterations() * batch_size * input.size()); - state.SetBytesProcessed( - static_cast(state.iterations() * batch_size * input.size() * sizeof(input[0]))); + const auto end = clock::now(); + using seconds_d = chrono::duration; + const auto elapsed_seconds = chrono::duration_cast(end - start); - HIP_CHECK(hipFree(d_temp_storage)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_first_output)); - HIP_CHECK(hipFree(d_second_output)); - HIP_CHECK(hipFree(d_unselected_output)); - HIP_CHECK(hipFree(d_num_selected_output)); -} + state.SetIterationTime(elapsed_seconds.count()); + } -#define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T) \ - benchmark::RegisterBenchmark( \ - std::string("device_parition_flagged.(split_threshold:" #SPLIT_T \ - "%)" \ - ).c_str(), \ - &run_flagged, \ - stream, \ - static_cast(SPLIT_T), size \ - ) - -#define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T) \ - benchmark::RegisterBenchmark( \ - std::string("device_parition_predicate.(split_threshold:" #SPLIT_T \ - "%)" \ - ).c_str(), \ - &run_predicate, \ - stream, \ - static_cast(SPLIT_T), size \ - ) - -#define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T) \ - benchmark::RegisterBenchmark( \ - std::string("device_parition_three_way" \ - ".(small_threshold:" #SMALL_T \ - "%,large_threshold:" #LARGE_T \ - "%)" \ - ).c_str(), \ - &run_threeway, \ - stream, \ - static_cast(SMALL_T), \ - static_cast(LARGE_T), size \ - ) - -#define BENCHMARK_FLAGGED_TYPE(type, flag_type) \ - CREATE_BENCHMARK_FLAGGED(type, flag_type, 33), \ - CREATE_BENCHMARK_FLAGGED(type, flag_type, 50), \ - CREATE_BENCHMARK_FLAGGED(type, flag_type, 60), \ - CREATE_BENCHMARK_FLAGGED(type, flag_type, 90) - -#define BENCHMARK_PREDICATE_TYPE(type) \ - CREATE_BENCHMARK_PREDICATE(type, 33), \ - CREATE_BENCHMARK_PREDICATE(type, 50), \ - CREATE_BENCHMARK_PREDICATE(type, 60), \ - CREATE_BENCHMARK_PREDICATE(type, 90) - -#define BENCHMARK_THREEWAY_TYPE(type) \ - CREATE_BENCHMARK_THREEWAY(type, 33, 66), \ - CREATE_BENCHMARK_THREEWAY(type, 10, 66), \ - CREATE_BENCHMARK_THREEWAY(type, 50, 60), \ - CREATE_BENCHMARK_THREEWAY(type, 50, 90) - -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_partition" << std::endl; - - // HIP - const hipStream_t stream = 0; // default - { - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - } + state.SetItemsProcessed(state.iterations() * batch_size * input.size()); + state.SetBytesProcessed(static_cast( + state.iterations() * batch_size * input.size() * sizeof(input[0]))); - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - // Add benchmarks - std::vector benchmarks = - { - BENCHMARK_FLAGGED_TYPE(int8_t, unsigned char), - BENCHMARK_FLAGGED_TYPE(int, unsigned char), - BENCHMARK_FLAGGED_TYPE(float, unsigned char), - BENCHMARK_FLAGGED_TYPE(long long, uint8_t), - BENCHMARK_FLAGGED_TYPE(double, int8_t), - BENCHMARK_FLAGGED_TYPE(custom_float2, int8_t), - BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), - - BENCHMARK_PREDICATE_TYPE(int8_t), - BENCHMARK_PREDICATE_TYPE(int), - BENCHMARK_PREDICATE_TYPE(float), - BENCHMARK_PREDICATE_TYPE(long long), - BENCHMARK_PREDICATE_TYPE(double), - BENCHMARK_PREDICATE_TYPE(custom_float2), - BENCHMARK_PREDICATE_TYPE(custom_double2), - - BENCHMARK_THREEWAY_TYPE(int8_t), - BENCHMARK_THREEWAY_TYPE(int), - BENCHMARK_THREEWAY_TYPE(float), - BENCHMARK_THREEWAY_TYPE(long long), - BENCHMARK_THREEWAY_TYPE(double), - BENCHMARK_THREEWAY_TYPE(custom_float2), - BENCHMARK_THREEWAY_TYPE(custom_double2), - }; - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } + HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_first_output)); + HIP_CHECK(hipFree(d_second_output)); + HIP_CHECK(hipFree(d_unselected_output)); + HIP_CHECK(hipFree(d_num_selected_output)); +} - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +#define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T) \ + benchmark::RegisterBenchmark( \ + std::string("device_parition_flagged.(split_threshold:" #SPLIT_T "%)") \ + .c_str(), \ + &run_flagged, stream, static_cast(SPLIT_T), size) + +#define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T) \ + benchmark::RegisterBenchmark( \ + std::string("device_parition_predicate.(split_threshold:" #SPLIT_T "%)") \ + .c_str(), \ + &run_predicate, stream, static_cast(SPLIT_T), size) + +#define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T) \ + benchmark::RegisterBenchmark( \ + std::string("device_parition_three_way" \ + ".(small_threshold:" #SMALL_T \ + "%,large_threshold:" #LARGE_T "%)") \ + .c_str(), \ + &run_threeway, stream, static_cast(SMALL_T), \ + static_cast(LARGE_T), size) + +#define BENCHMARK_FLAGGED_TYPE(type, flag_type) \ + CREATE_BENCHMARK_FLAGGED(type, flag_type, 33), \ + CREATE_BENCHMARK_FLAGGED(type, flag_type, 50), \ + CREATE_BENCHMARK_FLAGGED(type, flag_type, 60), \ + CREATE_BENCHMARK_FLAGGED(type, flag_type, 90) + +#define BENCHMARK_PREDICATE_TYPE(type) \ + CREATE_BENCHMARK_PREDICATE(type, 33), CREATE_BENCHMARK_PREDICATE(type, 50), \ + CREATE_BENCHMARK_PREDICATE(type, 60), \ + CREATE_BENCHMARK_PREDICATE(type, 90) + +#define BENCHMARK_THREEWAY_TYPE(type) \ + CREATE_BENCHMARK_THREEWAY(type, 33, 66), \ + CREATE_BENCHMARK_THREEWAY(type, 10, 66), \ + CREATE_BENCHMARK_THREEWAY(type, 50, 60), \ + CREATE_BENCHMARK_THREEWAY(type, 50, 90) + +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_partition" << std::endl; + + // HIP + const hipStream_t stream = 0; // default + { + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + } + + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + // Add benchmarks + std::vector benchmarks = { + BENCHMARK_FLAGGED_TYPE(int8_t, unsigned char), + BENCHMARK_FLAGGED_TYPE(int, unsigned char), + BENCHMARK_FLAGGED_TYPE(float, unsigned char), + BENCHMARK_FLAGGED_TYPE(long long, uint8_t), + BENCHMARK_FLAGGED_TYPE(double, int8_t), + BENCHMARK_FLAGGED_TYPE(custom_float2, int8_t), + BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), + + BENCHMARK_PREDICATE_TYPE(int8_t), + BENCHMARK_PREDICATE_TYPE(int), + BENCHMARK_PREDICATE_TYPE(float), + BENCHMARK_PREDICATE_TYPE(long long), + BENCHMARK_PREDICATE_TYPE(double), + BENCHMARK_PREDICATE_TYPE(custom_float2), + BENCHMARK_PREDICATE_TYPE(custom_double2), + + BENCHMARK_THREEWAY_TYPE(int8_t), + BENCHMARK_THREEWAY_TYPE(int), + BENCHMARK_THREEWAY_TYPE(float), + BENCHMARK_THREEWAY_TYPE(long long), + BENCHMARK_THREEWAY_TYPE(double), + BENCHMARK_THREEWAY_TYPE(custom_float2), + BENCHMARK_THREEWAY_TYPE(custom_double2), + }; + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp index 8f646b31..776fc6a4 100644 --- a/benchmark/benchmark_device_radix_sort.cpp +++ b/benchmark/benchmark_device_radix_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -28,7 +28,6 @@ // HIP API #include "hipcub/device/device_radix_sort.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif @@ -36,506 +35,363 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template -std::vector generate_keys(size_t size) -{ - using key_type = Key; - - if(std::is_floating_point::value) - { - return benchmark_utils::get_random_data(size, (key_type)-1000, (key_type)+1000, size); - } - else - { - return benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max(), - size - ); - } +template std::vector generate_keys(size_t size) { + using key_type = Key; + + if (std::is_floating_point::value) { + return benchmark_utils::get_random_data(size, (key_type)-1000, + (key_type) + 1000, size); + } else { + return benchmark_utils::get_random_data( + size, std::numeric_limits::min(), + std::numeric_limits::max(), size); + } } -template -auto invoke_sort_keys(void* d_temp_storage, - size_t& temp_storage_bytes, - Key* d_keys_input, - Key* d_keys_output, - size_t size, +template +auto invoke_sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, + Key *d_keys_input, Key *d_keys_output, size_t size, hipStream_t stream) - -> std::enable_if_t::value, hipError_t> -{ - return hipcub::DeviceRadixSort::SortKeys(d_temp_storage, - temp_storage_bytes, - d_keys_input, - d_keys_output, - size, - 0, - sizeof(Key) * 8, - stream); + -> std::enable_if_t::value, + hipError_t> { + return hipcub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, + d_keys_input, d_keys_output, size, 0, + sizeof(Key) * 8, stream); } -template -auto invoke_sort_keys(void* d_temp_storage, - size_t& temp_storage_bytes, - Key* d_keys_input, - Key* d_keys_output, - size_t size, +template +auto invoke_sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, + Key *d_keys_input, Key *d_keys_output, size_t size, hipStream_t stream) - -> std::enable_if_t::value, hipError_t> -{ - return hipcub::DeviceRadixSort::SortKeysDescending(d_temp_storage, - temp_storage_bytes, - d_keys_input, - d_keys_output, - size, - 0, - sizeof(Key) * 8, - stream); + -> std::enable_if_t::value, + hipError_t> { + return hipcub::DeviceRadixSort::SortKeysDescending( + d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, size, 0, + sizeof(Key) * 8, stream); } -template -auto invoke_sort_keys(void* d_temp_storage, - size_t& temp_storage_bytes, - Key* d_keys_input, - Key* d_keys_output, - size_t size, +template +auto invoke_sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, + Key *d_keys_input, Key *d_keys_output, size_t size, hipStream_t stream) - -> std::enable_if_t::value, hipError_t> -{ - return hipcub::DeviceRadixSort::SortKeys(d_temp_storage, - temp_storage_bytes, - d_keys_input, - d_keys_output, - size, - benchmark_utils::custom_type_decomposer{}, - stream); + -> std::enable_if_t::value, + hipError_t> { + return hipcub::DeviceRadixSort::SortKeys( + d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, size, + benchmark_utils::custom_type_decomposer{}, stream); } -template -auto invoke_sort_keys(void* d_temp_storage, - size_t& temp_storage_bytes, - Key* d_keys_input, - Key* d_keys_output, - size_t size, +template +auto invoke_sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, + Key *d_keys_input, Key *d_keys_output, size_t size, hipStream_t stream) - -> std::enable_if_t::value, hipError_t> -{ - return hipcub::DeviceRadixSort::SortKeysDescending( - d_temp_storage, - temp_storage_bytes, - d_keys_input, - d_keys_output, - size, - benchmark_utils::custom_type_decomposer{}, - stream); + -> std::enable_if_t< + Descending && benchmark_utils::is_custom_type::value, hipError_t> { + return hipcub::DeviceRadixSort::SortKeysDescending( + d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, size, + benchmark_utils::custom_type_decomposer{}, stream); } -template -void run_sort_keys_benchmark(benchmark::State& state, - hipStream_t stream, +template +void run_sort_keys_benchmark(benchmark::State &state, hipStream_t stream, size_t size, - std::shared_ptr> keys_input) -{ - using key_type = Key; - key_type * d_keys_input; - key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input->data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(invoke_sort_keys(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - size, - stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(invoke_sort_keys(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - size, - stream)); + std::shared_ptr> keys_input) { + using key_type = Key; + key_type *d_keys_input; + key_type *d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_keys_input, keys_input->data(), size * sizeof(key_type), + hipMemcpyHostToDevice)); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(invoke_sort_keys(d_temporary_storage, + temporary_storage_bytes, d_keys_input, + d_keys_output, size, stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(invoke_sort_keys( + d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_keys_output, size, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(invoke_sort_keys( + d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_keys_output, size, stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(invoke_sort_keys(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - size, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + sizeof(key_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); } -template -auto invoke_sort_pairs(void* d_temp_storage, - size_t& temp_storage_bytes, - Key* d_keys_input, - Key* d_keys_output, - Value* d_values_input, - Value* d_values_output, - size_t size, - hipStream_t stream) - -> std::enable_if_t::value, hipError_t> -{ - return hipcub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - 0, - sizeof(Key) * 8, - stream); +template +auto invoke_sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, + Key *d_keys_input, Key *d_keys_output, + Value *d_values_input, Value *d_values_output, + size_t size, hipStream_t stream) + -> std::enable_if_t::value, + hipError_t> { + return hipcub::DeviceRadixSort::SortPairs( + d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, + d_values_input, d_values_output, size, 0, sizeof(Key) * 8, stream); } -template -auto invoke_sort_pairs(void* d_temp_storage, - size_t& temp_storage_bytes, - Key* d_keys_input, - Key* d_keys_output, - Value* d_values_input, - Value* d_values_output, - size_t size, - hipStream_t stream) - -> std::enable_if_t::value, hipError_t> -{ - return hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - 0, - sizeof(Key) * 8, - stream); +template +auto invoke_sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, + Key *d_keys_input, Key *d_keys_output, + Value *d_values_input, Value *d_values_output, + size_t size, hipStream_t stream) + -> std::enable_if_t::value, + hipError_t> { + return hipcub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, + d_values_input, d_values_output, size, 0, sizeof(Key) * 8, stream); } -template -auto invoke_sort_pairs(void* d_temp_storage, - size_t& temp_storage_bytes, - Key* d_keys_input, - Key* d_keys_output, - Value* d_values_input, - Value* d_values_output, - size_t size, - hipStream_t stream) - -> std::enable_if_t::value, hipError_t> -{ - return hipcub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - benchmark_utils::custom_type_decomposer{}, - stream); +template +auto invoke_sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, + Key *d_keys_input, Key *d_keys_output, + Value *d_values_input, Value *d_values_output, + size_t size, hipStream_t stream) + -> std::enable_if_t::value, + hipError_t> { + return hipcub::DeviceRadixSort::SortPairs( + d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, + d_values_input, d_values_output, size, + benchmark_utils::custom_type_decomposer{}, stream); } -template -auto invoke_sort_pairs(void* d_temp_storage, - size_t& temp_storage_bytes, - Key* d_keys_input, - Key* d_keys_output, - Value* d_values_input, - Value* d_values_output, - size_t size, - hipStream_t stream) - -> std::enable_if_t::value, hipError_t> -{ - return hipcub::DeviceRadixSort::SortPairsDescending( - d_temp_storage, - temp_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - benchmark_utils::custom_type_decomposer{}, - stream); +template +auto invoke_sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, + Key *d_keys_input, Key *d_keys_output, + Value *d_values_input, Value *d_values_output, + size_t size, hipStream_t stream) + -> std::enable_if_t< + Descending && benchmark_utils::is_custom_type::value, hipError_t> { + return hipcub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, + d_values_input, d_values_output, size, + benchmark_utils::custom_type_decomposer{}, stream); } -template -void run_sort_pairs_benchmark(benchmark::State& state, - hipStream_t stream, +template +void run_sort_pairs_benchmark(benchmark::State &state, hipStream_t stream, size_t size, - std::shared_ptr> keys_input) -{ - using key_type = Key; - using value_type = Value; - std::vector values_input(size); - for(size_t i = 0; i < size; i++) - { - values_input[i] = value_type(i); - } - - key_type * d_keys_input; - key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input->data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(invoke_sort_pairs(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(invoke_sort_pairs(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - stream)); + std::shared_ptr> keys_input) { + using key_type = Key; + using value_type = Value; + std::vector values_input(size); + for (size_t i = 0; i < size; i++) { + values_input[i] = value_type(i); + } + + key_type *d_keys_input; + key_type *d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_keys_input, keys_input->data(), size * sizeof(key_type), + hipMemcpyHostToDevice)); + + value_type *d_values_input; + value_type *d_values_output; + HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), + size * sizeof(value_type), hipMemcpyHostToDevice)); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(invoke_sort_pairs( + d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, + d_values_input, d_values_output, size, stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(invoke_sort_pairs( + d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_keys_output, d_values_input, d_values_output, size, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(invoke_sort_pairs( + d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_keys_output, d_values_input, d_values_output, size, stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(invoke_sort_pairs(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed( - state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) - ); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); - HIP_CHECK(hipFree(d_values_input)); - HIP_CHECK(hipFree(d_values_output)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + (sizeof(key_type) + sizeof(value_type))); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_values_output)); } - -#define CREATE_SORT_KEYS_BENCHMARK(Key){ \ - auto keys_input = std::make_shared>(generate_keys(size)); \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - std::string("device_radix_sort_keys_ascending" \ - "." \ - ).c_str(), \ - [=](benchmark::State& state) { \ - run_sort_keys_benchmark(state, stream, size, keys_input); \ - } \ - ) \ - ); \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - std::string("device_radix_sort_keys_descending" \ - "." \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_sort_keys_benchmark(state, stream, size, keys_input); \ - } \ - ) \ - ); \ -} - - -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value){ \ - auto keys_input = std::make_shared>(generate_keys(size)); \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - std::string("device_radix_sort_pairs_ascending" \ - "." \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_sort_pairs_benchmark(state, stream, size, keys_input); \ - } \ - ) \ - ); \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - std::string("device_radix_sort_pairs_descending" \ - "." \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_sort_pairs_benchmark(state, stream, size, keys_input); \ - } \ - ) \ - ); \ +#define CREATE_SORT_KEYS_BENCHMARK(Key) \ + { \ + auto keys_input = \ + std::make_shared>(generate_keys(size)); \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_radix_sort_keys_ascending" \ + ".") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_keys_benchmark(state, stream, size, keys_input); \ + })); \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_radix_sort_keys_descending" \ + ".") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_keys_benchmark(state, stream, size, keys_input); \ + })); \ + } + +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ + { \ + auto keys_input = \ + std::make_shared>(generate_keys(size)); \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_radix_sort_pairs_ascending" \ + ".") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_pairs_benchmark(state, stream, size, \ + keys_input); \ + })); \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_radix_sort_pairs_descending" \ + ".") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_pairs_benchmark(state, stream, size, \ + keys_input); \ + })); \ + } + +void add_sort_keys_benchmarks( + std::vector &benchmarks, + hipStream_t stream, size_t size) { + using custom_int_t = benchmark_utils::custom_type; + CREATE_SORT_KEYS_BENCHMARK(int) + CREATE_SORT_KEYS_BENCHMARK(long long) + CREATE_SORT_KEYS_BENCHMARK(int8_t) + CREATE_SORT_KEYS_BENCHMARK(uint8_t) + CREATE_SORT_KEYS_BENCHMARK(short) + CREATE_SORT_KEYS_BENCHMARK(custom_int_t) } -void add_sort_keys_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_int_t = benchmark_utils::custom_type; - CREATE_SORT_KEYS_BENCHMARK(int) - CREATE_SORT_KEYS_BENCHMARK(long long) - CREATE_SORT_KEYS_BENCHMARK(int8_t) - CREATE_SORT_KEYS_BENCHMARK(uint8_t) - CREATE_SORT_KEYS_BENCHMARK(short) - CREATE_SORT_KEYS_BENCHMARK(custom_int_t) +void add_sort_pairs_benchmarks( + std::vector &benchmarks, + hipStream_t stream, size_t size) { + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + using custom_char_double = benchmark_utils::custom_type; + using custom_double_char = benchmark_utils::custom_type; + using custom_int_t = benchmark_utils::custom_type; + + CREATE_SORT_PAIRS_BENCHMARK(int, float) + CREATE_SORT_PAIRS_BENCHMARK(int, double) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_char_double) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_double_char) + + CREATE_SORT_PAIRS_BENCHMARK(long long, float) + CREATE_SORT_PAIRS_BENCHMARK(long long, double) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_char_double) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double_char) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) + + CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) + CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) + + CREATE_SORT_PAIRS_BENCHMARK(custom_int_t, float) } -void add_sort_pairs_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - using custom_char_double = benchmark_utils::custom_type; - using custom_double_char = benchmark_utils::custom_type; - using custom_int_t = benchmark_utils::custom_type; - - CREATE_SORT_PAIRS_BENCHMARK(int, float) - CREATE_SORT_PAIRS_BENCHMARK(int, double) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_char_double) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_double_char) - - CREATE_SORT_PAIRS_BENCHMARK(long long, float) - CREATE_SORT_PAIRS_BENCHMARK(long long, double) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_char_double) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double_char) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) - - CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) - CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) - - CREATE_SORT_PAIRS_BENCHMARK(custom_int_t, float) -} - -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_radix_sort" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_sort_keys_benchmarks(benchmarks, stream, size); - add_sort_pairs_benchmarks(benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_radix_sort" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_sort_keys_benchmarks(benchmarks, stream, size); + add_sort_pairs_benchmarks(benchmarks, stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp index efc8f9b0..89c8aa55 100644 --- a/benchmark/benchmark_device_reduce.cpp +++ b/benchmark/benchmark_device_reduce.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,7 +25,6 @@ // HIP API #include "hipcub/device/device_reduce.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif @@ -33,181 +32,153 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template< - class T, - class OutputT, - class ReduceKernel -> -void run_benchmark(benchmark::State& state, - size_t size, - const hipStream_t stream, - ReduceKernel reduce) -{ - std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); - - T * d_input; - OutputT * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, sizeof(OutputT))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK(hipDeviceSynchronize()); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes = 0; - void * d_temp_storage = nullptr; - // Get size of d_temp_storage - HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream)); - HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK( - reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream)); - } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); +template +void run_benchmark(benchmark::State &state, size_t size, + const hipStream_t stream, ReduceKernel reduce) { + std::vector input = + benchmark_utils::get_random_data(size, T(0), T(1000)); + + T *d_input; + OutputT *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, sizeof(OutputT))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes = 0; + void *d_temp_storage = nullptr; + // Get size of d_temp_storage + HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, + size, stream)); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, + size, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, + d_output, size, stream)); } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temp_storage)); } -template -struct Benchmark; +template struct Benchmark; -template -struct Benchmark { - static void run(benchmark::State& state, size_t size, const hipStream_t stream) - { - hipError_t (*ptr_to_sum)(void*, size_t&, T*, T*, int, hipStream_t) - = &hipcub::DeviceReduce::Sum; - run_benchmark(state, size, stream, ptr_to_sum); - } +template struct Benchmark { + static void run(benchmark::State &state, size_t size, + const hipStream_t stream) { + hipError_t (*ptr_to_sum)(void *, size_t &, T *, T *, int, hipStream_t) = + &hipcub::DeviceReduce::Sum; + run_benchmark(state, size, stream, ptr_to_sum); + } }; -template -struct Benchmark { - static void run(benchmark::State& state, size_t size, const hipStream_t stream) - { - hipError_t (*ptr_to_min)(void*, size_t&, T*, T*, int, hipStream_t) - = &hipcub::DeviceReduce::Min; - run_benchmark(state, size, stream, ptr_to_min); - } +template struct Benchmark { + static void run(benchmark::State &state, size_t size, + const hipStream_t stream) { + hipError_t (*ptr_to_min)(void *, size_t &, T *, T *, int, hipStream_t) = + &hipcub::DeviceReduce::Min; + run_benchmark(state, size, stream, ptr_to_min); + } }; -template -struct Benchmark { - using Difference = int; - using Iterator = typename hipcub::ArgIndexInputIterator; - using KeyValue = typename Iterator::value_type; - - static void run(benchmark::State& state, size_t size, const hipStream_t stream) - { - hipError_t (*ptr_to_argmin)(void*, size_t&, T*, KeyValue*, int, hipStream_t) - = &hipcub::DeviceReduce::ArgMin; - run_benchmark(state, size, stream, ptr_to_argmin); - } +template struct Benchmark { + using Difference = int; + using Iterator = typename hipcub::ArgIndexInputIterator; + using KeyValue = typename Iterator::value_type; + + static void run(benchmark::State &state, size_t size, + const hipStream_t stream) { + hipError_t (*ptr_to_argmin)(void *, size_t &, T *, KeyValue *, int, + hipStream_t) = &hipcub::DeviceReduce::ArgMin; + run_benchmark(state, size, stream, ptr_to_argmin); + } }; -#define CREATE_BENCHMARK(T, REDUCE_OP) \ - benchmark::RegisterBenchmark( \ - std::string("device_reduce" \ - "." \ - ).c_str(), \ - &Benchmark::run, \ - size, \ - stream \ -) - -#define CREATE_BENCHMARKS(REDUCE_OP) \ - CREATE_BENCHMARK(int, REDUCE_OP), \ - CREATE_BENCHMARK(long long, REDUCE_OP), \ - CREATE_BENCHMARK(float, REDUCE_OP), \ - CREATE_BENCHMARK(double, REDUCE_OP), \ - CREATE_BENCHMARK(int8_t, REDUCE_OP) - -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_reduce" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - using custom_double2 = benchmark_utils::custom_type; - - // Add benchmarks - std::vector benchmarks = - { - CREATE_BENCHMARKS(hipcub::Sum), - CREATE_BENCHMARK(custom_double2, hipcub::Sum), - CREATE_BENCHMARKS(hipcub::Min), - #ifdef HIPCUB_ROCPRIM_API - CREATE_BENCHMARK(custom_double2, hipcub::Min), - #endif - CREATE_BENCHMARKS(hipcub::ArgMin), - #ifdef HIPCUB_ROCPRIM_API - CREATE_BENCHMARK(custom_double2, hipcub::ArgMin), - #endif - }; - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +#define CREATE_BENCHMARK(T, REDUCE_OP) \ + benchmark::RegisterBenchmark(std::string("device_reduce" \ + ".") \ + .c_str(), \ + &Benchmark::run, size, stream) + +#define CREATE_BENCHMARKS(REDUCE_OP) \ + CREATE_BENCHMARK(int, REDUCE_OP), CREATE_BENCHMARK(long long, REDUCE_OP), \ + CREATE_BENCHMARK(float, REDUCE_OP), CREATE_BENCHMARK(double, REDUCE_OP), \ + CREATE_BENCHMARK(int8_t, REDUCE_OP) + +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_reduce" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + using custom_double2 = benchmark_utils::custom_type; + + // Add benchmarks + std::vector benchmarks = { + CREATE_BENCHMARKS(hipcub::Sum), + CREATE_BENCHMARK(custom_double2, hipcub::Sum), + CREATE_BENCHMARKS(hipcub::Min), +#ifdef HIPCUB_ROCPRIM_API + CREATE_BENCHMARK(custom_double2, hipcub::Min), +#endif + CREATE_BENCHMARKS(hipcub::ArgMin), +#ifdef HIPCUB_ROCPRIM_API + CREATE_BENCHMARK(custom_double2, hipcub::ArgMin), +#endif + }; + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); - return 0; + return 0; } diff --git a/benchmark/benchmark_device_reduce_by_key.cpp b/benchmark/benchmark_device_reduce_by_key.cpp index 8a3fbdd8..77fbd7ff 100644 --- a/benchmark/benchmark_device_reduce_by_key.cpp +++ b/benchmark/benchmark_device_reduce_by_key.cpp @@ -2,15 +2,15 @@ // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // - // Permission is hereby granted, free of charge, to any person obtaining a copy +// Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -20,10 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -// CUB's implementation of single_pass_scan_operators has maybe uninitialized parameters, -// disable the warning because all warnings are threated as errors: +// CUB's implementation of single_pass_scan_operators has maybe uninitialized +// parameters, disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ - #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif #include "common_benchmark_header.hpp" @@ -38,216 +38,174 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template -void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size, BinaryFunction reduce_op) -{ - using key_type = Key; - using value_type = Value; - - // Generate data - std::vector keys_input(size); - - unsigned int unique_count = 0; - std::vector key_counts = benchmark_utils::get_random_data(100000, 1, max_length); - size_t offset = 0; - while(offset < size) - { - const size_t key_count = key_counts[unique_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); - for(size_t i = offset; i < end; i++) - { - keys_input[i] = unique_count; - } - - unique_count++; - offset += key_count; +template +void run_benchmark(benchmark::State &state, size_t max_length, + hipStream_t stream, size_t size, BinaryFunction reduce_op) { + using key_type = Key; + using value_type = Value; + + // Generate data + std::vector keys_input(size); + + unsigned int unique_count = 0; + std::vector key_counts = + benchmark_utils::get_random_data(100000, 1, max_length); + size_t offset = 0; + while (offset < size) { + const size_t key_count = key_counts[unique_count % key_counts.size()]; + const size_t end = std::min(size, offset + key_count); + for (size_t i = offset; i < end; i++) { + keys_input[i] = unique_count; } - std::vector values_input(size); - std::iota(values_input.begin(), values_input.end(), 0); - - key_type * d_keys_input; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_unique_output; - value_type * d_aggregates_output; - unsigned int * d_unique_count_output; - HIP_CHECK(hipMalloc(&d_unique_output, unique_count * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_aggregates_output, unique_count * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(unsigned int))); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - - HIP_CHECK( - hipcub::DeviceReduce::ReduceByKey( - nullptr, temporary_storage_bytes, - d_keys_input, d_unique_output, d_values_input, - d_aggregates_output, - d_unique_count_output, - reduce_op, size, - stream - ) - ); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK( - hipcub::DeviceReduce::ReduceByKey( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, - d_unique_output, d_values_input, d_aggregates_output, - d_unique_count_output, - reduce_op, size, - stream - ) - ); + unique_count++; + offset += key_count; + } + + std::vector values_input(size); + std::iota(values_input.begin(), values_input.end(), 0); + + key_type *d_keys_input; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), + hipMemcpyHostToDevice)); + + value_type *d_values_input; + HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), + size * sizeof(value_type), hipMemcpyHostToDevice)); + + key_type *d_unique_output; + value_type *d_aggregates_output; + unsigned int *d_unique_count_output; + HIP_CHECK(hipMalloc(&d_unique_output, unique_count * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_aggregates_output, unique_count * sizeof(value_type))); + HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(unsigned int))); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + + HIP_CHECK(hipcub::DeviceReduce::ReduceByKey( + nullptr, temporary_storage_bytes, d_keys_input, d_unique_output, + d_values_input, d_aggregates_output, d_unique_count_output, reduce_op, + size, stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(hipcub::DeviceReduce::ReduceByKey( + d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_unique_output, d_values_input, d_aggregates_output, + d_unique_count_output, reduce_op, size, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(hipcub::DeviceReduce::ReduceByKey( + d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_unique_output, d_values_input, d_aggregates_output, + d_unique_count_output, reduce_op, size, stream)); } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK( - hipcub::DeviceReduce::ReduceByKey( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, - d_unique_output, d_values_input, d_aggregates_output, - d_unique_count_output, - reduce_op, size, - stream - ) - ); - } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_values_input)); - HIP_CHECK(hipFree(d_unique_output)); - HIP_CHECK(hipFree(d_aggregates_output)); - HIP_CHECK(hipFree(d_unique_count_output)); + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + (sizeof(key_type) + sizeof(value_type))); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_unique_output)); + HIP_CHECK(hipFree(d_aggregates_output)); + HIP_CHECK(hipFree(d_unique_count_output)); } -#define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \ -benchmark::RegisterBenchmark( \ - std::string("device_reduce_by_key" \ - "." \ - "(random_number_range:[1, " \ - + std::to_string(max_length) \ - + "])" \ - ).c_str(), \ - &run_benchmark, \ - max_length, stream, size, REDUCE_OP() \ -) - -#define CREATE_BENCHMARKS(REDUCE_OP) \ - CREATE_BENCHMARK(int, float, REDUCE_OP), \ - CREATE_BENCHMARK(int, double, REDUCE_OP), \ - CREATE_BENCHMARK(int, custom_double2, REDUCE_OP), \ - CREATE_BENCHMARK(int8_t, int8_t, REDUCE_OP), \ - CREATE_BENCHMARK(long long, float, REDUCE_OP), \ - CREATE_BENCHMARK(long long, double, REDUCE_OP) +#define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \ + benchmark::RegisterBenchmark(std::string("device_reduce_by_key" \ + "." \ + "(random_number_range:[1, " + \ + std::to_string(max_length) + "])") \ + .c_str(), \ + &run_benchmark, \ + max_length, stream, size, REDUCE_OP()) + +#define CREATE_BENCHMARKS(REDUCE_OP) \ + CREATE_BENCHMARK(int, float, REDUCE_OP), \ + CREATE_BENCHMARK(int, double, REDUCE_OP), \ + CREATE_BENCHMARK(int, custom_double2, REDUCE_OP), \ + CREATE_BENCHMARK(int8_t, int8_t, REDUCE_OP), \ + CREATE_BENCHMARK(long long, float, REDUCE_OP), \ + CREATE_BENCHMARK(long long, double, REDUCE_OP) void add_benchmarks(size_t max_length, - std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = - { - CREATE_BENCHMARKS(hipcub::Sum), - CREATE_BENCHMARK(long long, custom_double2, hipcub::Sum), - CREATE_BENCHMARKS(hipcub::Min), - #ifdef HIPCUB_ROCPRIM_API - CREATE_BENCHMARK(long long, custom_double2, hipcub::Min), - #endif - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); -} + std::vector &benchmarks, + hipStream_t stream, size_t size) { + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + CREATE_BENCHMARKS(hipcub::Sum), + CREATE_BENCHMARK(long long, custom_double2, hipcub::Sum), + CREATE_BENCHMARKS(hipcub::Min), +#ifdef HIPCUB_ROCPRIM_API + CREATE_BENCHMARK(long long, custom_double2, hipcub::Min), +#endif + }; -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_reduce_by_key" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks(1000, benchmarks, stream, size); - add_benchmarks(10, benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +} - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_reduce_by_key" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks(1000, benchmarks, stream, size); + add_benchmarks(10, benchmarks, stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp index dc785f72..ffb41a53 100644 --- a/benchmark/benchmark_device_run_length_encode.cpp +++ b/benchmark/benchmark_device_run_length_encode.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -23,7 +23,7 @@ // CUB's implementation of DeviceRunLengthEncode has unused parameters, // disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ - #pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wunused-parameter" #endif #include "common_benchmark_header.hpp" @@ -31,336 +31,275 @@ // HIP API #include "hipcub/device/device_run_length_encode.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size) -{ - using key_type = T; - using count_type = unsigned int; - - // Generate data - std::vector input(size); - - unsigned int runs_count = 0; - std::vector key_counts = benchmark_utils::get_random_data(100000, 1, max_length); - size_t offset = 0; - while(offset < size) - { - const size_t key_count = key_counts[runs_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); - for(size_t i = offset; i < end; i++) - { - input[i] = runs_count; - } - - runs_count++; - offset += key_count; +template +void run_encode_benchmark(benchmark::State &state, size_t max_length, + hipStream_t stream, size_t size) { + using key_type = T; + using count_type = unsigned int; + + // Generate data + std::vector input(size); + + unsigned int runs_count = 0; + std::vector key_counts = + benchmark_utils::get_random_data(100000, 1, max_length); + size_t offset = 0; + while (offset < size) { + const size_t key_count = key_counts[runs_count % key_counts.size()]; + const size_t end = std::min(size, offset + key_count); + for (size_t i = offset; i < end; i++) { + input[i] = runs_count; } - key_type * d_input; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_unique_output; - count_type * d_counts_output; - count_type * d_runs_count_output; - HIP_CHECK(hipMalloc(&d_unique_output, runs_count * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); - HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - - HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode(nullptr, - temporary_storage_bytes, - d_input, - d_unique_output, - d_counts_output, - d_runs_count_output, - size, - stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < 10; i++) - { - HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_unique_output, - d_counts_output, - d_runs_count_output, - size, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - const unsigned int batch_size = 10; - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - hipcub::DeviceRunLengthEncode::Encode(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_unique_output, - d_counts_output, - d_runs_count_output, - size, - stream); - } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); + runs_count++; + offset += key_count; + } + + key_type *d_input; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), + hipMemcpyHostToDevice)); + + key_type *d_unique_output; + count_type *d_counts_output; + count_type *d_runs_count_output; + HIP_CHECK(hipMalloc(&d_unique_output, runs_count * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); + HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + + HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode( + nullptr, temporary_storage_bytes, d_input, d_unique_output, + d_counts_output, d_runs_count_output, size, stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < 10; i++) { + HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode( + d_temporary_storage, temporary_storage_bytes, d_input, d_unique_output, + d_counts_output, d_runs_count_output, size, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + hipcub::DeviceRunLengthEncode::Encode( + d_temporary_storage, temporary_storage_bytes, d_input, + d_unique_output, d_counts_output, d_runs_count_output, size, stream); } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_unique_output)); - HIP_CHECK(hipFree(d_counts_output)); - HIP_CHECK(hipFree(d_runs_count_output)); + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + sizeof(key_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_unique_output)); + HIP_CHECK(hipFree(d_counts_output)); + HIP_CHECK(hipFree(d_runs_count_output)); } -template -void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size) -{ - using key_type = T; - using offset_type = unsigned int; - using count_type = unsigned int; - - // Generate data - std::vector input(size); - - unsigned int runs_count = 0; - std::vector key_counts = benchmark_utils::get_random_data(100000, 1, max_length); - size_t offset = 0; - while(offset < size) - { - const size_t key_count = key_counts[runs_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); - for(size_t i = offset; i < end; i++) - { - input[i] = runs_count; - } - - runs_count++; - offset += key_count; +template +void run_non_trivial_runs_benchmark(benchmark::State &state, size_t max_length, + hipStream_t stream, size_t size) { + using key_type = T; + using offset_type = unsigned int; + using count_type = unsigned int; + + // Generate data + std::vector input(size); + + unsigned int runs_count = 0; + std::vector key_counts = + benchmark_utils::get_random_data(100000, 1, max_length); + size_t offset = 0; + while (offset < size) { + const size_t key_count = key_counts[runs_count % key_counts.size()]; + const size_t end = std::min(size, offset + key_count); + for (size_t i = offset; i < end; i++) { + input[i] = runs_count; } - key_type * d_input; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - offset_type * d_offsets_output; - count_type * d_counts_output; - count_type * d_runs_count_output; - HIP_CHECK(hipMalloc(&d_offsets_output, runs_count * sizeof(offset_type))); - HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); - HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - - HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(nullptr, - temporary_storage_bytes, - d_input, - d_offsets_output, - d_counts_output, - d_runs_count_output, - size, - stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < 10; i++) - { - HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_offsets_output, - d_counts_output, - d_runs_count_output, - size, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - const unsigned int batch_size = 10; - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - hipcub::DeviceRunLengthEncode::NonTrivialRuns(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_offsets_output, - d_counts_output, - d_runs_count_output, - size, - stream); - } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); + runs_count++; + offset += key_count; + } + + key_type *d_input; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), + hipMemcpyHostToDevice)); + + offset_type *d_offsets_output; + count_type *d_counts_output; + count_type *d_runs_count_output; + HIP_CHECK(hipMalloc(&d_offsets_output, runs_count * sizeof(offset_type))); + HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); + HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + + HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns( + nullptr, temporary_storage_bytes, d_input, d_offsets_output, + d_counts_output, d_runs_count_output, size, stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < 10; i++) { + HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns( + d_temporary_storage, temporary_storage_bytes, d_input, d_offsets_output, + d_counts_output, d_runs_count_output, size, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + hipcub::DeviceRunLengthEncode::NonTrivialRuns( + d_temporary_storage, temporary_storage_bytes, d_input, + d_offsets_output, d_counts_output, d_runs_count_output, size, stream); } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_offsets_output)); - HIP_CHECK(hipFree(d_counts_output)); - HIP_CHECK(hipFree(d_runs_count_output)); + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + sizeof(key_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_offsets_output)); + HIP_CHECK(hipFree(d_counts_output)); + HIP_CHECK(hipFree(d_runs_count_output)); } -#define CREATE_ENCODE_BENCHMARK(T) \ - benchmark::RegisterBenchmark( \ - std::string("device_run_length_encode" \ - "." \ - "(random_number_range:[1, " \ - + std::to_string(max_length) \ - + "])" \ - ).c_str(), \ - &run_encode_benchmark, \ - max_length, stream, size \ - ) - -void add_encode_benchmarks(size_t max_length, - std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = - { - CREATE_ENCODE_BENCHMARK(int), - CREATE_ENCODE_BENCHMARK(long long), - - CREATE_ENCODE_BENCHMARK(int8_t), - CREATE_ENCODE_BENCHMARK(uint8_t), - - CREATE_ENCODE_BENCHMARK(custom_float2), - CREATE_ENCODE_BENCHMARK(custom_double2), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_ENCODE_BENCHMARK(T) \ + benchmark::RegisterBenchmark(std::string("device_run_length_encode" \ + "." \ + "(random_number_range:[1, " + \ + std::to_string(max_length) + "])") \ + .c_str(), \ + &run_encode_benchmark, max_length, stream, \ + size) + +void add_encode_benchmarks( + size_t max_length, + std::vector &benchmarks, + hipStream_t stream, size_t size) { + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + CREATE_ENCODE_BENCHMARK(int), + CREATE_ENCODE_BENCHMARK(long long), + + CREATE_ENCODE_BENCHMARK(int8_t), + CREATE_ENCODE_BENCHMARK(uint8_t), + + CREATE_ENCODE_BENCHMARK(custom_float2), + CREATE_ENCODE_BENCHMARK(custom_double2), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ - benchmark::RegisterBenchmark( \ - std::string("run_length_encode_non_trivial_runs" \ - "" \ - "(random_number_range:[1, " \ - + std::to_string(max_length) \ - + "])" \ - ).c_str(), \ - &run_non_trivial_runs_benchmark, \ - max_length, stream, size \ - ) - -void add_non_trivial_runs_benchmarks(size_t max_length, - std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = - { - CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int), - CREATE_NON_TRIVIAL_RUNS_BENCHMARK(long long), - - CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int8_t), - CREATE_NON_TRIVIAL_RUNS_BENCHMARK(uint8_t), - - CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_float2), - CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_double2), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ + benchmark::RegisterBenchmark( \ + std::string("run_length_encode_non_trivial_runs" \ + "" \ + "(random_number_range:[1, " + \ + std::to_string(max_length) + "])") \ + .c_str(), \ + &run_non_trivial_runs_benchmark, max_length, stream, size) + +void add_non_trivial_runs_benchmarks( + size_t max_length, + std::vector &benchmarks, + hipStream_t stream, size_t size) { + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int), + CREATE_NON_TRIVIAL_RUNS_BENCHMARK(long long), + + CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int8_t), + CREATE_NON_TRIVIAL_RUNS_BENCHMARK(uint8_t), + + CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_float2), + CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_double2), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_run_length_encode" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_encode_benchmarks(1000, benchmarks, stream, size); - add_encode_benchmarks(10, benchmarks, stream, size); - add_non_trivial_runs_benchmarks(1000, benchmarks, stream, size); - add_non_trivial_runs_benchmarks(10, benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_run_length_encode" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_encode_benchmarks(1000, benchmarks, stream, size); + add_encode_benchmarks(10, benchmarks, stream, size); + add_non_trivial_runs_benchmarks(1000, benchmarks, stream, size); + add_non_trivial_runs_benchmarks(10, benchmarks, stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_scan.cpp b/benchmark/benchmark_device_scan.cpp index 29b2f3c0..f7656c9f 100644 --- a/benchmark/benchmark_device_scan.cpp +++ b/benchmark/benchmark_device_scan.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -20,10 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE -// CUB's implementation of single_pass_scan_operators has maybe uninitialized parameters, -// disable the warning because all warnings are threated as errors: +// CUB's implementation of single_pass_scan_operators has maybe uninitialized +// parameters, disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ - #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif #include "common_benchmark_header.hpp" @@ -31,372 +31,255 @@ // HIP API #include "hipcub/device/device_scan.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -auto run_device_scan(void* temporary_storage, - size_t& storage_size, - T* input, - T* output, - const T initial_value, - const size_t input_size, - BinaryFunction scan_op, - const hipStream_t stream) -> - typename std::enable_if::type -{ - return hipcub::DeviceScan::ExclusiveScan(temporary_storage, - storage_size, - input, - output, - scan_op, - initial_value, - input_size, - stream); +template +auto run_device_scan(void *temporary_storage, size_t &storage_size, T *input, + T *output, const T initial_value, const size_t input_size, + BinaryFunction scan_op, const hipStream_t stream) -> + typename std::enable_if::type { + return hipcub::DeviceScan::ExclusiveScan(temporary_storage, storage_size, + input, output, scan_op, + initial_value, input_size, stream); } -template -auto run_device_scan(void* temporary_storage, - size_t& storage_size, - T* input, - T* output, - const T initial_value, - const size_t input_size, - BinaryFunction scan_op, - const hipStream_t stream) -> - typename std::enable_if::type -{ - (void) initial_value; - return hipcub::DeviceScan::InclusiveScan(temporary_storage, - storage_size, - input, - output, - scan_op, - input_size, - stream); +template +auto run_device_scan(void *temporary_storage, size_t &storage_size, T *input, + T *output, const T initial_value, const size_t input_size, + BinaryFunction scan_op, const hipStream_t stream) -> + typename std::enable_if::type { + (void)initial_value; + return hipcub::DeviceScan::InclusiveScan(temporary_storage, storage_size, + input, output, scan_op, input_size, + stream); } -template -auto run_device_scan_by_key(void* temporary_storage, - size_t& storage_size, - K* keys, - T* input, - T* output, - const T initial_value, - const size_t input_size, - BinaryFunction scan_op, +template +auto run_device_scan_by_key(void *temporary_storage, size_t &storage_size, + K *keys, T *input, T *output, const T initial_value, + const size_t input_size, BinaryFunction scan_op, const hipStream_t stream) -> - typename std::enable_if::type -{ - return hipcub::DeviceScan::ExclusiveScanByKey(temporary_storage, - storage_size, - keys, - input, - output, - scan_op, - initial_value, - static_cast(input_size), - hipcub::Equality(), - stream); + typename std::enable_if::type { + return hipcub::DeviceScan::ExclusiveScanByKey( + temporary_storage, storage_size, keys, input, output, scan_op, + initial_value, static_cast(input_size), hipcub::Equality(), stream); } -template -auto run_device_scan_by_key(void* temporary_storage, - size_t& storage_size, - K* keys, - T* input, - T* output, - const T /*initial_value*/, - const size_t input_size, - BinaryFunction scan_op, - const hipStream_t stream) -> - typename std::enable_if::type -{ - return hipcub::DeviceScan::InclusiveScanByKey(temporary_storage, - storage_size, - keys, - input, - output, - scan_op, - static_cast(input_size), - hipcub::Equality(), - stream); +template +auto run_device_scan_by_key(void *temporary_storage, size_t &storage_size, + K *keys, T *input, T *output, + const T /*initial_value*/, const size_t input_size, + BinaryFunction scan_op, const hipStream_t stream) -> + typename std::enable_if::type { + return hipcub::DeviceScan::InclusiveScanByKey( + temporary_storage, storage_size, keys, input, output, scan_op, + static_cast(input_size), hipcub::Equality(), stream); } -template< - bool Exclusive, - class T, - class BinaryFunction -> -void run_benchmark(benchmark::State& state, - size_t size, - const hipStream_t stream, - BinaryFunction scan_op) -{ - std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); - T initial_value = T(123); - T * d_input; - T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK(hipDeviceSynchronize()); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes = 0; - void * d_temp_storage = nullptr; - // Get size of d_temp_storage - HIP_CHECK(( - run_device_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, initial_value, size, - scan_op, stream - ) - )); - HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < 5; i++) - { - HIP_CHECK(( - run_device_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, initial_value, size, - scan_op, stream - ) - )); +template +void run_benchmark(benchmark::State &state, size_t size, + const hipStream_t stream, BinaryFunction scan_op) { + std::vector input = + benchmark_utils::get_random_data(size, T(0), T(1000)); + T initial_value = T(123); + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes = 0; + void *d_temp_storage = nullptr; + // Get size of d_temp_storage + HIP_CHECK((run_device_scan(d_temp_storage, temp_storage_size_bytes, + d_input, d_output, initial_value, size, + scan_op, stream))); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < 5; i++) { + HIP_CHECK((run_device_scan( + d_temp_storage, temp_storage_size_bytes, d_input, d_output, + initial_value, size, scan_op, stream))); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK((run_device_scan( + d_temp_storage, temp_storage_size_bytes, d_input, d_output, + initial_value, size, scan_op, stream))); } - HIP_CHECK(hipDeviceSynchronize()); - - const unsigned int batch_size = 10; - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(( - run_device_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, initial_value, size, - scan_op, stream - ) - )); - } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temp_storage)); } -template< - bool Exclusive, - class T, - class BinaryFunction -> -void run_benchmark_by_key(benchmark::State& state, - size_t size, - const hipStream_t stream, - BinaryFunction scan_op) -{ - using key_type = int; - constexpr size_t max_segment_length = 100; - - const std::vector keys = benchmark_utils::get_random_segments( - size, max_segment_length, std::random_device{}() - ); - const std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); - const T initial_value = T(123); - key_type * d_keys; - T * d_input; - T * d_output; - HIP_CHECK(hipMalloc(&d_keys, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_keys, keys.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK(hipDeviceSynchronize()); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes = 0; - void * d_temp_storage = nullptr; - // Get size of d_temp_storage - HIP_CHECK(( - run_device_scan_by_key( - d_temp_storage, temp_storage_size_bytes, - d_keys, d_input, d_output, initial_value, - size, scan_op, stream - ) - )); - HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < 5; i++) - { - HIP_CHECK(( - run_device_scan_by_key( - d_temp_storage, temp_storage_size_bytes, - d_keys, d_input, d_output, initial_value, - size, scan_op, stream - ) - )); +template +void run_benchmark_by_key(benchmark::State &state, size_t size, + const hipStream_t stream, BinaryFunction scan_op) { + using key_type = int; + constexpr size_t max_segment_length = 100; + + const std::vector keys = + benchmark_utils::get_random_segments(size, max_segment_length, + std::random_device{}()); + const std::vector input = + benchmark_utils::get_random_data(size, T(0), T(1000)); + const T initial_value = T(123); + key_type *d_keys; + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(&d_keys, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_keys, keys.data(), size * sizeof(key_type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes = 0; + void *d_temp_storage = nullptr; + // Get size of d_temp_storage + HIP_CHECK((run_device_scan_by_key( + d_temp_storage, temp_storage_size_bytes, d_keys, d_input, d_output, + initial_value, size, scan_op, stream))); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < 5; i++) { + HIP_CHECK((run_device_scan_by_key( + d_temp_storage, temp_storage_size_bytes, d_keys, d_input, d_output, + initial_value, size, scan_op, stream))); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK((run_device_scan_by_key( + d_temp_storage, temp_storage_size_bytes, d_keys, d_input, d_output, + initial_value, size, scan_op, stream))); } - HIP_CHECK(hipDeviceSynchronize()); - - const unsigned int batch_size = 10; - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(( - run_device_scan_by_key( - d_temp_storage, temp_storage_size_bytes, - d_keys, d_input, d_output, initial_value, - size, scan_op, stream - ) - )); - } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_keys)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_keys)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ - benchmark::RegisterBenchmark( \ - std::string(std::string(EXCL ? "device_exclusive_scan" : "device_inclusive_scan") \ - +"." \ - ).c_str(), \ - &run_benchmark, \ - size, \ - stream, \ - SCAN_OP() \ - ), \ - benchmark::RegisterBenchmark( \ - std::string(std::string(EXCL ? "device_exclusive_scan_by_key" : "device_inclusive_scan_by_key") \ - + "." \ - ).c_str(), \ - &run_benchmark_by_key, \ - size, stream, \ - SCAN_OP() \ - ) - -#define CREATE_BENCHMARKS(SCAN_OP) \ - CREATE_BENCHMARK(false, int, SCAN_OP), \ - CREATE_BENCHMARK(true, int, SCAN_OP), \ - CREATE_BENCHMARK(false, float, SCAN_OP), \ - CREATE_BENCHMARK(true, float, SCAN_OP), \ - CREATE_BENCHMARK(false, double, SCAN_OP), \ - CREATE_BENCHMARK(true, double, SCAN_OP), \ - CREATE_BENCHMARK(false, long long, SCAN_OP), \ - CREATE_BENCHMARK(true, long long, SCAN_OP), \ - CREATE_BENCHMARK(false, custom_float2, SCAN_OP), \ - CREATE_BENCHMARK(true, custom_float2, SCAN_OP), \ - CREATE_BENCHMARK(false, custom_double2, SCAN_OP), \ - CREATE_BENCHMARK(true, custom_double2, SCAN_OP), \ - CREATE_BENCHMARK(false, int8_t, SCAN_OP), \ - CREATE_BENCHMARK(true, int8_t, SCAN_OP), \ - CREATE_BENCHMARK(false, uint8_t, SCAN_OP), \ - CREATE_BENCHMARK(true, uint8_t, SCAN_OP) - -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_scan" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - using custom_double2 = benchmark_utils::custom_type; - using custom_float2 = benchmark_utils::custom_type; - - // Compilation may never finish, if the compiler needs to compile too many kernels, - // it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used - // (all other CREATE_*_BENCHMARK should be commented/removed). - - // Add benchmarks - std::vector benchmarks = - { - CREATE_BENCHMARKS(hipcub::Sum), - CREATE_BENCHMARKS(hipcub::Min), - }; - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ + benchmark::RegisterBenchmark( \ + std::string(std::string(EXCL ? "device_exclusive_scan" \ + : "device_inclusive_scan") + \ + ".") \ + .c_str(), \ + &run_benchmark, size, stream, SCAN_OP()), \ + benchmark::RegisterBenchmark( \ + std::string(std::string(EXCL ? "device_exclusive_scan_by_key" \ + : "device_inclusive_scan_by_key") + \ + ".") \ + .c_str(), \ + &run_benchmark_by_key, size, stream, SCAN_OP()) + +#define CREATE_BENCHMARKS(SCAN_OP) \ + CREATE_BENCHMARK(false, int, SCAN_OP), CREATE_BENCHMARK(true, int, SCAN_OP), \ + CREATE_BENCHMARK(false, float, SCAN_OP), \ + CREATE_BENCHMARK(true, float, SCAN_OP), \ + CREATE_BENCHMARK(false, double, SCAN_OP), \ + CREATE_BENCHMARK(true, double, SCAN_OP), \ + CREATE_BENCHMARK(false, long long, SCAN_OP), \ + CREATE_BENCHMARK(true, long long, SCAN_OP), \ + CREATE_BENCHMARK(false, custom_float2, SCAN_OP), \ + CREATE_BENCHMARK(true, custom_float2, SCAN_OP), \ + CREATE_BENCHMARK(false, custom_double2, SCAN_OP), \ + CREATE_BENCHMARK(true, custom_double2, SCAN_OP), \ + CREATE_BENCHMARK(false, int8_t, SCAN_OP), \ + CREATE_BENCHMARK(true, int8_t, SCAN_OP), \ + CREATE_BENCHMARK(false, uint8_t, SCAN_OP), \ + CREATE_BENCHMARK(true, uint8_t, SCAN_OP) + +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_scan" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + using custom_double2 = benchmark_utils::custom_type; + using custom_float2 = benchmark_utils::custom_type; + + // Compilation may never finish, if the compiler needs to compile too many + // kernels, it is recommended to compile benchmarks only for 1-2 types when + // BENCHMARK_CONFIG_TUNING is used (all other CREATE_*_BENCHMARK should be + // commented/removed). + + // Add benchmarks + std::vector benchmarks = { + CREATE_BENCHMARKS(hipcub::Sum), + CREATE_BENCHMARKS(hipcub::Min), + }; + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); - return 0; + return 0; } diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index 22de7d13..548266be 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,7 +25,6 @@ // HIP API #include "hipcub/hipcub.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif @@ -36,507 +35,377 @@ const unsigned int warmup_size = 2; constexpr bool Ascending = false; constexpr bool Descending = true; -template -void run_sort_keys_benchmark(benchmark::State& state, - size_t desired_segments, - hipStream_t stream, - size_t size, - bool descending = false) -{ - using offset_type = int; - using key_type = Key; - typedef hipError_t (*sort_func)(void*, - size_t&, - const key_type*, - key_type*, - int, - int, - offset_type*, - offset_type*, - int, - int, - hipStream_t); - - sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortKeys - ; - sort_func func_descending = &hipcub::DeviceSegmentedRadixSort::SortKeysDescending - ; - - sort_func sorting = descending ? func_descending : func_ascending; - - // Generate data - std::vector offsets; - - const double avg_segment_length = static_cast(size) / desired_segments; - - const unsigned int seed = 123; - std::default_random_engine gen(seed); - - std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); - - unsigned int segments_count = 0; - size_t offset = 0; - while(offset < size) - { - const size_t segment_length = std::round(segment_length_dis(gen)); - offsets.push_back(offset); - segments_count++; - offset += segment_length; - } - offsets.push_back(size); - - std::vector keys_input; - if(std::is_floating_point::value) - { - keys_input = benchmark_utils::get_random_data( - size, (key_type)-1000, (key_type)+1000); - } - else - { - keys_input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); - } - - offset_type * d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_keys_input; - key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(sorting(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - size, - segments_count, - d_offsets, - d_offsets + 1, - 0, - sizeof(key_type) * 8, +template +void run_sort_keys_benchmark(benchmark::State &state, size_t desired_segments, + hipStream_t stream, size_t size, + bool descending = false) { + using offset_type = int; + using key_type = Key; + typedef hipError_t (*sort_func)(void *, size_t &, const key_type *, + key_type *, int, int, offset_type *, + offset_type *, int, int, hipStream_t); + + sort_func func_ascending = + &hipcub::DeviceSegmentedRadixSort::SortKeys; + sort_func func_descending = + &hipcub::DeviceSegmentedRadixSort::SortKeysDescending; + + sort_func sorting = descending ? func_descending : func_ascending; + + // Generate data + std::vector offsets; + + const double avg_segment_length = + static_cast(size) / desired_segments; + + const unsigned int seed = 123; + std::default_random_engine gen(seed); + + std::uniform_real_distribution segment_length_dis( + 0, avg_segment_length * 2); + + unsigned int segments_count = 0; + size_t offset = 0; + while (offset < size) { + const size_t segment_length = std::round(segment_length_dis(gen)); + offsets.push_back(offset); + segments_count++; + offset += segment_length; + } + offsets.push_back(size); + + std::vector keys_input; + if (std::is_floating_point::value) { + keys_input = benchmark_utils::get_random_data( + size, (key_type)-1000, (key_type) + 1000); + } else { + keys_input = benchmark_utils::get_random_data( + size, std::numeric_limits::min(), + std::numeric_limits::max()); + } + + offset_type *d_offsets; + HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type *d_keys_input; + key_type *d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), + hipMemcpyHostToDevice)); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_keys_output, size, segments_count, d_offsets, + d_offsets + 1, 0, sizeof(key_type) * 8, stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, + d_keys_input, d_keys_output, size, segments_count, + d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(sorting(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - size, - segments_count, - d_offsets, - d_offsets + 1, - 0, - sizeof(key_type) * 8, - stream)); + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, + d_keys_input, d_keys_output, size, segments_count, + d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(sorting(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - size, - segments_count, - d_offsets, - d_offsets + 1, - 0, - sizeof(key_type) * 8, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_offsets)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + sizeof(key_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_offsets)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); } -template -void run_sort_pairs_benchmark(benchmark::State& state, - size_t desired_segments, - hipStream_t stream, - size_t size, - bool descending = false) -{ - using offset_type = int; - using key_type = Key; - using value_type = Value; - typedef hipError_t (*sort_func)(void*, - size_t&, - const key_type*, - key_type*, - const value_type*, - value_type*, - int, - int, - offset_type*, - offset_type*, - int, - int, - hipStream_t); - - sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortPairs - ; - sort_func func_descending = &hipcub::DeviceSegmentedRadixSort::SortPairsDescending - ; - - sort_func sorting = descending ? func_descending : func_ascending; - - // Generate data - std::vector offsets; - - const double avg_segment_length = static_cast(size) / desired_segments; - - const unsigned int seed = 123; - std::default_random_engine gen(seed); - - std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); - - unsigned int segments_count = 0; - size_t offset = 0; - while(offset < size) - { - const size_t segment_length = std::round(segment_length_dis(gen)); - offsets.push_back(offset); - segments_count++; - offset += segment_length; - } - offsets.push_back(size); - - std::vector keys_input; - if(std::is_floating_point::value) - { - keys_input = benchmark_utils::get_random_data( - size, (key_type)-1000, (key_type)+1000); - } - else - { - keys_input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); - } - - std::vector values_input(size); - std::iota(values_input.begin(), values_input.end(), 0); - - offset_type * d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_keys_input; - key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(sorting(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - segments_count, - d_offsets, - d_offsets + 1, - 0, - sizeof(key_type) * 8, - stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(sorting(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - segments_count, - d_offsets, - d_offsets + 1, - 0, - sizeof(key_type) * 8, - stream)); +template +void run_sort_pairs_benchmark(benchmark::State &state, size_t desired_segments, + hipStream_t stream, size_t size, + bool descending = false) { + using offset_type = int; + using key_type = Key; + using value_type = Value; + typedef hipError_t (*sort_func)(void *, size_t &, const key_type *, + key_type *, const value_type *, value_type *, + int, int, offset_type *, offset_type *, int, + int, hipStream_t); + + sort_func func_ascending = + &hipcub::DeviceSegmentedRadixSort::SortPairs; + sort_func func_descending = + &hipcub::DeviceSegmentedRadixSort::SortPairsDescending< + key_type, value_type, offset_type *>; + + sort_func sorting = descending ? func_descending : func_ascending; + + // Generate data + std::vector offsets; + + const double avg_segment_length = + static_cast(size) / desired_segments; + + const unsigned int seed = 123; + std::default_random_engine gen(seed); + + std::uniform_real_distribution segment_length_dis( + 0, avg_segment_length * 2); + + unsigned int segments_count = 0; + size_t offset = 0; + while (offset < size) { + const size_t segment_length = std::round(segment_length_dis(gen)); + offsets.push_back(offset); + segments_count++; + offset += segment_length; + } + offsets.push_back(size); + + std::vector keys_input; + if (std::is_floating_point::value) { + keys_input = benchmark_utils::get_random_data( + size, (key_type)-1000, (key_type) + 1000); + } else { + keys_input = benchmark_utils::get_random_data( + size, std::numeric_limits::min(), + std::numeric_limits::max()); + } + + std::vector values_input(size); + std::iota(values_input.begin(), values_input.end(), 0); + + offset_type *d_offsets; + HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type *d_keys_input; + key_type *d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), + hipMemcpyHostToDevice)); + + value_type *d_values_input; + value_type *d_values_output; + HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), + size * sizeof(value_type), hipMemcpyHostToDevice)); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_keys_output, d_values_input, d_values_output, size, + segments_count, d_offsets, d_offsets + 1, 0, + sizeof(key_type) * 8, stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, + d_keys_input, d_keys_output, d_values_input, + d_values_output, size, segments_count, d_offsets, + d_offsets + 1, 0, sizeof(key_type) * 8, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, + d_keys_input, d_keys_output, d_values_input, + d_values_output, size, segments_count, d_offsets, + d_offsets + 1, 0, sizeof(key_type) * 8, stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(sorting(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - segments_count, - d_offsets, - d_offsets + 1, - 0, - sizeof(key_type) * 8, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed( - state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) - ); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_offsets)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); - HIP_CHECK(hipFree(d_values_input)); - HIP_CHECK(hipFree(d_values_output)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + (sizeof(key_type) + sizeof(value_type))); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_offsets)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - std::string("device_segmented_radix_sort_keys" \ - "." \ - "(segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_sort_keys_benchmark(state, \ - SEGMENTS, \ - stream, \ - size, \ - Ascending); \ - } \ -) - -#define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - std::string("device_segmented_radix_sort_keys" \ - "." \ - "(segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_sort_keys_benchmark(state, SEGMENTS, stream, size, Descending); \ - } \ -) - -#define BENCHMARK_KEY_TYPE(type) \ - CREATE_SORT_KEYS_BENCHMARK(type, 1), \ - CREATE_SORT_KEYS_BENCHMARK(type, 10), \ - CREATE_SORT_KEYS_BENCHMARK(type, 100), \ - CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ - CREATE_SORT_KEYS_BENCHMARK(type, 10000), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 100), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1000), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10000) - - -void add_sort_keys_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - std::vector bs = - { - BENCHMARK_KEY_TYPE(float), - BENCHMARK_KEY_TYPE(double), - BENCHMARK_KEY_TYPE(int8_t), - BENCHMARK_KEY_TYPE(uint8_t), - BENCHMARK_KEY_TYPE(int), - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_radix_sort_keys" \ + "." \ + "(segments:~" + \ + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size, \ + Ascending); \ + }) + +#define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_radix_sort_keys" \ + "." \ + "(segments:~" + \ + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size, \ + Descending); \ + }) + +#define BENCHMARK_KEY_TYPE(type) \ + CREATE_SORT_KEYS_BENCHMARK(type, 1), CREATE_SORT_KEYS_BENCHMARK(type, 10), \ + CREATE_SORT_KEYS_BENCHMARK(type, 100), \ + CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ + CREATE_SORT_KEYS_BENCHMARK(type, 10000), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 100), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1000), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10000) + +void add_sort_keys_benchmarks( + std::vector &benchmarks, + hipStream_t stream, size_t size) { + std::vector bs = { + BENCHMARK_KEY_TYPE(float), BENCHMARK_KEY_TYPE(double), + BENCHMARK_KEY_TYPE(int8_t), BENCHMARK_KEY_TYPE(uint8_t), + BENCHMARK_KEY_TYPE(int), + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - std::string("device_segmented_radix_sort_pairs" \ - "." \ - "(segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Ascending); \ - } \ -) - -#define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - std::string("device_segmented_radix_sort_pairs" \ - "." \ - "(segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - [=](benchmark::State& state){ \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Descending);\ - } \ -) - -#define BENCHMARK_PAIR_TYPE(type, value) \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 1), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 1000), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 100), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1000), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10000) - -void add_sort_pairs_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = - { - BENCHMARK_PAIR_TYPE(int, float), - BENCHMARK_PAIR_TYPE(long long, double), - BENCHMARK_PAIR_TYPE(int8_t, int8_t), - BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), - BENCHMARK_PAIR_TYPE(int, custom_float2), - BENCHMARK_PAIR_TYPE(long long, custom_double2), - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_radix_sort_pairs" \ + "." \ + "(segments:~" + \ + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, size, \ + Ascending); \ + }) + +#define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_radix_sort_pairs" \ + "." \ + "(segments:~" + \ + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, size, \ + Descending); \ + }) + +#define BENCHMARK_PAIR_TYPE(type, value) \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 1), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 1000), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 100), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1000), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10000) + +void add_sort_pairs_benchmarks( + std::vector &benchmarks, + hipStream_t stream, size_t size) { + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + BENCHMARK_PAIR_TYPE(int, float), + BENCHMARK_PAIR_TYPE(long long, double), + BENCHMARK_PAIR_TYPE(int8_t, int8_t), + BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), + BENCHMARK_PAIR_TYPE(int, custom_float2), + BENCHMARK_PAIR_TYPE(long long, custom_double2), + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_segmented_radix_sort" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_sort_keys_benchmarks(benchmarks, stream, size); - add_sort_pairs_benchmarks(benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_segmented_radix_sort" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_sort_keys_benchmarks(benchmarks, stream, size); + add_sort_pairs_benchmarks(benchmarks, stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp index 77024237..e008bf41 100644 --- a/benchmark/benchmark_device_segmented_reduce.cpp +++ b/benchmark/benchmark_device_segmented_reduce.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,266 +25,216 @@ // HIP API #include "hipcub/device/device_segmented_reduce.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif - const unsigned int batch_size = 10; const unsigned int warmup_size = 5; using OffsetType = int; -template -void run_benchmark(benchmark::State& state, - size_t desired_segments, - hipStream_t stream, - size_t size, - SegmentedReduceKernel segmented_reduce) -{ - using value_type = T; - - // Generate data - const unsigned int seed = 123; - std::default_random_engine gen(seed); - - const double avg_segment_length = static_cast(size) / desired_segments; - std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); - - std::vector offsets; - unsigned int segments_count = 0; - size_t offset = 0; - while(offset < size) - { - const size_t segment_length = std::round(segment_length_dis(gen)); - offsets.push_back(offset); - segments_count++; - offset += segment_length; - } - offsets.push_back(size); - - std::vector values_input(size); - std::iota(values_input.begin(), values_input.end(), 0); - - OffsetType * d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(OffsetType))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(OffsetType), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - OutputT * d_aggregates_output; - HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(OutputT))); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - - HIP_CHECK(segmented_reduce(d_temporary_storage, - temporary_storage_bytes, - d_values_input, - d_aggregates_output, - segments_count, - d_offsets, - d_offsets + 1, - stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(segmented_reduce(d_temporary_storage, - temporary_storage_bytes, - d_values_input, - d_aggregates_output, - segments_count, - d_offsets, - d_offsets + 1, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(segmented_reduce(d_temporary_storage, - temporary_storage_bytes, - d_values_input, - d_aggregates_output, - segments_count, - d_offsets, - d_offsets + 1, - stream)); - } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); +template +void run_benchmark(benchmark::State &state, size_t desired_segments, + hipStream_t stream, size_t size, + SegmentedReduceKernel segmented_reduce) { + using value_type = T; + + // Generate data + const unsigned int seed = 123; + std::default_random_engine gen(seed); + + const double avg_segment_length = + static_cast(size) / desired_segments; + std::uniform_real_distribution segment_length_dis( + 0, avg_segment_length * 2); + + std::vector offsets; + unsigned int segments_count = 0; + size_t offset = 0; + while (offset < size) { + const size_t segment_length = std::round(segment_length_dis(gen)); + offsets.push_back(offset); + segments_count++; + offset += segment_length; + } + offsets.push_back(size); + + std::vector values_input(size); + std::iota(values_input.begin(), values_input.end(), 0); + + OffsetType *d_offsets; + HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(OffsetType))); + HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), + (segments_count + 1) * sizeof(OffsetType), + hipMemcpyHostToDevice)); + + value_type *d_values_input; + HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), + size * sizeof(value_type), hipMemcpyHostToDevice)); + + OutputT *d_aggregates_output; + HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(OutputT))); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + + HIP_CHECK(segmented_reduce(d_temporary_storage, temporary_storage_bytes, + d_values_input, d_aggregates_output, + segments_count, d_offsets, d_offsets + 1, stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(segmented_reduce( + d_temporary_storage, temporary_storage_bytes, d_values_input, + d_aggregates_output, segments_count, d_offsets, d_offsets + 1, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(segmented_reduce(d_temporary_storage, temporary_storage_bytes, + d_values_input, d_aggregates_output, + segments_count, d_offsets, d_offsets + 1, + stream)); } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(value_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_offsets)); - HIP_CHECK(hipFree(d_values_input)); - HIP_CHECK(hipFree(d_aggregates_output)); + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + sizeof(value_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_offsets)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_aggregates_output)); } -template -struct Benchmark; +template struct Benchmark; -template -struct Benchmark { - static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) - { - hipError_t (*ptr_to_sum)(void*, size_t&, T*, T*, int, OffsetType*, OffsetType*, hipStream_t) - = &hipcub::DeviceSegmentedReduce::Sum; - run_benchmark(state, desired_segments, stream, size, ptr_to_sum); - } +template struct Benchmark { + static void run(benchmark::State &state, size_t desired_segments, + const hipStream_t stream, size_t size) { + hipError_t (*ptr_to_sum)(void *, size_t &, T *, T *, int, OffsetType *, + OffsetType *, hipStream_t) = + &hipcub::DeviceSegmentedReduce::Sum; + run_benchmark(state, desired_segments, stream, size, ptr_to_sum); + } }; -template -struct Benchmark { - static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) - { - hipError_t (*ptr_to_min)(void*, size_t&, T*, T*, int, OffsetType*, OffsetType*, hipStream_t) - = &hipcub::DeviceSegmentedReduce::Min; - run_benchmark(state, desired_segments, stream, size, ptr_to_min); - } +template struct Benchmark { + static void run(benchmark::State &state, size_t desired_segments, + const hipStream_t stream, size_t size) { + hipError_t (*ptr_to_min)(void *, size_t &, T *, T *, int, OffsetType *, + OffsetType *, hipStream_t) = + &hipcub::DeviceSegmentedReduce::Min; + run_benchmark(state, desired_segments, stream, size, ptr_to_min); + } }; -template -struct Benchmark { - using Difference = OffsetType; - using Iterator = typename hipcub::ArgIndexInputIterator; - using KeyValue = typename Iterator::value_type; - - static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) - { - hipError_t (*ptr_to_argmin)(void*, - size_t&, - T*, - KeyValue*, - int, - OffsetType*, - OffsetType*, - hipStream_t) - = &hipcub::DeviceSegmentedReduce::ArgMin; - run_benchmark(state, desired_segments, stream, size, ptr_to_argmin); - } +template struct Benchmark { + using Difference = OffsetType; + using Iterator = typename hipcub::ArgIndexInputIterator; + using KeyValue = typename Iterator::value_type; + + static void run(benchmark::State &state, size_t desired_segments, + const hipStream_t stream, size_t size) { + hipError_t (*ptr_to_argmin)(void *, size_t &, T *, KeyValue *, int, + OffsetType *, OffsetType *, hipStream_t) = + &hipcub::DeviceSegmentedReduce::ArgMin; + run_benchmark(state, desired_segments, stream, size, + ptr_to_argmin); + } }; -#define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \ -benchmark::RegisterBenchmark( \ - std::string("device_segmented_reduce" \ - "." \ - "(number_of_segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - &Benchmark::run, \ - SEGMENTS, stream, size \ -) - -#define BENCHMARK_TYPE(type, REDUCE_OP) \ - CREATE_BENCHMARK(type, 1, REDUCE_OP), \ - CREATE_BENCHMARK(type, 100, REDUCE_OP), \ - CREATE_BENCHMARK(type, 10000, REDUCE_OP) - -#define CREATE_BENCHMARKS(REDUCE_OP) \ - BENCHMARK_TYPE(float, REDUCE_OP), \ - BENCHMARK_TYPE(double, REDUCE_OP), \ - BENCHMARK_TYPE(int8_t, REDUCE_OP), \ - BENCHMARK_TYPE(int, REDUCE_OP) - -void add_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = - { - CREATE_BENCHMARKS(hipcub::Sum), - BENCHMARK_TYPE(custom_double2, hipcub::Sum), - CREATE_BENCHMARKS(hipcub::Min), - #ifdef HIPCUB_ROCPRIM_API - BENCHMARK_TYPE(custom_double2, hipcub::Min), - #endif - CREATE_BENCHMARKS(hipcub::ArgMin), - #ifdef HIPCUB_ROCPRIM_API - BENCHMARK_TYPE(custom_double2, hipcub::ArgMin), - #endif - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); -} +#define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_reduce" \ + "." \ + "(number_of_segments:~" + \ + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + &Benchmark::run, SEGMENTS, stream, size) + +#define BENCHMARK_TYPE(type, REDUCE_OP) \ + CREATE_BENCHMARK(type, 1, REDUCE_OP), \ + CREATE_BENCHMARK(type, 100, REDUCE_OP), \ + CREATE_BENCHMARK(type, 10000, REDUCE_OP) + +#define CREATE_BENCHMARKS(REDUCE_OP) \ + BENCHMARK_TYPE(float, REDUCE_OP), BENCHMARK_TYPE(double, REDUCE_OP), \ + BENCHMARK_TYPE(int8_t, REDUCE_OP), BENCHMARK_TYPE(int, REDUCE_OP) + +void add_benchmarks(std::vector &benchmarks, + hipStream_t stream, size_t size) { + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + CREATE_BENCHMARKS(hipcub::Sum), + BENCHMARK_TYPE(custom_double2, hipcub::Sum), + CREATE_BENCHMARKS(hipcub::Min), +#ifdef HIPCUB_ROCPRIM_API + BENCHMARK_TYPE(custom_double2, hipcub::Min), +#endif + CREATE_BENCHMARKS(hipcub::ArgMin), +#ifdef HIPCUB_ROCPRIM_API + BENCHMARK_TYPE(custom_double2, hipcub::ArgMin), +#endif + }; -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_segmented_reduce" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks(benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +} - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_segmented_reduce" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks(benchmarks, stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp index 6426ae30..c0954103 100644 --- a/benchmark/benchmark_device_segmented_sort.cpp +++ b/benchmark/benchmark_device_segmented_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -33,542 +33,407 @@ const unsigned int batch_size = 4; const unsigned int warmup_size = 2; template -void run_sort_keys_benchmark(benchmark::State &state, - size_t desired_segments, - hipStream_t stream, - size_t size, - bool Descending = false, - bool Stable = false) -{ - using offset_type = int; - using key_type = Key; - typedef hipError_t (*sort_func)(void*, - size_t&, - const key_type*, - key_type*, - int, - int, - offset_type*, - offset_type*, - hipStream_t); - - sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortKeys - ; - sort_func func_descending = &hipcub::DeviceSegmentedSort::SortKeysDescending - ; - sort_func func_ascending_stable = &hipcub::DeviceSegmentedSort::StableSortKeys - ; - sort_func func_descending_stable = &hipcub::DeviceSegmentedSort::StableSortKeysDescending - ; - - sort_func sorting = Descending ? - (Stable ? func_descending_stable : func_descending) : - (Stable ? func_ascending_stable : func_ascending); - - std::vector offsets; - - const double avg_segment_length = static_cast(size) / desired_segments; - - std::random_device rd; - std::default_random_engine gen(rd()); - - std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); - - unsigned int segments_count = 0; - size_t offset = 0; - while (offset < size) - { - const size_t segment_length = std::round(segment_length_dis(gen)); - offsets.push_back(offset); - ++segments_count; - offset += segment_length; +void run_sort_keys_benchmark(benchmark::State &state, size_t desired_segments, + hipStream_t stream, size_t size, + bool Descending = false, bool Stable = false) { + using offset_type = int; + using key_type = Key; + typedef hipError_t (*sort_func)(void *, size_t &, const key_type *, + key_type *, int, int, offset_type *, + offset_type *, hipStream_t); + + sort_func func_ascending = + &hipcub::DeviceSegmentedSort::SortKeys; + sort_func func_descending = + &hipcub::DeviceSegmentedSort::SortKeysDescending; + sort_func func_ascending_stable = + &hipcub::DeviceSegmentedSort::StableSortKeys; + sort_func func_descending_stable = + &hipcub::DeviceSegmentedSort::StableSortKeysDescending; + + sort_func sorting = Descending + ? (Stable ? func_descending_stable : func_descending) + : (Stable ? func_ascending_stable : func_ascending); + + std::vector offsets; + + const double avg_segment_length = + static_cast(size) / desired_segments; + + std::random_device rd; + std::default_random_engine gen(rd()); + + std::uniform_real_distribution segment_length_dis( + 0, avg_segment_length * 2); + + unsigned int segments_count = 0; + size_t offset = 0; + while (offset < size) { + const size_t segment_length = std::round(segment_length_dis(gen)); + offsets.push_back(offset); + ++segments_count; + offset += segment_length; + } + offsets.push_back(size); + + std::vector keys_input; + if (std::is_floating_point::value) { + keys_input = benchmark_utils::get_random_data( + size, static_cast(-1000), static_cast(1000)); + } else { + keys_input = benchmark_utils::get_random_data( + size, std::numeric_limits::min(), + std::numeric_limits::max()); + } + + offset_type *d_offsets; + HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type *d_keys_input; + key_type *d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), + hipMemcpyHostToDevice)); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_keys_output, size, segments_count, d_offsets, + d_offsets + 1, stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; ++i) { + HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, + d_keys_input, d_keys_output, size, segments_count, + d_offsets, d_offsets + 1, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; ++i) { + HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, + d_keys_input, d_keys_output, size, segments_count, + d_offsets, d_offsets + 1, stream)); } - offsets.push_back(size); - - std::vector keys_input; - if (std::is_floating_point::value) - { - keys_input = benchmark_utils::get_random_data( - size, - static_cast(-1000), - static_cast(1000) - ); - } - else - { - keys_input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); - } - - offset_type * d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_keys_input; - key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(sorting(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - size, - segments_count, - d_offsets, - d_offsets + 1, - stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); - // Warm-up - for (size_t i = 0; i < warmup_size; ++i) - { - HIP_CHECK(sorting(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - size, - segments_count, - d_offsets, - d_offsets + 1, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; ++i) - { - HIP_CHECK(sorting(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - size, - segments_count, - d_offsets, - d_offsets + 1, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_offsets)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + sizeof(key_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_offsets)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); } template -void run_sort_pairs_benchmark(benchmark::State &state, - size_t desired_segments, - hipStream_t stream, - size_t size, - bool Descending = false, - bool Stable = false) -{ - using offset_type = int; - using key_type = Key; - using value_type = Value; - typedef hipError_t (*sort_func)(void*, - size_t&, - const key_type*, - key_type*, - const value_type*, - value_type*, - int, - int, - offset_type*, - offset_type*, - hipStream_t); - - sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortPairs - ; - sort_func func_descending = &hipcub::DeviceSegmentedSort::SortPairsDescending - ; - sort_func func_ascending_stable = &hipcub::DeviceSegmentedSort::StableSortPairs - ; - sort_func func_descending_stable = &hipcub::DeviceSegmentedSort::StableSortPairsDescending - ; - - sort_func sorting = Descending ? - (Stable ? func_descending_stable : func_descending) : - (Stable ? func_ascending_stable : func_ascending); - - std::vector offsets; - - const double avg_segment_length = static_cast(size) / desired_segments; - - std::random_device rd; - std::default_random_engine gen(rd()); - - std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); - - unsigned int segments_count = 0; - size_t offset = 0; - while (offset < size) - { - const size_t segment_length = std::round(segment_length_dis(gen)); - offsets.push_back(offset); - ++segments_count; - offset += segment_length; - } - offsets.push_back(size); - - std::vector keys_input; - if (std::is_floating_point::value) - { - keys_input = benchmark_utils::get_random_data( - size, - static_cast(-1000), - static_cast(1000) - ); - } - else - { - keys_input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); - } - - std::vector values_input(size); - std::iota(values_input.begin(), values_input.end(), 0); - - offset_type * d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_keys_input; - key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(sorting(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - segments_count, - d_offsets, - d_offsets + 1, - stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(sorting(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - segments_count, - d_offsets, - d_offsets + 1, - stream)); +void run_sort_pairs_benchmark(benchmark::State &state, size_t desired_segments, + hipStream_t stream, size_t size, + bool Descending = false, bool Stable = false) { + using offset_type = int; + using key_type = Key; + using value_type = Value; + typedef hipError_t (*sort_func)( + void *, size_t &, const key_type *, key_type *, const value_type *, + value_type *, int, int, offset_type *, offset_type *, hipStream_t); + + sort_func func_ascending = + &hipcub::DeviceSegmentedSort::SortPairs; + sort_func func_descending = + &hipcub::DeviceSegmentedSort::SortPairsDescending; + sort_func func_ascending_stable = + &hipcub::DeviceSegmentedSort::StableSortPairs; + sort_func func_descending_stable = + &hipcub::DeviceSegmentedSort::StableSortPairsDescending< + key_type, value_type, offset_type *>; + + sort_func sorting = Descending + ? (Stable ? func_descending_stable : func_descending) + : (Stable ? func_ascending_stable : func_ascending); + + std::vector offsets; + + const double avg_segment_length = + static_cast(size) / desired_segments; + + std::random_device rd; + std::default_random_engine gen(rd()); + + std::uniform_real_distribution segment_length_dis( + 0, avg_segment_length * 2); + + unsigned int segments_count = 0; + size_t offset = 0; + while (offset < size) { + const size_t segment_length = std::round(segment_length_dis(gen)); + offsets.push_back(offset); + ++segments_count; + offset += segment_length; + } + offsets.push_back(size); + + std::vector keys_input; + if (std::is_floating_point::value) { + keys_input = benchmark_utils::get_random_data( + size, static_cast(-1000), static_cast(1000)); + } else { + keys_input = benchmark_utils::get_random_data( + size, std::numeric_limits::min(), + std::numeric_limits::max()); + } + + std::vector values_input(size); + std::iota(values_input.begin(), values_input.end(), 0); + + offset_type *d_offsets; + HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type *d_keys_input; + key_type *d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), + hipMemcpyHostToDevice)); + + value_type *d_values_input; + value_type *d_values_output; + HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), + size * sizeof(value_type), hipMemcpyHostToDevice)); + + void *d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, + d_keys_output, d_values_input, d_values_output, size, + segments_count, d_offsets, d_offsets + 1, stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { + HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, + d_keys_input, d_keys_output, d_values_input, + d_values_output, size, segments_count, d_offsets, + d_offsets + 1, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, + d_keys_input, d_keys_output, d_values_input, + d_values_output, size, segments_count, d_offsets, + d_offsets + 1, stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(sorting(d_temporary_storage, - temporary_storage_bytes, - d_keys_input, - d_keys_output, - d_values_input, - d_values_output, - size, - segments_count, - d_offsets, - d_offsets + 1, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed( - state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_offsets)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); - HIP_CHECK(hipFree(d_values_input)); - HIP_CHECK(hipFree(d_values_output)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + (sizeof(key_type) + sizeof(value_type))); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_offsets)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_sort_keys" \ - "." \ - "(number_of_segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - [=](benchmark::State &state){ \ - run_sort_keys_benchmark(state, SEGMENTS, stream, size); \ - } \ - ), \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_sort_keys" \ - "." \ - "(number_of_segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - [=](benchmark::State &state){ \ - run_sort_keys_benchmark(state, SEGMENTS, stream, size, true); \ - } \ - ), \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_sort_keys" \ - "." \ - "(number_of_segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - [=](benchmark::State &state){ \ - run_sort_keys_benchmark(state, SEGMENTS, stream, size, false, true); \ - } \ - ), \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_sort_keys" \ - "." \ - "(number_of_segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - [=](benchmark::State &state){ \ - run_sort_keys_benchmark(state, SEGMENTS, stream, size, true, true); \ - } \ - ) \ - -#define BENCHMARK_KEY_TYPE(type) \ - CREATE_SORT_KEYS_BENCHMARK(type, 10), \ - CREATE_SORT_KEYS_BENCHMARK(type, 100), \ - CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ - CREATE_SORT_KEYS_BENCHMARK(type, 10000) - -void add_sort_keys_benchmarks(std::vector &benchmarks, - hipStream_t stream, - size_t size) -{ - std::vector bs = - { - BENCHMARK_KEY_TYPE(float), - BENCHMARK_KEY_TYPE(double), - BENCHMARK_KEY_TYPE(int8_t), - BENCHMARK_KEY_TYPE(uint8_t), - BENCHMARK_KEY_TYPE(int), - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_keys" \ + "." \ + "(number_of_segments:~" + \ + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size); \ + }), \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_keys" \ + "." \ + "(number_of_segments:~" + \ + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size, true); \ + }), \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_keys" \ + "." \ + "(number_of_segments:~" + \ + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size, false, \ + true); \ + }), \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_keys" \ + "." \ + "(number_of_segments:~" + \ + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size, true, \ + true); \ + }) + +#define BENCHMARK_KEY_TYPE(type) \ + CREATE_SORT_KEYS_BENCHMARK(type, 10), CREATE_SORT_KEYS_BENCHMARK(type, 100), \ + CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ + CREATE_SORT_KEYS_BENCHMARK(type, 10000) + +void add_sort_keys_benchmarks( + std::vector &benchmarks, + hipStream_t stream, size_t size) { + std::vector bs = { + BENCHMARK_KEY_TYPE(float), BENCHMARK_KEY_TYPE(double), + BENCHMARK_KEY_TYPE(int8_t), BENCHMARK_KEY_TYPE(uint8_t), + BENCHMARK_KEY_TYPE(int), + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ - benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_pairs") \ - + "." + \ - "(number_of_segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - [=](benchmark::State &state){ \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size); \ - } \ - ), \ - benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_pairs") \ - + "." + \ - "(number_of_segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - [=](benchmark::State &state){ \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true); \ - } \ - ), \ - benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_pairs") \ - + "." + \ - "(number_of_segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - [=](benchmark::State &state){ \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size, false, true); \ - } \ - ), \ - benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_pairs") \ - + "." + \ - "(number_of_segments:~" \ - + std::to_string(SEGMENTS) \ - + " segments)" \ - ).c_str(), \ - [=](benchmark::State &state){ \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true, true); \ - } \ - ) -#define BENCHMARK_PAIR_TYPE(type, value) \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000) - -void add_sort_pairs_benchmarks(std::vector &benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = - { - BENCHMARK_PAIR_TYPE(int, float), - BENCHMARK_PAIR_TYPE(long long, double), - BENCHMARK_PAIR_TYPE(int8_t, int8_t), - BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), - BENCHMARK_PAIR_TYPE(int, custom_float2), - BENCHMARK_PAIR_TYPE(long long, custom_double2), - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_pairs") + \ + "." + \ + "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, size); \ + }), \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_pairs") + \ + "." + \ + "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, \ + size, true); \ + }), \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_pairs") + \ + "." + \ + "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, \ + size, false, true); \ + }), \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_pairs") + \ + "." + \ + "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State &state) { \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, \ + size, true, true); \ + }) +#define BENCHMARK_PAIR_TYPE(type, value) \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000) + +void add_sort_pairs_benchmarks( + std::vector &benchmarks, + hipStream_t stream, size_t size) { + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + BENCHMARK_PAIR_TYPE(int, float), + BENCHMARK_PAIR_TYPE(long long, double), + BENCHMARK_PAIR_TYPE(int8_t, int8_t), + BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), + BENCHMARK_PAIR_TYPE(int, custom_float2), + BENCHMARK_PAIR_TYPE(long long, custom_double2), + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_segmented_sort" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_sort_keys_benchmarks(benchmarks, stream, size); - add_sort_pairs_benchmarks(benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) - { - for (auto &b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_segmented_sort" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_sort_keys_benchmarks(benchmarks, stream, size); + add_sort_pairs_benchmarks(benchmarks, stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index 9ec49b01..47a4beb0 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,610 +25,441 @@ // HIP API #include "hipcub/device/device_select.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -void run_flagged_benchmark(benchmark::State& state, - size_t size, - const hipStream_t stream, - float true_probability) -{ - std::vector input; - std::vector flags = benchmark_utils::get_random_data01(size, true_probability); - if(std::is_floating_point::value) - { - input = benchmark_utils::get_random_data(size, T(-1000), T(1000)); - } - else - { - input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); - } - - T * d_input; - FlagType * d_flags; - T * d_output; - unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_flags, flags.data(), - flags.size() * sizeof(FlagType), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK(hipDeviceSynchronize()); - // Allocate temporary storage memory - size_t temp_storage_size_bytes = 0; - - // Get size of d_temp_storage - HIP_CHECK( - hipcub::DeviceSelect::Flagged( - nullptr, - temp_storage_size_bytes, - d_input, - d_flags, - d_output, - d_selected_count_output, - input.size(), - stream - ) - ); - HIP_CHECK(hipDeviceSynchronize()); - - // allocate temporary storage - void * d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < 10; i++) - { - HIP_CHECK( - hipcub::DeviceSelect::Flagged( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_flags, - d_output, - d_selected_count_output, - input.size(), - stream - ) - ); +template +void run_flagged_benchmark(benchmark::State &state, size_t size, + const hipStream_t stream, float true_probability) { + std::vector input; + std::vector flags = + benchmark_utils::get_random_data01(size, true_probability); + if (std::is_floating_point::value) { + input = benchmark_utils::get_random_data(size, T(-1000), T(1000)); + } else { + input = benchmark_utils::get_random_data( + size, std::numeric_limits::min(), std::numeric_limits::max()); + } + + T *d_input; + FlagType *d_flags; + T *d_output; + unsigned int *d_selected_count_output; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(FlagType), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + // Allocate temporary storage memory + size_t temp_storage_size_bytes = 0; + + // Get size of d_temp_storage + HIP_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_size_bytes, d_input, d_flags, d_output, + d_selected_count_output, input.size(), stream)); + HIP_CHECK(hipDeviceSynchronize()); + + // allocate temporary storage + void *d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < 10; i++) { + HIP_CHECK(hipcub::DeviceSelect::Flagged( + d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, + d_selected_count_output, input.size(), stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(hipcub::DeviceSelect::Flagged( + d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, + d_selected_count_output, input.size(), stream)); } HIP_CHECK(hipDeviceSynchronize()); - const unsigned int batch_size = 10; - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK( - hipcub::DeviceSelect::Flagged( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_flags, - d_output, - d_selected_count_output, - input.size(), - stream - ) - ); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - hipFree(d_input); - hipFree(d_flags); - hipFree(d_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); - HIP_CHECK(hipDeviceSynchronize()); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + hipFree(d_input); + hipFree(d_flags); + hipFree(d_output); + hipFree(d_selected_count_output); + hipFree(d_temp_storage); + HIP_CHECK(hipDeviceSynchronize()); } -template -void run_selectop_benchmark(benchmark::State& state, - size_t size, - const hipStream_t stream, - float true_probability) -{ - std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); - - auto select_op = [true_probability] __device__ (const T& value) -> bool - { - if(value < T(1000 * true_probability)) return true; - return false; - }; - - T * d_input; - T * d_output; - unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK(hipDeviceSynchronize()); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes; - - // Get size of d_temp_storage - HIP_CHECK( - hipcub::DeviceSelect::If( - nullptr, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - select_op, - stream - ) - ); - HIP_CHECK(hipDeviceSynchronize()); - - // allocate temporary storage - void * d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < 10; i++) - { - HIP_CHECK( - hipcub::DeviceSelect::If( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - select_op, - stream - ) - ); +template +void run_selectop_benchmark(benchmark::State &state, size_t size, + const hipStream_t stream, float true_probability) { + std::vector input = + benchmark_utils::get_random_data(size, T(0), T(1000)); + + auto select_op = [true_probability] __device__(const T &value) -> bool { + if (value < T(1000 * true_probability)) + return true; + return false; + }; + + T *d_input; + T *d_output; + unsigned int *d_selected_count_output; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes; + + // Get size of d_temp_storage + HIP_CHECK(hipcub::DeviceSelect::If(nullptr, temp_storage_size_bytes, d_input, + d_output, d_selected_count_output, + input.size(), select_op, stream)); + HIP_CHECK(hipDeviceSynchronize()); + + // allocate temporary storage + void *d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < 10; i++) { + HIP_CHECK(hipcub::DeviceSelect::If( + d_temp_storage, temp_storage_size_bytes, d_input, d_output, + d_selected_count_output, input.size(), select_op, stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(hipcub::DeviceSelect::If( + d_temp_storage, temp_storage_size_bytes, d_input, d_output, + d_selected_count_output, input.size(), select_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); - const unsigned int batch_size = 10; - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK( - hipcub::DeviceSelect::If( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - select_op, - stream - ) - ); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - hipFree(d_input); - hipFree(d_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); - HIP_CHECK(hipDeviceSynchronize()); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + hipFree(d_input); + hipFree(d_output); + hipFree(d_selected_count_output); + hipFree(d_temp_storage); + HIP_CHECK(hipDeviceSynchronize()); } -template -void run_unique_benchmark(benchmark::State& state, - size_t size, +template +void run_unique_benchmark(benchmark::State &state, size_t size, const hipStream_t stream, - float discontinuity_probability) -{ - hipcub::Sum op; - - std::vector input(size); - { - auto input01 = benchmark_utils::get_random_data01(size, discontinuity_probability); - auto acc = input01[0]; - input[0] = acc; - for(size_t i = 1; i < input01.size(); i++) - { - input[i] = op(acc, input01[i]); - } + float discontinuity_probability) { + hipcub::Sum op; + + std::vector input(size); + { + auto input01 = + benchmark_utils::get_random_data01(size, discontinuity_probability); + auto acc = input01[0]; + input[0] = acc; + for (size_t i = 1; i < input01.size(); i++) { + input[i] = op(acc, input01[i]); } - - T * d_input; - T * d_output; - unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK(hipDeviceSynchronize()); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes; - - // Get size of d_temp_storage - HIP_CHECK( - hipcub::DeviceSelect::Unique( - nullptr, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - stream - ) - ); - HIP_CHECK(hipDeviceSynchronize()); - - // allocate temporary storage - void * d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < 10; i++) - { - HIP_CHECK( - hipcub::DeviceSelect::Unique( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - stream - ) - ); + } + + T *d_input; + T *d_output; + unsigned int *d_selected_count_output; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes; + + // Get size of d_temp_storage + HIP_CHECK(hipcub::DeviceSelect::Unique( + nullptr, temp_storage_size_bytes, d_input, d_output, + d_selected_count_output, input.size(), stream)); + HIP_CHECK(hipDeviceSynchronize()); + + // allocate temporary storage + void *d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < 10; i++) { + HIP_CHECK(hipcub::DeviceSelect::Unique( + d_temp_storage, temp_storage_size_bytes, d_input, d_output, + d_selected_count_output, input.size(), stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(hipcub::DeviceSelect::Unique( + d_temp_storage, temp_storage_size_bytes, d_input, d_output, + d_selected_count_output, input.size(), stream)); } HIP_CHECK(hipDeviceSynchronize()); - const unsigned int batch_size = 10; - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK( - hipcub::DeviceSelect::Unique( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - stream - ) - ); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - hipFree(d_input); - hipFree(d_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + hipFree(d_input); + hipFree(d_output); + hipFree(d_selected_count_output); + hipFree(d_temp_storage); } -template -void run_unique_by_key_benchmark(benchmark::State& state, - size_t size, +template +void run_unique_by_key_benchmark(benchmark::State &state, size_t size, const hipStream_t stream, - float discontinuity_probability) -{ - hipcub::Sum op; + float discontinuity_probability) { + hipcub::Sum op; - std::vector input_keys(size); - { - auto input01 = benchmark_utils::get_random_data01(size, discontinuity_probability); - auto acc = input01[0]; + std::vector input_keys(size); + { + auto input01 = benchmark_utils::get_random_data01( + size, discontinuity_probability); + auto acc = input01[0]; - input_keys[0] = acc; + input_keys[0] = acc; - for (size_t i = 1; i < input01.size(); i++) - { - input_keys[i] = op(acc, input01[i]); - } + for (size_t i = 1; i < input01.size(); i++) { + input_keys[i] = op(acc, input01[i]); } - - const auto input_values - = benchmark_utils::get_random_data(size, ValueT(-1000), ValueT(1000)); - - KeyT* d_keys_input; - ValueT* d_values_input; - KeyT* d_keys_output; - ValueT* d_values_output; - unsigned int* d_selected_count_output; - - HIP_CHECK(hipMalloc(&d_keys_input, input_keys.size() * sizeof(input_keys[0]))); - HIP_CHECK(hipMalloc(&d_values_input, input_values.size() * sizeof(input_values[0]))); - HIP_CHECK(hipMalloc(&d_keys_output, input_keys.size() * sizeof(input_keys[0]))); - HIP_CHECK(hipMalloc(&d_values_output, input_values.size() * sizeof(input_values[0]))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); - - HIP_CHECK( - hipMemcpy( - d_keys_input, - input_keys.data(), - input_keys.size() * sizeof(input_keys[0]), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_values_input, - input_values.data(), - input_values.size() * sizeof(input_values[0]), - hipMemcpyHostToDevice - ) - ); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes; - - // Get size of d_temp_storage - HIP_CHECK( - hipcub::DeviceSelect::UniqueByKey( - nullptr, - temp_storage_size_bytes, - d_keys_input, - d_values_input, - d_keys_output, - d_values_output, - d_selected_count_output, - input_keys.size(), - stream - ) - ); - HIP_CHECK(hipDeviceSynchronize()); - - // allocate temporary storage - void* d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < 10; i++) - { - HIP_CHECK( - hipcub::DeviceSelect::UniqueByKey( - d_temp_storage, - temp_storage_size_bytes, - d_keys_input, - d_values_input, - d_keys_output, - d_values_output, - d_selected_count_output, - input_keys.size(), - stream - ) - ); + } + + const auto input_values = benchmark_utils::get_random_data( + size, ValueT(-1000), ValueT(1000)); + + KeyT *d_keys_input; + ValueT *d_values_input; + KeyT *d_keys_output; + ValueT *d_values_output; + unsigned int *d_selected_count_output; + + HIP_CHECK( + hipMalloc(&d_keys_input, input_keys.size() * sizeof(input_keys[0]))); + HIP_CHECK(hipMalloc(&d_values_input, + input_values.size() * sizeof(input_values[0]))); + HIP_CHECK( + hipMalloc(&d_keys_output, input_keys.size() * sizeof(input_keys[0]))); + HIP_CHECK(hipMalloc(&d_values_output, + input_values.size() * sizeof(input_values[0]))); + HIP_CHECK( + hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); + + HIP_CHECK(hipMemcpy(d_keys_input, input_keys.data(), + input_keys.size() * sizeof(input_keys[0]), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_values_input, input_values.data(), + input_values.size() * sizeof(input_values[0]), + hipMemcpyHostToDevice)); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes; + + // Get size of d_temp_storage + HIP_CHECK(hipcub::DeviceSelect::UniqueByKey( + nullptr, temp_storage_size_bytes, d_keys_input, d_values_input, + d_keys_output, d_values_output, d_selected_count_output, + input_keys.size(), stream)); + HIP_CHECK(hipDeviceSynchronize()); + + // allocate temporary storage + void *d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < 10; i++) { + HIP_CHECK(hipcub::DeviceSelect::UniqueByKey( + d_temp_storage, temp_storage_size_bytes, d_keys_input, d_values_input, + d_keys_output, d_values_output, d_selected_count_output, + input_keys.size(), stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(hipcub::DeviceSelect::UniqueByKey( + d_temp_storage, temp_storage_size_bytes, d_keys_input, d_values_input, + d_keys_output, d_values_output, d_selected_count_output, + input_keys.size(), stream)); } HIP_CHECK(hipDeviceSynchronize()); - const unsigned int batch_size = 10; - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < batch_size; i++) - { - HIP_CHECK( - hipcub::DeviceSelect::UniqueByKey( - d_temp_storage, - temp_storage_size_bytes, - d_keys_input, - d_values_input, - d_keys_output, - d_values_output, - d_selected_count_output, - input_keys.size(), - stream - ) - ); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(KeyT) + sizeof(ValueT))); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - hipFree(d_keys_input); - hipFree(d_values_input); - hipFree(d_keys_output); - hipFree(d_values_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * + (sizeof(KeyT) + sizeof(ValueT))); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + hipFree(d_keys_input); + hipFree(d_values_input); + hipFree(d_keys_output); + hipFree(d_values_output); + hipFree(d_selected_count_output); + hipFree(d_temp_storage); } -#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ -benchmark::RegisterBenchmark( \ - ("device_select_flagged.(probability:" #p")"), \ - &run_flagged_benchmark, size, stream, p \ -) - -#define CREATE_SELECT_IF_BENCHMARK(T, p) \ -benchmark::RegisterBenchmark( \ - ("device_select_if.(probability:" #p")"), \ - &run_selectop_benchmark, size, stream, p \ -) - -#define CREATE_UNIQUE_BENCHMARK(T, p) \ -benchmark::RegisterBenchmark( \ - ("device_select_unique.(probability:" #p")"), \ - &run_unique_benchmark, size, stream, p \ -) - -#define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ -benchmark::RegisterBenchmark( \ - ("device_select_unique_by_key.(probability:" #p")"), \ - &run_unique_by_key_benchmark, size, stream, p \ -) - -#define BENCHMARK_FLAGGED_TYPE(type, value) \ - CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.05f), \ - CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.25f), \ - CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.5f), \ - CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.75f) - -#define BENCHMARK_IF_TYPE(type) \ - CREATE_SELECT_IF_BENCHMARK(type, 0.05f), \ - CREATE_SELECT_IF_BENCHMARK(type, 0.25f), \ - CREATE_SELECT_IF_BENCHMARK(type, 0.5f), \ - CREATE_SELECT_IF_BENCHMARK(type, 0.75f) - -#define BENCHMARK_UNIQUE_TYPE(type) \ - CREATE_UNIQUE_BENCHMARK(type, 0.05f), \ - CREATE_UNIQUE_BENCHMARK(type, 0.25f), \ - CREATE_UNIQUE_BENCHMARK(type, 0.5f), \ - CREATE_UNIQUE_BENCHMARK(type, 0.75f) - -#define BENCHMARK_UNIQUE_BY_KEY_TYPE(key_type, value_type) \ - CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.05f), \ - CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.25f), \ - CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.5f), \ - CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.75f) - -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_select" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - using custom_double2 = benchmark_utils::custom_type; - using custom_int_double = benchmark_utils::custom_type; - - // Add benchmarks - std::vector benchmarks = - { - BENCHMARK_FLAGGED_TYPE(int, unsigned char), - BENCHMARK_FLAGGED_TYPE(float, unsigned char), - BENCHMARK_FLAGGED_TYPE(double, unsigned char), - BENCHMARK_FLAGGED_TYPE(uint8_t, uint8_t), - BENCHMARK_FLAGGED_TYPE(int8_t, int8_t), - BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), - - BENCHMARK_IF_TYPE(int), - BENCHMARK_IF_TYPE(float), - BENCHMARK_IF_TYPE(double), - BENCHMARK_IF_TYPE(uint8_t), - BENCHMARK_IF_TYPE(int8_t), - BENCHMARK_IF_TYPE(custom_int_double), - - BENCHMARK_UNIQUE_TYPE(int), - BENCHMARK_UNIQUE_TYPE(float), - BENCHMARK_UNIQUE_TYPE(double), - BENCHMARK_UNIQUE_TYPE(uint8_t), - BENCHMARK_UNIQUE_TYPE(int8_t), - BENCHMARK_UNIQUE_TYPE(custom_int_double), - - BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int), - BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double), - BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2), - BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t), - BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, double), - BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double) - }; - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ + benchmark::RegisterBenchmark( \ + ("device_select_flagged.(probability:" #p ")"), \ + &run_flagged_benchmark, size, stream, p) + +#define CREATE_SELECT_IF_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark( \ + ("device_select_if.(probability:" #p ")"), \ + &run_selectop_benchmark, size, stream, p) + +#define CREATE_UNIQUE_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark( \ + ("device_select_unique.(probability:" #p ")"), \ + &run_unique_benchmark, size, stream, p) + +#define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ + benchmark::RegisterBenchmark( \ + ("device_select_unique_by_key.(probability:" #p ")"), \ + &run_unique_by_key_benchmark, size, stream, p) + +#define BENCHMARK_FLAGGED_TYPE(type, value) \ + CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.05f), \ + CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.25f), \ + CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.5f), \ + CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.75f) + +#define BENCHMARK_IF_TYPE(type) \ + CREATE_SELECT_IF_BENCHMARK(type, 0.05f), \ + CREATE_SELECT_IF_BENCHMARK(type, 0.25f), \ + CREATE_SELECT_IF_BENCHMARK(type, 0.5f), \ + CREATE_SELECT_IF_BENCHMARK(type, 0.75f) + +#define BENCHMARK_UNIQUE_TYPE(type) \ + CREATE_UNIQUE_BENCHMARK(type, 0.05f), CREATE_UNIQUE_BENCHMARK(type, 0.25f), \ + CREATE_UNIQUE_BENCHMARK(type, 0.5f), \ + CREATE_UNIQUE_BENCHMARK(type, 0.75f) + +#define BENCHMARK_UNIQUE_BY_KEY_TYPE(key_type, value_type) \ + CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.05f), \ + CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.25f), \ + CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.5f), \ + CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.75f) + +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_select" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + using custom_double2 = benchmark_utils::custom_type; + using custom_int_double = benchmark_utils::custom_type; + + // Add benchmarks + std::vector benchmarks = { + BENCHMARK_FLAGGED_TYPE(int, unsigned char), + BENCHMARK_FLAGGED_TYPE(float, unsigned char), + BENCHMARK_FLAGGED_TYPE(double, unsigned char), + BENCHMARK_FLAGGED_TYPE(uint8_t, uint8_t), + BENCHMARK_FLAGGED_TYPE(int8_t, int8_t), + BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), + + BENCHMARK_IF_TYPE(int), + BENCHMARK_IF_TYPE(float), + BENCHMARK_IF_TYPE(double), + BENCHMARK_IF_TYPE(uint8_t), + BENCHMARK_IF_TYPE(int8_t), + BENCHMARK_IF_TYPE(custom_int_double), + + BENCHMARK_UNIQUE_TYPE(int), + BENCHMARK_UNIQUE_TYPE(float), + BENCHMARK_UNIQUE_TYPE(double), + BENCHMARK_UNIQUE_TYPE(uint8_t), + BENCHMARK_UNIQUE_TYPE(int8_t), + BENCHMARK_UNIQUE_TYPE(custom_int_double), + + BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int), + BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double), + BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2), + BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t), + BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, double), + BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double)}; + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); - return 0; + return 0; } diff --git a/benchmark/benchmark_device_spmv.cpp b/benchmark/benchmark_device_spmv.cpp index a0ac69a2..aaa5ffa6 100644 --- a/benchmark/benchmark_device_spmv.cpp +++ b/benchmark/benchmark_device_spmv.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -32,207 +32,192 @@ const size_t DEFAULT_N = 1024 * 32; const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template -void run_benchmark(benchmark::State& state, - size_t size, - const hipStream_t stream, - float probability) -{ - const T rand_min = T(1); - const T rand_max = T(10); - - // generate a lexicograhically sorted list of (row, column) index tuples - // number of nonzeroes cannot be guaranteed as duplicates may exist - const int num_nonzeroes_attempt = static_cast(std::min( - static_cast(INT_MAX), static_cast(probability * static_cast(size * size)))); - std::vector> indices(num_nonzeroes_attempt); - { - std::vector flat_indices = benchmark_utils::get_random_data( - 2 * num_nonzeroes_attempt, 0, size - 1, 2 * num_nonzeroes_attempt); - for(int i = 0; i < num_nonzeroes_attempt; i++) - { - indices[i] = std::make_pair(flat_indices[2 * i], flat_indices[2 * i + 1]); - } - std::sort(indices.begin(), indices.end()); +template +void run_benchmark(benchmark::State &state, size_t size, + const hipStream_t stream, float probability) { + const T rand_min = T(1); + const T rand_max = T(10); + + // generate a lexicograhically sorted list of (row, column) index tuples + // number of nonzeroes cannot be guaranteed as duplicates may exist + const int num_nonzeroes_attempt = static_cast(std::min( + static_cast(INT_MAX), + static_cast(probability * static_cast(size * size)))); + std::vector> indices(num_nonzeroes_attempt); + { + std::vector flat_indices = benchmark_utils::get_random_data( + 2 * num_nonzeroes_attempt, 0, size - 1, 2 * num_nonzeroes_attempt); + for (int i = 0; i < num_nonzeroes_attempt; i++) { + indices[i] = std::make_pair(flat_indices[2 * i], flat_indices[2 * i + 1]); } - - // generate the compressed sparse rows matrix - std::pair prev_cell = std::make_pair(-1, -1); - int num_nonzeroes = 0; - std::vector row_offsets(size + 1); - // this vector might be too large, but doing the allocation now eliminates a scan - std::vector column_indices(num_nonzeroes_attempt); - row_offsets[0] = 0; - int last_row_written = 0; - for(int i = 0; i < num_nonzeroes_attempt; i++) - { - if(indices[i] != prev_cell) - { - // update the row offets if we go to the next row (or skip some) - if(indices[i].first != last_row_written) - { - for(int j = last_row_written + 1; j <= indices[i].first; j++) - { - row_offsets[j] = num_nonzeroes; - } - last_row_written = indices[i].first; - } - - column_indices[num_nonzeroes++] = indices[i].second; - - prev_cell = indices[i]; + std::sort(indices.begin(), indices.end()); + } + + // generate the compressed sparse rows matrix + std::pair prev_cell = std::make_pair(-1, -1); + int num_nonzeroes = 0; + std::vector row_offsets(size + 1); + // this vector might be too large, but doing the allocation now eliminates a + // scan + std::vector column_indices(num_nonzeroes_attempt); + row_offsets[0] = 0; + int last_row_written = 0; + for (int i = 0; i < num_nonzeroes_attempt; i++) { + if (indices[i] != prev_cell) { + // update the row offets if we go to the next row (or skip some) + if (indices[i].first != last_row_written) { + for (int j = last_row_written + 1; j <= indices[i].first; j++) { + row_offsets[j] = num_nonzeroes; } - } - // fill in the entries for any missing rows - for(int j = last_row_written + 1; j < static_cast(size) + 1; j++) - { - row_offsets[j] = num_nonzeroes; - } + last_row_written = indices[i].first; + } - // generate the random data once the actual number of nonzeroes are known - std::vector values = benchmark_utils::get_random_data(num_nonzeroes, rand_min, rand_max); - - std::vector vector_x = benchmark_utils::get_random_data(size, rand_min, rand_max); - - T * d_values; - int * d_row_offsets; - int * d_column_indices; - T * d_vector_x; - T * d_vector_y; - HIP_CHECK(hipMalloc(&d_values, values.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_row_offsets, row_offsets.size() * sizeof(int))); - HIP_CHECK(hipMalloc(&d_column_indices, num_nonzeroes * sizeof(int))); - HIP_CHECK(hipMalloc(&d_vector_x, vector_x.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_vector_y, size * sizeof(T))); - HIP_CHECK(hipMemcpy( - d_values, values.data(), values.size() * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy( - d_row_offsets, row_offsets.data(), row_offsets.size() * sizeof(int), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy( - d_column_indices, column_indices.data(), num_nonzeroes * sizeof(int), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy( - d_vector_x, vector_x.data(), vector_x.size() * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); + column_indices[num_nonzeroes++] = indices[i].second; - // Allocate temporary storage memory - size_t temp_storage_size_bytes; - - // Get size of d_temp_storage + prev_cell = indices[i]; + } + } + // fill in the entries for any missing rows + for (int j = last_row_written + 1; j < static_cast(size) + 1; j++) { + row_offsets[j] = num_nonzeroes; + } + + // generate the random data once the actual number of nonzeroes are known + std::vector values = + benchmark_utils::get_random_data(num_nonzeroes, rand_min, rand_max); + + std::vector vector_x = + benchmark_utils::get_random_data(size, rand_min, rand_max); + + T *d_values; + int *d_row_offsets; + int *d_column_indices; + T *d_vector_x; + T *d_vector_y; + HIP_CHECK(hipMalloc(&d_values, values.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_row_offsets, row_offsets.size() * sizeof(int))); + HIP_CHECK(hipMalloc(&d_column_indices, num_nonzeroes * sizeof(int))); + HIP_CHECK(hipMalloc(&d_vector_x, vector_x.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_vector_y, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_values, values.data(), values.size() * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_row_offsets, row_offsets.data(), + row_offsets.size() * sizeof(int), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_column_indices, column_indices.data(), + num_nonzeroes * sizeof(int), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_vector_x, vector_x.data(), vector_x.size() * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes; + + // Get size of d_temp_storage + HIP_CHECK(hipcub::DeviceSpmv::CsrMV(nullptr, temp_storage_size_bytes, + d_values, d_row_offsets, d_column_indices, + d_vector_x, d_vector_y, size, size, + num_nonzeroes, stream)); + HIP_CHECK(hipDeviceSynchronize()); + + // allocate temporary storage + void *d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for (size_t i = 0; i < warmup_size; i++) { HIP_CHECK(hipcub::DeviceSpmv::CsrMV( - nullptr, temp_storage_size_bytes, d_values, d_row_offsets, - d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream)); - HIP_CHECK(hipDeviceSynchronize()); - - // allocate temporary storage - void * d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - HIP_CHECK(hipcub::DeviceSpmv::CsrMV( - d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, - d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream)); + d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, + d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < batch_size; i++) { + HIP_CHECK(hipcub::DeviceSpmv::CsrMV( + d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, + d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - for(size_t i = 0; i < batch_size; i++) - { - HIP_CHECK(hipcub::DeviceSpmv::CsrMV( - d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, - d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * (num_nonzeroes + size) * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * (num_nonzeroes + size)); - - hipFree(d_temp_storage); - hipFree(d_vector_y); - hipFree(d_vector_x); - hipFree(d_column_indices); - hipFree(d_row_offsets); - hipFree(d_values); - HIP_CHECK(hipDeviceSynchronize()); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * + (num_nonzeroes + size) * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * + (num_nonzeroes + size)); + + hipFree(d_temp_storage); + hipFree(d_vector_y); + hipFree(d_vector_x); + hipFree(d_column_indices); + hipFree(d_row_offsets); + hipFree(d_values); + HIP_CHECK(hipDeviceSynchronize()); } #define CREATE_BENCHMARK(T, p) \ - benchmark::RegisterBenchmark( \ - (std::string("device_spmv_CsrMV.") \ - ).c_str(), \ - &run_benchmark, size, stream, p \ - ) - -#define BENCHMARK_TYPE(type) \ - CREATE_BENCHMARK(type, 1.0e-6f), \ - CREATE_BENCHMARK(type, 1.0e-5f), \ - CREATE_BENCHMARK(type, 1.0e-4f), \ - CREATE_BENCHMARK(type, 1.0e-3f), \ - CREATE_BENCHMARK(type, 1.0e-2f) - -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_device_spmv" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks = - { - BENCHMARK_TYPE(int), - BENCHMARK_TYPE(unsigned int), - BENCHMARK_TYPE(float), - BENCHMARK_TYPE(double), - }; - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } + benchmark::RegisterBenchmark( \ + (std::string("device_spmv_CsrMV.")) \ + .c_str(), \ + &run_benchmark, size, stream, p) + +#define BENCHMARK_TYPE(type) \ + CREATE_BENCHMARK(type, 1.0e-6f), CREATE_BENCHMARK(type, 1.0e-5f), \ + CREATE_BENCHMARK(type, 1.0e-4f), CREATE_BENCHMARK(type, 1.0e-3f), \ + CREATE_BENCHMARK(type, 1.0e-2f) + +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_spmv" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks = { + BENCHMARK_TYPE(int), + BENCHMARK_TYPE(unsigned int), + BENCHMARK_TYPE(float), + BENCHMARK_TYPE(double), + }; + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); - return 0; + return 0; } diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index 6e2f9793..a68fd7db 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -24,15 +24,15 @@ #define HIPCUB_BENCHMARK_UTILS_HPP_ #ifndef BENCHMARK_UTILS_INCLUDE_GUARD - #error benchmark_utils.hpp must ONLY be included by common_benchmark_header.hpp. Please include common_benchmark_header.hpp instead. +#error benchmark_utils.hpp must ONLY be included by common_benchmark_header.hpp. Please include common_benchmark_header.hpp instead. #endif // hipCUB API #ifdef __HIP_PLATFORM_AMD__ - #include "hipcub/backend/rocprim/util_ptx.hpp" +#include "hipcub/backend/rocprim/util_ptx.hpp" #elif defined(__HIP_PLATFORM_NVIDIA__) - #include "hipcub/config.hpp" - #include +#include "hipcub/config.hpp" +#include #endif #include "hipcub/tuple.hpp" @@ -43,448 +43,391 @@ #define HIPCUB_WARP_THREADS_MACRO CUB_PTX_WARP_THREADS #endif -namespace benchmark_utils -{ +namespace benchmark_utils { const size_t default_max_random_size = 1024 * 1024; // get_random_data() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) - -> typename std::enable_if::value, std::vector>::type -{ - std::random_device rd; - std::default_random_engine gen(rd()); - using distribution_type = typename std::conditional<(sizeof(T)==1), short, T>::type; - std::uniform_int_distribution distribution(min, max); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); - for(size_t i = max_random_size; i < size; i += max_random_size) - { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); - } - return data; +template +inline auto get_random_data(size_t size, T min, T max, + size_t max_random_size = default_max_random_size) -> + typename std::enable_if::value, std::vector>::type { + std::random_device rd; + std::default_random_engine gen(rd()); + using distribution_type = + typename std::conditional<(sizeof(T) == 1), short, T>::type; + std::uniform_int_distribution distribution(min, max); + std::vector data(size); + std::generate(data.begin(), data.begin() + std::min(size, max_random_size), + [&]() { return distribution(gen); }); + for (size_t i = max_random_size; i < size; i += max_random_size) { + std::copy_n(data.begin(), std::min(size - i, max_random_size), + data.begin() + i); + } + return data; } -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) - -> typename std::enable_if::value, std::vector>::type -{ - std::random_device rd; - std::default_random_engine gen(rd()); - std::uniform_real_distribution distribution(min, max); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); - for(size_t i = max_random_size; i < size; i += max_random_size) - { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); - } - return data; +template +inline auto get_random_data(size_t size, T min, T max, + size_t max_random_size = default_max_random_size) -> + typename std::enable_if::value, + std::vector>::type { + std::random_device rd; + std::default_random_engine gen(rd()); + std::uniform_real_distribution distribution(min, max); + std::vector data(size); + std::generate(data.begin(), data.begin() + std::min(size, max_random_size), + [&]() { return distribution(gen); }); + for (size_t i = max_random_size; i < size; i += max_random_size) { + std::copy_n(data.begin(), std::min(size - i, max_random_size), + data.begin() + i); + } + return data; } -template -inline std::vector get_random_data01(size_t size, float p, size_t max_random_size = default_max_random_size) -{ - std::random_device rd; - std::default_random_engine gen(rd()); - std::bernoulli_distribution distribution(p); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); - for(size_t i = max_random_size; i < size; i += max_random_size) - { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); - } - return data; +template +inline std::vector +get_random_data01(size_t size, float p, + size_t max_random_size = default_max_random_size) { + std::random_device rd; + std::default_random_engine gen(rd()); + std::bernoulli_distribution distribution(p); + std::vector data(size); + std::generate(data.begin(), data.begin() + std::min(size, max_random_size), + [&]() { return distribution(gen); }); + for (size_t i = max_random_size; i < size; i += max_random_size) { + std::copy_n(data.begin(), std::min(size - i, max_random_size), + data.begin() + i); + } + return data; } -template -inline T get_random_value(T min, T max) -{ - return get_random_data(1, min, max)[0]; +template inline T get_random_value(T min, T max) { + return get_random_data(1, min, max)[0]; } - // Can't use std::prefix_sum for inclusive/exclusive scan, because // it does not handle short[] -> int(int a, int b) { a + b; } -> int[] // they way we expect. That's because sum in std::prefix_sum's implementation // is of type typename std::iterator_traits::value_type (short) -template -OutputIt host_inclusive_scan(InputIt first, InputIt last, - OutputIt d_first, BinaryOperation op) -{ - using input_type = typename std::iterator_traits::value_type; - using output_type = typename std::iterator_traits::value_type; - using result_type = - typename std::conditional< - std::is_void::value, input_type, output_type - >::type; - - if (first == last) return d_first; - - result_type sum = *first; - *d_first = sum; - - while (++first != last) { - sum = op(sum, static_cast(*first)); - *++d_first = sum; - } - return ++d_first; +template +OutputIt host_inclusive_scan(InputIt first, InputIt last, OutputIt d_first, + BinaryOperation op) { + using input_type = typename std::iterator_traits::value_type; + using output_type = typename std::iterator_traits::value_type; + using result_type = + typename std::conditional::value, input_type, + output_type>::type; + + if (first == last) + return d_first; + + result_type sum = *first; + *d_first = sum; + + while (++first != last) { + sum = op(sum, static_cast(*first)); + *++d_first = sum; + } + return ++d_first; } -template -OutputIt host_exclusive_scan(InputIt first, InputIt last, - T initial_value, OutputIt d_first, - BinaryOperation op) -{ - using input_type = typename std::iterator_traits::value_type; - using output_type = typename std::iterator_traits::value_type; - using result_type = - typename std::conditional< - std::is_void::value, input_type, output_type - >::type; - - if (first == last) return d_first; - - result_type sum = initial_value; - *d_first = initial_value; - - while ((first+1) != last) - { - sum = op(sum, static_cast(*first)); - *++d_first = sum; - first++; - } - return ++d_first; +template +OutputIt host_exclusive_scan(InputIt first, InputIt last, T initial_value, + OutputIt d_first, BinaryOperation op) { + using input_type = typename std::iterator_traits::value_type; + using output_type = typename std::iterator_traits::value_type; + using result_type = + typename std::conditional::value, input_type, + output_type>::type; + + if (first == last) + return d_first; + + result_type sum = initial_value; + *d_first = initial_value; + + while ((first + 1) != last) { + sum = op(sum, static_cast(*first)); + *++d_first = sum; + first++; + } + return ++d_first; } -template +template OutputIt host_exclusive_scan_by_key(InputIt first, InputIt last, KeyIt k_first, T initial_value, OutputIt d_first, - BinaryOperation op, KeyCompare key_compare_op) -{ - using input_type = typename std::iterator_traits::value_type; - using output_type = typename std::iterator_traits::value_type; - using result_type = - typename std::conditional< - std::is_void::value, input_type, output_type - >::type; - - if (first == last) return d_first; - - result_type sum = initial_value; - *d_first = initial_value; - - while ((first+1) != last) - { - if(key_compare_op(*k_first, *++k_first)) - { - sum = op(sum, static_cast(*first)); - } - else - { - sum = initial_value; - } - *++d_first = sum; - first++; + BinaryOperation op, + KeyCompare key_compare_op) { + using input_type = typename std::iterator_traits::value_type; + using output_type = typename std::iterator_traits::value_type; + using result_type = + typename std::conditional::value, input_type, + output_type>::type; + + if (first == last) + return d_first; + + result_type sum = initial_value; + *d_first = initial_value; + + while ((first + 1) != last) { + if (key_compare_op(*k_first, *++k_first)) { + sum = op(sum, static_cast(*first)); + } else { + sum = initial_value; } - return ++d_first; + *++d_first = sum; + first++; + } + return ++d_first; } -template -struct custom_type -{ - using first_type = T; - using second_type = U; - - T x; - U y; +template struct custom_type { + using first_type = T; + using second_type = U; - HIPCUB_HOST_DEVICE inline - constexpr custom_type() : x(T()), y(U()) {} + T x; + U y; - HIPCUB_HOST_DEVICE inline - constexpr custom_type(T xx, U yy) : x(xx), y(yy) - { - } - - HIPCUB_HOST_DEVICE inline - constexpr custom_type(T xy) : x(xy), y(xy) - { - } + HIPCUB_HOST_DEVICE inline constexpr custom_type() : x(T()), y(U()) {} - template - HIPCUB_HOST_DEVICE inline - custom_type(const custom_type& other) : x(other.x), y(other.y) - { - } - - #ifndef HIPCUB_CUB_API - HIPCUB_HOST_DEVICE inline - ~custom_type() = default; - #endif - - HIPCUB_HOST_DEVICE inline - custom_type& operator=(const custom_type& other) - { - x = other.x; - y = other.y; - return *this; - } - - HIPCUB_HOST_DEVICE inline - custom_type operator+(const custom_type& rhs) const - { - return custom_type(x + rhs.x, y + rhs.y); - } - - HIPCUB_HOST_DEVICE inline - custom_type operator-(const custom_type& other) const - { - return custom_type(x - other.x, y - other.y); - } - - HIPCUB_HOST_DEVICE inline - bool operator<(const custom_type& rhs) const - { - // intentionally suboptimal choice for short-circuting, - // required to generate more performant device code - return ((x == rhs.x && y < rhs.y) || x < rhs.x); - } + HIPCUB_HOST_DEVICE inline constexpr custom_type(T xx, U yy) : x(xx), y(yy) {} - HIPCUB_HOST_DEVICE inline - bool operator>(const custom_type& other) const - { - return (x > other.x || (x == other.x && y > other.y)); - } + HIPCUB_HOST_DEVICE inline constexpr custom_type(T xy) : x(xy), y(xy) {} - HIPCUB_HOST_DEVICE inline - bool operator==(const custom_type& rhs) const - { - return x == rhs.x && y == rhs.y; - } + template + HIPCUB_HOST_DEVICE inline custom_type(const custom_type &other) + : x(other.x), y(other.y) {} - HIPCUB_HOST_DEVICE inline - bool operator!=(const custom_type& other) const - { - return !(*this == other); - } +#ifndef HIPCUB_CUB_API + HIPCUB_HOST_DEVICE inline ~custom_type() = default; +#endif - HIPCUB_HOST_DEVICE custom_type& operator+=(const custom_type& rhs) - { - this->x += rhs.x; - this->y += rhs.y; - return *this; - } + HIPCUB_HOST_DEVICE inline custom_type &operator=(const custom_type &other) { + x = other.x; + y = other.y; + return *this; + } + + HIPCUB_HOST_DEVICE inline custom_type + operator+(const custom_type &rhs) const { + return custom_type(x + rhs.x, y + rhs.y); + } + + HIPCUB_HOST_DEVICE inline custom_type + operator-(const custom_type &other) const { + return custom_type(x - other.x, y - other.y); + } + + HIPCUB_HOST_DEVICE inline bool operator<(const custom_type &rhs) const { + // intentionally suboptimal choice for short-circuting, + // required to generate more performant device code + return ((x == rhs.x && y < rhs.y) || x < rhs.x); + } + + HIPCUB_HOST_DEVICE inline bool operator>(const custom_type &other) const { + return (x > other.x || (x == other.x && y > other.y)); + } + + HIPCUB_HOST_DEVICE inline bool operator==(const custom_type &rhs) const { + return x == rhs.x && y == rhs.y; + } + + HIPCUB_HOST_DEVICE inline bool operator!=(const custom_type &other) const { + return !(*this == other); + } + + HIPCUB_HOST_DEVICE custom_type &operator+=(const custom_type &rhs) { + this->x += rhs.x; + this->y += rhs.y; + return *this; + } }; -template -struct is_custom_type : std::false_type {}; +template struct is_custom_type : std::false_type {}; -template -struct is_custom_type> : std::true_type {}; +template +struct is_custom_type> : std::true_type {}; -template -struct custom_type_decomposer -{ - static_assert(is_custom_type::value, - "custom_type_decomposer can only be used with instantiations of custom_type"); +template struct custom_type_decomposer { + static_assert(is_custom_type::value, + "custom_type_decomposer can only be used with instantiations " + "of custom_type"); - using T = typename CustomType::first_type; - using U = typename CustomType::second_type; + using T = typename CustomType::first_type; + using U = typename CustomType::second_type; - HIPCUB_HOST_DEVICE ::hipcub::tuple operator()(CustomType& key) const - { - return ::hipcub::tuple{key.x, key.y}; - } + HIPCUB_HOST_DEVICE ::hipcub::tuple + operator()(CustomType &key) const { + return ::hipcub::tuple{key.x, key.y}; + } }; -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value, std::vector>::type -{ - using first_type = typename T::first_type; - using second_type = typename T::second_type; - std::vector data(size); - auto fdata = get_random_data(size, min.x, max.x, max_random_size); - auto sdata = get_random_data(size, min.y, max.y, max_random_size); - for(size_t i = 0; i < size; i++) - { - data[i] = T(fdata[i], sdata[i]); - } - return data; +template +inline auto get_random_data(size_t size, T min, T max, + size_t max_random_size = 1024 * 1024) -> + typename std::enable_if::value, std::vector>::type { + using first_type = typename T::first_type; + using second_type = typename T::second_type; + std::vector data(size); + auto fdata = get_random_data(size, min.x, max.x, max_random_size); + auto sdata = + get_random_data(size, min.y, max.y, max_random_size); + for (size_t i = 0; i < size; i++) { + data[i] = T(fdata[i], sdata[i]); + } + return data; } -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value && !std::is_same::value, std::vector>::type -{ - - using field_type = decltype(max.x); - std::vector data(size); - auto field_data = get_random_data(size, min.x, max.x, max_random_size); - for(size_t i = 0; i < size; i++) - { - data[i] = T(field_data[i]); - } - return data; +template +inline auto get_random_data(size_t size, T min, T max, + size_t max_random_size = 1024 * 1024) -> + typename std::enable_if::value && + !std::is_same::value, + std::vector>::type { + + using field_type = decltype(max.x); + std::vector data(size); + auto field_data = + get_random_data(size, min.x, max.x, max_random_size); + for (size_t i = 0; i < size; i++) { + data[i] = T(field_data[i]); + } + return data; } -template +template std::vector get_random_segments(const size_t size, const size_t max_segment_length, - const int seed_value) -{ - static_assert(std::is_arithmetic::value, "Key type must be arithmetic"); - - std::default_random_engine prng(seed_value); - std::uniform_int_distribution segment_length_distribution(max_segment_length); - using key_distribution_type = std::conditional_t< - std::is_integral::value, - std::uniform_int_distribution, - std::uniform_real_distribution - >; - key_distribution_type key_distribution(std::numeric_limits::max()); - std::vector keys(size); - - size_t keys_start_index = 0; - while (keys_start_index < size) - { - const size_t new_segment_length = segment_length_distribution(prng); - const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); - const T key = key_distribution(prng); - std::fill( - std::next(keys.begin(), keys_start_index), - std::next(keys.begin(), new_segment_end), - key - ); - keys_start_index += new_segment_length; - } - return keys; + const int seed_value) { + static_assert(std::is_arithmetic::value, "Key type must be arithmetic"); + + std::default_random_engine prng(seed_value); + std::uniform_int_distribution segment_length_distribution( + max_segment_length); + using key_distribution_type = + std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution>; + key_distribution_type key_distribution(std::numeric_limits::max()); + std::vector keys(size); + + size_t keys_start_index = 0; + while (keys_start_index < size) { + const size_t new_segment_length = segment_length_distribution(prng); + const size_t new_segment_end = + std::min(size, keys_start_index + new_segment_length); + const T key = key_distribution(prng); + std::fill(std::next(keys.begin(), keys_start_index), + std::next(keys.begin(), new_segment_end), key); + keys_start_index += new_segment_length; + } + return keys; } -bool is_warp_size_supported(const unsigned required_warp_size) -{ - return HIPCUB_HOST_WARP_THREADS >= required_warp_size; +bool is_warp_size_supported(const unsigned required_warp_size) { + return HIPCUB_HOST_WARP_THREADS >= required_warp_size; } -template -__device__ constexpr bool device_test_enabled_for_warp_size_v - = HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize; +template +__device__ constexpr bool device_test_enabled_for_warp_size_v = + HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize; -template +template using it_value_t = typename std::iterator_traits::value_type; using engine_type = std::default_random_engine; // generate_random_data_n() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. -template -inline auto generate_random_data_n( - OutputIter it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if_t>::value, OutputIter> -{ - using T = it_value_t; - - using dis_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; - std::uniform_int_distribution distribution((T)min, (T)max); - std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); - for(size_t i = max_random_size; i < size; i += max_random_size) - { - std::copy_n(it, std::min(size - i, max_random_size), it + i); - } - return it + size; +template +inline auto generate_random_data_n(OutputIter it, size_t size, U min, V max, + Generator &gen, + size_t max_random_size = 1024 * 1024) -> + typename std::enable_if_t>::value, + OutputIter> { + using T = it_value_t; + + using dis_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; + std::uniform_int_distribution distribution((T)min, (T)max); + std::generate_n(it, std::min(size, max_random_size), + [&]() { return distribution(gen); }); + for (size_t i = max_random_size; i < size; i += max_random_size) { + std::copy_n(it, std::min(size - i, max_random_size), it + i); + } + return it + size; } -template -inline auto generate_random_data_n(OutputIterator it, - size_t size, - U min, - V max, - Generator& gen, - size_t max_random_size = 1024 * 1024) - -> std::enable_if_t>::value, OutputIterator> -{ - using T = typename std::iterator_traits::value_type; - - std::uniform_real_distribution distribution((T)min, (T)max); - std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); - for(size_t i = max_random_size; i < size; i += max_random_size) - { - std::copy_n(it, std::min(size - i, max_random_size), it + i); - } - return it + size; +template +inline auto generate_random_data_n(OutputIterator it, size_t size, U min, V max, + Generator &gen, + size_t max_random_size = 1024 * 1024) + -> std::enable_if_t< + std::is_floating_point>::value, + OutputIterator> { + using T = typename std::iterator_traits::value_type; + + std::uniform_real_distribution distribution((T)min, (T)max); + std::generate_n(it, std::min(size, max_random_size), + [&]() { return distribution(gen); }); + for (size_t i = max_random_size; i < size; i += max_random_size) { + std::copy_n(it, std::min(size - i, max_random_size), it + i); + } + return it + size; } -template -struct alignas(Alignment) custom_aligned_type -{ - unsigned char data[Size]; +template +struct alignas(Alignment) custom_aligned_type { + unsigned char data[Size]; }; -template::value && std::is_unsigned::value, int> = 0> -inline constexpr auto ceiling_div(const T a, const U b) -{ - return a / b + (a % b > 0 ? 1 : 0); +template < + typename T, typename U, + std::enable_if_t::value && std::is_unsigned::value, + int> = 0> +inline constexpr auto ceiling_div(const T a, const U b) { + return a / b + (a % b > 0 ? 1 : 0); } -} // end benchmark_util namespace +} // namespace benchmark_utils // Need for hipcub::DeviceReduce::Min/Max etc. -namespace std -{ - template<> - class numeric_limits> - { - using T = typename benchmark_utils::custom_type; - - public: - static constexpr inline T min() - { - return std::numeric_limits::min(); - } - - static constexpr inline T max() - { - return std::numeric_limits::max(); - } - - static constexpr inline T lowest() - { - return std::numeric_limits::lowest(); - } - }; - - template<> - class numeric_limits> - { - using T = typename benchmark_utils::custom_type; - - public: - static constexpr inline T min() - { - return std::numeric_limits::min(); - } - - static constexpr inline T max() - { - return std::numeric_limits::max(); - } - - static constexpr inline T lowest() - { - return std::numeric_limits::lowest(); - } - }; -} +namespace std { +template <> class numeric_limits> { + using T = typename benchmark_utils::custom_type; + +public: + static constexpr inline T min() { + return std::numeric_limits::min(); + } + + static constexpr inline T max() { + return std::numeric_limits::max(); + } + + static constexpr inline T lowest() { + return std::numeric_limits::lowest(); + } +}; + +template <> class numeric_limits> { + using T = typename benchmark_utils::custom_type; + +public: + static constexpr inline T min() { + return std::numeric_limits::min(); + } + + static constexpr inline T max() { + return std::numeric_limits::max(); + } + + static constexpr inline T lowest() { + return std::numeric_limits::lowest(); + } +}; +} // namespace std #endif // HIPCUB_BENCHMARK_UTILS_HPP_ diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp index cec62c71..3c649c3c 100644 --- a/benchmark/benchmark_warp_exchange.cpp +++ b/benchmark/benchmark_warp_exchange.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -31,342 +31,303 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__device__ auto warp_exchange_benchmark(T* d_output) - -> std::enable_if_t> -{ - T thread_data[ItemsPerThread]; - #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) - { - thread_data[i] = static_cast(i); - } - - using WarpExchangeT = ::hipcub::WarpExchange; - constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; - __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; - const unsigned warp_id = threadIdx.x / LogicalWarpSize; - - WarpExchangeT warp_exchange(temp_storage[warp_id]); - Op{}(warp_exchange, thread_data); - - #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) - { - const unsigned global_idx = (BlockSize * blockIdx.x + threadIdx.x) * ItemsPerThread + i; - d_output[global_idx] = thread_data[i]; - } +template +__device__ auto warp_exchange_benchmark(T *d_output) -> std::enable_if_t< + benchmark_utils::device_test_enabled_for_warp_size_v> { + T thread_data[ItemsPerThread]; +#pragma unroll + for (unsigned i = 0; i < ItemsPerThread; ++i) { + thread_data[i] = static_cast(i); + } + + using WarpExchangeT = + ::hipcub::WarpExchange; + constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; + __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; + const unsigned warp_id = threadIdx.x / LogicalWarpSize; + + WarpExchangeT warp_exchange(temp_storage[warp_id]); + Op{}(warp_exchange, thread_data); + +#pragma unroll + for (unsigned i = 0; i < ItemsPerThread; ++i) { + const unsigned global_idx = + (BlockSize * blockIdx.x + threadIdx.x) * ItemsPerThread + i; + d_output[global_idx] = thread_data[i]; + } } -template -__device__ auto warp_exchange_benchmark(T* /*d_output*/) - -> std::enable_if_t> -{} - -template -__global__ __launch_bounds__(BlockSize) void warp_exchange_kernel(T* d_output) -{ - warp_exchange_benchmark(d_output); -} - -template -__device__ auto warp_exchange_scatter_to_striped_benchmark(T* d_output) - -> std::enable_if_t> -{ - const unsigned warp_id = threadIdx.x / LogicalWarpSize; - T thread_data[ItemsPerThread]; - OffsetT thread_ranks[ItemsPerThread]; - #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) - { - thread_data[i] = static_cast(i); - thread_ranks[i] = static_cast(LogicalWarpSize - warp_id * ItemsPerThread - i - 1); - } - - using WarpExchangeT = ::hipcub::WarpExchange; - constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; - __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; +template +__device__ auto warp_exchange_benchmark(T * /*d_output*/) -> std::enable_if_t< + !benchmark_utils::device_test_enabled_for_warp_size_v> {} - WarpExchangeT(temp_storage[warp_id]).ScatterToStriped(thread_data, thread_ranks); +template +__global__ __launch_bounds__(BlockSize) void warp_exchange_kernel(T *d_output) { + warp_exchange_benchmark(d_output); +} - #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) - { - const unsigned striped_global_idx - = BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x; - d_output[striped_global_idx] = thread_data[i]; - } +template +__device__ auto warp_exchange_scatter_to_striped_benchmark(T *d_output) + -> std::enable_if_t< + benchmark_utils::device_test_enabled_for_warp_size_v> { + const unsigned warp_id = threadIdx.x / LogicalWarpSize; + T thread_data[ItemsPerThread]; + OffsetT thread_ranks[ItemsPerThread]; +#pragma unroll + for (unsigned i = 0; i < ItemsPerThread; ++i) { + thread_data[i] = static_cast(i); + thread_ranks[i] = static_cast(LogicalWarpSize - + warp_id * ItemsPerThread - i - 1); + } + + using WarpExchangeT = + ::hipcub::WarpExchange; + constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; + __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; + + WarpExchangeT(temp_storage[warp_id]) + .ScatterToStriped(thread_data, thread_ranks); + +#pragma unroll + for (unsigned i = 0; i < ItemsPerThread; ++i) { + const unsigned striped_global_idx = + BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x; + d_output[striped_global_idx] = thread_data[i]; + } } -template -__device__ auto warp_exchange_scatter_to_striped_benchmark(T* /*d_output*/) - -> std::enable_if_t> -{} - -template -__global__ __launch_bounds__(BlockSize) void warp_exchange_scatter_to_striped_kernel(T* d_output) -{ - warp_exchange_scatter_to_striped_benchmark( - d_output); +template +__device__ auto warp_exchange_scatter_to_striped_benchmark(T * /*d_output*/) + -> std::enable_if_t> {} + +template +__global__ __launch_bounds__( + BlockSize) void warp_exchange_scatter_to_striped_kernel(T *d_output) { + warp_exchange_scatter_to_striped_benchmark(d_output); } -template -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -{ - constexpr unsigned trials = 100; - constexpr unsigned items_per_block = BlockSize * ItemsPerThread; - const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); - - T * d_output; - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < trials; ++i) - { - warp_exchange_kernel - <<>>(d_output); - } - - HIP_CHECK(hipPeekAtLastError()) - HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { + constexpr unsigned trials = 100; + constexpr unsigned items_per_block = BlockSize * ItemsPerThread; + const unsigned size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + + T *d_output; + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < trials; ++i) { + warp_exchange_kernel + <<>>( + d_output); } - state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * trials * size); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipPeekAtLastError()) + HIP_CHECK(hipDeviceSynchronize()); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * trials * size); + + HIP_CHECK(hipFree(d_output)); } -template< - class T, - class OffsetT, - unsigned BlockSize, - unsigned ItemsPerThread, - unsigned LogicalWarpSize -> -void run_benchmark_scatter_to_striped(benchmark::State& state, hipStream_t stream, size_t N) -{ - constexpr unsigned trials = 100; - constexpr unsigned items_per_block = BlockSize * ItemsPerThread; - const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); - - T * d_output; - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < trials; ++i) - { - warp_exchange_scatter_to_striped_kernel - <<>>(d_output); - } - - HIP_CHECK(hipPeekAtLastError()) - HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); +template +void run_benchmark_scatter_to_striped(benchmark::State &state, + hipStream_t stream, size_t N) { + constexpr unsigned trials = 100; + constexpr unsigned items_per_block = BlockSize * ItemsPerThread; + const unsigned size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + + T *d_output; + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < trials; ++i) { + warp_exchange_scatter_to_striped_kernel + <<>>( + d_output); } - state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * trials * size); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipPeekAtLastError()) + HIP_CHECK(hipDeviceSynchronize()); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * trials * size); + + HIP_CHECK(hipFree(d_output)); } -struct StripedToBlockedOp -{ - template - __device__ void operator()(WarpExchangeT& warp_exchange, T (&thread_data)[ItemsPerThread]) const - { - warp_exchange.StripedToBlocked(thread_data, thread_data); - } +struct StripedToBlockedOp { + template + __device__ void operator()(WarpExchangeT &warp_exchange, + T (&thread_data)[ItemsPerThread]) const { + warp_exchange.StripedToBlocked(thread_data, thread_data); + } }; -struct BlockedToStripedOp -{ - template - __device__ void operator()(WarpExchangeT& warp_exchange, T (&thread_data)[ItemsPerThread]) const - { - warp_exchange.BlockedToStriped(thread_data, thread_data); - } +struct BlockedToStripedOp { + template + __device__ void operator()(WarpExchangeT &warp_exchange, + T (&thread_data)[ItemsPerThread]) const { + warp_exchange.BlockedToStriped(thread_data, thread_data); + } }; -#define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark( \ - std::string("warp_exchange_striped_to_blocked." \ - ).c_str(), \ - &run_benchmark, \ - stream, \ - size \ - ) - -#define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark( \ - std::string("warp_exchange_blocked_to_striped." \ - ).c_str(), \ - &run_benchmark, \ - stream, \ - size \ - ) - -#define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \ -benchmark::RegisterBenchmark( \ - std::string("warp_exchange_scatter_to_striped." \ - ).c_str(), \ - &run_benchmark_scatter_to_striped, \ - stream, size \ -) - -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_warp_exchange" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks{ - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 16), - CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 32), - CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 32), - - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, WARP_EXCHANGE_SHUFFLE), +#define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark( \ + std::string("warp_exchange_striped_to_blocked.") \ + .c_str(), \ + &run_benchmark, \ + stream, size) + +#define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark( \ + std::string("warp_exchange_blocked_to_striped.") \ + .c_str(), \ + &run_benchmark, \ + stream, size) + +#define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \ + benchmark::RegisterBenchmark( \ + std::string("warp_exchange_scatter_to_striped.") \ + .c_str(), \ + &run_benchmark_scatter_to_striped, stream, \ + size) + +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_warp_exchange" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks{ + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 16), + CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 32), + CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 32), + + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, + WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, + WARP_EXCHANGE_SHUFFLE), // CUB requires WS == IPT for WARP_EXCHANGE_SHUFFLE #ifdef HIPCUB_ROCPRIM_API - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, + WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, + WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, + WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, + WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, + WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, + WARP_EXCHANGE_SHUFFLE), #endif - }; + }; #ifdef HIPCUB_ROCPRIM_API - if (::benchmark_utils::is_warp_size_supported(64)) - { - std::vector additional_benchmarks{ - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 64), - - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 64)}; - benchmarks.insert( - benchmarks.end(), - additional_benchmarks.begin(), - additional_benchmarks.end() - ); - } + if (::benchmark_utils::is_warp_size_supported(64)) { + std::vector additional_benchmarks{ + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, + WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, + WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, + WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, + WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 64), + + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, + WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, + WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, + WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, + WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 64)}; + benchmarks.insert(benchmarks.end(), additional_benchmarks.begin(), + additional_benchmarks.end()); + } #endif - // Use manual timing - for (auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } - // Force number of iterations - if (trials > 0) - { - for (auto& b : benchmarks) - { - b->Iterations(trials); - } + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_warp_load.cpp b/benchmark/benchmark_warp_load.cpp index fb708537..3479f83f 100644 --- a/benchmark/benchmark_warp_load.cpp +++ b/benchmark/benchmark_warp_load.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -31,248 +31,217 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__device__ auto warp_load_benchmark(T* d_input, T* d_output) - -> std::enable_if_t> -{ - using WarpLoadT = ::hipcub::WarpLoad; - constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; - constexpr int tile_size = ItemsPerThread * LogicalWarpSize; - - const unsigned warp_id = threadIdx.x / LogicalWarpSize; - const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id; - __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; - T thread_data[ItemsPerThread]; - - WarpLoadT(temp_storage[warp_id]).Load(d_input + global_warp_id * tile_size, thread_data); - - #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) - { - const unsigned striped_global_idx - = BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x; - d_output[striped_global_idx] = thread_data[i]; - } +template +__device__ auto warp_load_benchmark(T *d_input, T *d_output) + -> std::enable_if_t< + benchmark_utils::device_test_enabled_for_warp_size_v> { + using WarpLoadT = + ::hipcub::WarpLoad; + constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; + constexpr int tile_size = ItemsPerThread * LogicalWarpSize; + + const unsigned warp_id = threadIdx.x / LogicalWarpSize; + const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id; + __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; + T thread_data[ItemsPerThread]; + + WarpLoadT(temp_storage[warp_id]) + .Load(d_input + global_warp_id * tile_size, thread_data); + +#pragma unroll + for (unsigned i = 0; i < ItemsPerThread; ++i) { + const unsigned striped_global_idx = + BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x; + d_output[striped_global_idx] = thread_data[i]; + } } -template -__device__ auto warp_load_benchmark(T* /*d_input*/, T* /*d_output*/) - -> std::enable_if_t> -{} - -template -__global__ __launch_bounds__(BlockSize) void warp_load_kernel(T* d_input, T* d_output) -{ - warp_load_benchmark(d_input, d_output); +template +__device__ auto +warp_load_benchmark(T * /*d_input*/, T * /*d_output*/) -> std::enable_if_t< + !benchmark_utils::device_test_enabled_for_warp_size_v> {} + +template +__global__ __launch_bounds__(BlockSize) void warp_load_kernel(T *d_input, + T *d_output) { + warp_load_benchmark( + d_input, d_output); } -template< - class T, - unsigned BlockSize, - unsigned ItemsPerThread, - unsigned LogicalWarpSize, - ::hipcub::WarpLoadAlgorithm Algorithm, - unsigned Trials = 100 -> -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -{ - constexpr unsigned items_per_block = BlockSize * ItemsPerThread; - const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); - - std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); - T * d_input; - T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < Trials; i++) - { - warp_load_kernel - <<>>(d_input, d_output); - } - HIP_CHECK(hipPeekAtLastError()) - HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { + constexpr unsigned items_per_block = BlockSize * ItemsPerThread; + const unsigned size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < Trials; i++) { + warp_load_kernel + <<>>( + d_input, d_output); } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipPeekAtLastError()) + HIP_CHECK(hipDeviceSynchronize()); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark( \ - "warp_load.", \ - &run_benchmark, \ - stream, size \ - ) - -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks{ - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_VECTORIZE), +#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark( \ + "warp_load.", \ + &run_benchmark, stream, size) + +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks{ + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_VECTORIZE), + // WARP_LOAD_TRANSPOSE removed because of shared memory limit + // CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_VECTORIZE) + // WARP_LOAD_TRANSPOSE removed because of shared memory limit + // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_TRANSPOSE) + }; + + if (::benchmark_utils::is_warp_size_supported(64)) { + std::vector additional_benchmarks{ + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_VECTORIZE), // WARP_LOAD_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_VECTORIZE) + // CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_VECTORIZE), // WARP_LOAD_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_TRANSPOSE) + // CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_VECTORIZE) + // WARP_LOAD_TRANSPOSE removed because of shared memory limit + // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_TRANSPOSE) }; - - if (::benchmark_utils::is_warp_size_supported(64)) - { - std::vector additional_benchmarks{ - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_VECTORIZE), - // WARP_LOAD_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_VECTORIZE), - // WARP_LOAD_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_VECTORIZE) - // WARP_LOAD_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_TRANSPOSE) - }; - benchmarks.insert( - benchmarks.end(), - additional_benchmarks.begin(), - additional_benchmarks.end() - ); - } - - // Use manual timing - for (auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) - { - for (auto& b : benchmarks) - { - b->Iterations(trials); - } + benchmarks.insert(benchmarks.end(), additional_benchmarks.begin(), + additional_benchmarks.end()); + } + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_warp_merge_sort.cpp b/benchmark/benchmark_warp_merge_sort.cpp index 322271bc..e29d14be 100644 --- a/benchmark/benchmark_warp_merge_sort.cpp +++ b/benchmark/benchmark_warp_merge_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -35,520 +35,442 @@ constexpr size_t DEFAULT_N = 1024 * 1024 * 128; #endif -enum class benchmark_kinds -{ - sort_keys, - sort_pairs, +enum class benchmark_kinds { + sort_keys, + sort_pairs, }; -template -__device__ auto sort_keys_benchmark(const T* input, T* output, Compare compare_op) - -> std::enable_if_t> -{ - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; +template +__device__ auto sort_keys_benchmark(const T *input, T *output, + Compare compare_op) + -> std::enable_if_t< + benchmark_utils::device_test_enabled_for_warp_size_v> { + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int flat_tid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * items_per_block; - T keys[ItemsPerThread]; - hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); + const unsigned int flat_tid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * items_per_block; + T keys[ItemsPerThread]; + hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); - constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; - const unsigned int warp_id = threadIdx.x / LogicalWarpSize; + constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; + const unsigned int warp_id = threadIdx.x / LogicalWarpSize; - using warp_merge_sort = hipcub::WarpMergeSort; - __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; + using warp_merge_sort = + hipcub::WarpMergeSort; + __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; - warp_merge_sort wsort{storage[warp_id]}; - wsort.Sort(keys, compare_op); + warp_merge_sort wsort{storage[warp_id]}; + wsort.Sort(keys, compare_op); - hipcub::StoreDirectBlocked(flat_tid, output + block_offset, keys); + hipcub::StoreDirectBlocked(flat_tid, output + block_offset, keys); } -template -__device__ auto sort_keys_benchmark(const T* /*input*/, T* /*output*/, Compare /*compare_op*/) - -> std::enable_if_t> -{} - -template -__global__ - __launch_bounds__(BlockSize) void sort_keys(const T* input, T* output, Compare compare_op) -{ - sort_keys_benchmark(input, output, compare_op); +template +__device__ auto sort_keys_benchmark(const T * /*input*/, T * /*output*/, + Compare /*compare_op*/) + -> std::enable_if_t> {} + +template +__global__ __launch_bounds__(BlockSize) void sort_keys(const T *input, + T *output, + Compare compare_op) { + sort_keys_benchmark(input, output, + compare_op); } -template -__device__ auto sort_pairs_benchmark(const T* input, T* output, Compare compare_op) - -> std::enable_if_t> -{ - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - - const unsigned int flat_tid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * items_per_block; - T keys[ItemsPerThread]; - T values[ItemsPerThread]; - hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); - - for(unsigned int i = 0; i < ItemsPerThread; ++i) - { - values[i] = keys[i] + T(1); - } +template +__device__ auto sort_pairs_benchmark(const T *input, T *output, + Compare compare_op) + -> std::enable_if_t< + benchmark_utils::device_test_enabled_for_warp_size_v> { + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; - const unsigned int warp_id = threadIdx.x / LogicalWarpSize; + const unsigned int flat_tid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * items_per_block; + T keys[ItemsPerThread]; + T values[ItemsPerThread]; + hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); - using warp_merge_sort = hipcub::WarpMergeSort; - __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; + for (unsigned int i = 0; i < ItemsPerThread; ++i) { + values[i] = keys[i] + T(1); + } - warp_merge_sort wsort{storage[warp_id]}; - wsort.Sort(keys, values, compare_op); + constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; + const unsigned int warp_id = threadIdx.x / LogicalWarpSize; - for(unsigned int i = 0; i < ItemsPerThread; ++i) - { - keys[i] += values[i]; - } + using warp_merge_sort = + hipcub::WarpMergeSort; + __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; + + warp_merge_sort wsort{storage[warp_id]}; + wsort.Sort(keys, values, compare_op); + + for (unsigned int i = 0; i < ItemsPerThread; ++i) { + keys[i] += values[i]; + } - hipcub::StoreDirectBlocked(flat_tid, output + block_offset, keys); + hipcub::StoreDirectBlocked(flat_tid, output + block_offset, keys); } -template -__device__ auto sort_pairs_benchmark(const T* /*input*/, T* /*output*/, Compare /*compare_op*/) - -> std::enable_if_t> -{} - -template -__global__ - __launch_bounds__(BlockSize) void sort_pairs(const T* input, T* output, Compare compare_op) -{ - sort_pairs_benchmark(input, output, compare_op); +template +__device__ auto sort_pairs_benchmark(const T * /*input*/, T * /*output*/, + Compare /*compare_op*/) + -> std::enable_if_t> {} + +template +__global__ __launch_bounds__(BlockSize) void sort_pairs(const T *input, + T *output, + Compare compare_op) { + sort_pairs_benchmark( + input, output, compare_op); } -template -struct max_value { - static constexpr T value = std::numeric_limits::max(); +template struct max_value { + static constexpr T value = std::numeric_limits::max(); }; -template -__device__ auto sort_keys_segmented_benchmark(const T* input, - T* output, - const unsigned int* segment_sizes, - Compare compare) - -> std::enable_if_t> -{ - constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; - constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; +template +__device__ auto sort_keys_segmented_benchmark(const T *input, T *output, + const unsigned int *segment_sizes, + Compare compare) + -> std::enable_if_t< + benchmark_utils::device_test_enabled_for_warp_size_v> { + constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; + constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; - using warp_merge_sort = hipcub::WarpMergeSort; - __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; + using warp_merge_sort = + hipcub::WarpMergeSort; + __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; - const unsigned int warp_id = threadIdx.x / LogicalWarpSize; - warp_merge_sort wsort{storage[warp_id]}; + const unsigned int warp_id = threadIdx.x / LogicalWarpSize; + warp_merge_sort wsort{storage[warp_id]}; - const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id; + const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id; - const unsigned int segment_size = segment_sizes[segment_id]; - const unsigned int warp_offset = segment_id * max_segment_size; - T keys[ItemsPerThread]; + const unsigned int segment_size = segment_sizes[segment_id]; + const unsigned int warp_offset = segment_id * max_segment_size; + T keys[ItemsPerThread]; - const unsigned int flat_tid = wsort.get_linear_tid(); - hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); + const unsigned int flat_tid = wsort.get_linear_tid(); + hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); - const T oob_default = max_value::value; - wsort.Sort(keys, compare, segment_size, oob_default); + const T oob_default = max_value::value; + wsort.Sort(keys, compare, segment_size, oob_default); - hipcub::StoreDirectBlocked(flat_tid, output + warp_offset, keys, segment_size); + hipcub::StoreDirectBlocked(flat_tid, output + warp_offset, keys, + segment_size); } -template -__device__ auto sort_keys_segmented_benchmark(const T* /*input*/, - T* /*output*/, - const unsigned int* /*segment_sizes*/, - Compare /*compare*/) - -> std::enable_if_t> -{} - -template -__global__ __launch_bounds__(BlockSize) void sort_keys_segmented(const T* input, - T* output, - const unsigned int* segment_sizes, - Compare compare) -{ - sort_keys_segmented_benchmark(input, - output, - segment_sizes, - compare); +template +__device__ auto +sort_keys_segmented_benchmark(const T * /*input*/, T * /*output*/, + const unsigned int * /*segment_sizes*/, + Compare /*compare*/) + -> std::enable_if_t> {} + +template +__global__ __launch_bounds__(BlockSize) void sort_keys_segmented( + const T *input, T *output, const unsigned int *segment_sizes, + Compare compare) { + sort_keys_segmented_benchmark( + input, output, segment_sizes, compare); } -template -__device__ auto sort_pairs_segmented_benchmark(const T* input, - T* output, - const unsigned int* segment_sizes, - Compare compare) - -> std::enable_if_t> -{ - constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; - constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; - - using warp_merge_sort = hipcub::WarpMergeSort; - __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; - - const unsigned int warp_id = threadIdx.x / LogicalWarpSize; - warp_merge_sort wsort{storage[warp_id]}; - - const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id; - - const unsigned int segment_size = segment_sizes[segment_id]; - const unsigned int warp_offset = segment_id * max_segment_size; - T keys[ItemsPerThread]; - T values[ItemsPerThread]; - - const unsigned int flat_tid = wsort.get_linear_tid(); - hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); - - for(unsigned int i = 0; i < ItemsPerThread; ++i) { - if(flat_tid * ItemsPerThread + i < segment_size) { - values[i] = keys[i] + T(1); - } +template +__device__ auto +sort_pairs_segmented_benchmark(const T *input, T *output, + const unsigned int *segment_sizes, + Compare compare) + -> std::enable_if_t< + benchmark_utils::device_test_enabled_for_warp_size_v> { + constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; + constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; + + using warp_merge_sort = + hipcub::WarpMergeSort; + __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; + + const unsigned int warp_id = threadIdx.x / LogicalWarpSize; + warp_merge_sort wsort{storage[warp_id]}; + + const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id; + + const unsigned int segment_size = segment_sizes[segment_id]; + const unsigned int warp_offset = segment_id * max_segment_size; + T keys[ItemsPerThread]; + T values[ItemsPerThread]; + + const unsigned int flat_tid = wsort.get_linear_tid(); + hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); + + for (unsigned int i = 0; i < ItemsPerThread; ++i) { + if (flat_tid * ItemsPerThread + i < segment_size) { + values[i] = keys[i] + T(1); } + } - const T oob_default = max_value::value; - wsort.Sort(keys, values, compare, segment_size, oob_default); + const T oob_default = max_value::value; + wsort.Sort(keys, values, compare, segment_size, oob_default); - for(unsigned int i = 0; i < ItemsPerThread; ++i) { - if(flat_tid * ItemsPerThread + i < segment_size) { - keys[i] += values[i]; - } + for (unsigned int i = 0; i < ItemsPerThread; ++i) { + if (flat_tid * ItemsPerThread + i < segment_size) { + keys[i] += values[i]; } + } - hipcub::StoreDirectBlocked(flat_tid, output + warp_offset, keys, segment_size); + hipcub::StoreDirectBlocked(flat_tid, output + warp_offset, keys, + segment_size); } -template -__device__ auto sort_pairs_segmented_benchmark(const T* /*input*/, - T* /*output*/, - const unsigned int* /*segment_sizes*/, - Compare /*compare*/) - -> std::enable_if_t> -{} - -template -__global__ __launch_bounds__(BlockSize) void sort_pairs_segmented(const T* input, - T* output, - const unsigned int* segment_sizes, - Compare compare) -{ - sort_pairs_segmented_benchmark(input, - output, - segment_sizes, - compare); +template +__device__ auto +sort_pairs_segmented_benchmark(const T * /*input*/, T * /*output*/, + const unsigned int * /*segment_sizes*/, + Compare /*compare*/) + -> std::enable_if_t> {} + +template +__global__ __launch_bounds__(BlockSize) void sort_pairs_segmented( + const T *input, T *output, const unsigned int *segment_sizes, + Compare compare) { + sort_pairs_segmented_benchmark( + input, output, segment_sizes, compare); } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize, - unsigned int ItemsPerThread, - class CompareOp = test_utils::less, - unsigned int Trials = 10 -> -void run_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind, const hipStream_t stream, const size_t N) -{ - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); - - const auto input = std::is_floating_point::value ? - benchmark_utils::get_random_data(size, static_cast(-1000), static_cast(1000)) : - benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); - - T* d_input = nullptr; - T* d_output = nullptr; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - if(benchmark_kind == benchmark_kinds::sort_keys) - { - for(unsigned int i = 0; i < Trials; ++i) { - sort_keys - <<>>(d_input, - d_output, - CompareOp{}); - } - } - else if(benchmark_kind == benchmark_kinds::sort_pairs) - { - for(unsigned int i = 0; i < Trials; ++i) { - sort_pairs - <<>>(d_input, - d_output, - CompareOp{}); - } - } - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); +template +void run_benchmark(benchmark::State &state, + const benchmark_kinds benchmark_kind, + const hipStream_t stream, const size_t N) { + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = + items_per_block * ((N + items_per_block - 1) / items_per_block); + + const auto input = + std::is_floating_point::value + ? benchmark_utils::get_random_data(size, static_cast(-1000), + static_cast(1000)) + : benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); + + T *d_input = nullptr; + T *d_output = nullptr; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + if (benchmark_kind == benchmark_kinds::sort_keys) { + for (unsigned int i = 0; i < Trials; ++i) { + sort_keys + <<>>( + d_input, d_output, CompareOp{}); + } + } else if (benchmark_kind == benchmark_kinds::sort_pairs) { + for (unsigned int i = 0; i < Trials; ++i) { + sort_pairs + <<>>( + d_input, d_output, CompareOp{}); + } } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize, - unsigned int ItemsPerThread, - class CompareOp = test_utils::less, - unsigned int Trials = 10 -> -void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind, const hipStream_t stream, const size_t N) -{ - constexpr auto max_segment_size = LogicalWarpSize * ItemsPerThread; - constexpr auto segments_per_block = BlockSize / LogicalWarpSize; - constexpr auto items_per_block = BlockSize * ItemsPerThread; - - const auto num_blocks = (N + items_per_block - 1) / items_per_block; - const auto num_segments = num_blocks * segments_per_block; - const auto size = num_blocks * items_per_block; - - const auto input = std::is_floating_point::value ? - benchmark_utils::get_random_data(size, static_cast(-1000), static_cast(1000)) : - benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); - - const auto segment_sizes = benchmark_utils::get_random_data( - num_segments, 0, max_segment_size); - - T* d_input = nullptr; - T* d_output = nullptr; - unsigned int* d_segment_sizes = nullptr; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); - HIP_CHECK(hipMalloc(&d_segment_sizes, num_segments * sizeof(segment_sizes[0]))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK(hipMemcpy(d_segment_sizes, segment_sizes.data(), - num_segments * sizeof(segment_sizes[0]), - hipMemcpyHostToDevice)); - - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - - if(benchmark_kind == benchmark_kinds::sort_keys) - { - for(unsigned int i = 0; i < Trials; ++i) - { - sort_keys_segmented - <<>>(d_input, - d_output, - d_segment_sizes, - CompareOp{}); - } - } - else if(benchmark_kind == benchmark_kinds::sort_pairs) - { - for(unsigned int i = 0; i < Trials; ++i) - { - sort_pairs_segmented - <<>>(d_input, - d_output, - d_segment_sizes, - CompareOp{}); - } - } - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); +template +void run_segmented_benchmark(benchmark::State &state, + const benchmark_kinds benchmark_kind, + const hipStream_t stream, const size_t N) { + constexpr auto max_segment_size = LogicalWarpSize * ItemsPerThread; + constexpr auto segments_per_block = BlockSize / LogicalWarpSize; + constexpr auto items_per_block = BlockSize * ItemsPerThread; + + const auto num_blocks = (N + items_per_block - 1) / items_per_block; + const auto num_segments = num_blocks * segments_per_block; + const auto size = num_blocks * items_per_block; + + const auto input = + std::is_floating_point::value + ? benchmark_utils::get_random_data(size, static_cast(-1000), + static_cast(1000)) + : benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); + + const auto segment_sizes = benchmark_utils::get_random_data( + num_segments, 0, max_segment_size); + + T *d_input = nullptr; + T *d_output = nullptr; + unsigned int *d_segment_sizes = nullptr; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); + HIP_CHECK( + hipMalloc(&d_segment_sizes, num_segments * sizeof(segment_sizes[0]))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_segment_sizes, segment_sizes.data(), + num_segments * sizeof(segment_sizes[0]), + hipMemcpyHostToDevice)); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + + if (benchmark_kind == benchmark_kinds::sort_keys) { + for (unsigned int i = 0; i < Trials; ++i) { + sort_keys_segmented + <<>>( + d_input, d_output, d_segment_sizes, CompareOp{}); + } + } else if (benchmark_kind == benchmark_kinds::sort_pairs) { + for (unsigned int i = 0; i < Trials; ++i) { + sort_pairs_segmented + <<>>( + d_input, d_output, d_segment_sizes, CompareOp{}); + } } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_segment_sizes)); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_segment_sizes)); } -#define CREATE_BENCHMARK(T, BS, WS, IPT) \ - if(WS <= device_warp_size) { \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("warp_merge_sort.sub_algorithm_name:" \ - + name \ - ).c_str(), \ - segmented ? &run_benchmark : &run_segmented_benchmark, \ - benchmark_kind, stream, size)); \ - } \ - +#define CREATE_BENCHMARK(T, BS, WS, IPT) \ + if (WS <= device_warp_size) { \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("warp_merge_sort.sub_algorithm_name:" + \ + name) \ + .c_str(), \ + segmented ? &run_benchmark \ + : &run_segmented_benchmark, \ + benchmark_kind, stream, size)); \ + } #define BENCHMARK_TYPE_WS(type, block, warp) \ - CREATE_BENCHMARK(type, block, warp, 1); \ - CREATE_BENCHMARK(type, block, warp, 4); \ - CREATE_BENCHMARK(type, block, warp, 8) + CREATE_BENCHMARK(type, block, warp, 1); \ + CREATE_BENCHMARK(type, block, warp, 4); \ + CREATE_BENCHMARK(type, block, warp, 8) -#define BENCHMARK_TYPE(type, block) \ - BENCHMARK_TYPE_WS(type, block, 4); \ - BENCHMARK_TYPE_WS(type, block, 16); \ - BENCHMARK_TYPE_WS(type, block, 32); \ - BENCHMARK_TYPE_WS(type, block, 64) +#define BENCHMARK_TYPE(type, block) \ + BENCHMARK_TYPE_WS(type, block, 4); \ + BENCHMARK_TYPE_WS(type, block, 16); \ + BENCHMARK_TYPE_WS(type, block, 32); \ + BENCHMARK_TYPE_WS(type, block, 64) void add_benchmarks(const benchmark_kinds benchmark_kind, - const std::string& name, - std::vector& benchmarks, - const hipStream_t stream, - const size_t size, - const bool segmented, - const unsigned int device_warp_size) -{ - BENCHMARK_TYPE(int, 256); - BENCHMARK_TYPE(int8_t, 256); - BENCHMARK_TYPE(uint8_t, 256); - BENCHMARK_TYPE(long long, 256); + const std::string &name, + std::vector &benchmarks, + const hipStream_t stream, const size_t size, + const bool segmented, const unsigned int device_warp_size) { + BENCHMARK_TYPE(int, 256); + BENCHMARK_TYPE(int8_t, 256); + BENCHMARK_TYPE(uint8_t, 256); + BENCHMARK_TYPE(long long, 256); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_warp_merge_sort" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - const auto device_warp_size = [] { - const int result = HIPCUB_HOST_WARP_THREADS; - if(result > 0) { - std::cout << "[HIP] Device warp size: " << result << std::endl; - } else { - std::cerr << "Failed to get device warp size! Aborting.\n"; - std::exit(1); - } - return static_cast(result); - }(); - - // Add benchmarks - std::vector benchmarks; - add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, - size, false, device_warp_size); - add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)", - benchmarks, stream, size, false, device_warp_size); - add_benchmarks(benchmark_kinds::sort_keys, "segmented_sort(keys)", - benchmarks, stream, size, true, device_warp_size); - add_benchmarks(benchmark_kinds::sort_pairs, "segmented_sort(keys, values)", - benchmarks, stream, size, true, device_warp_size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_warp_merge_sort" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + const auto device_warp_size = [] { + const int result = HIPCUB_HOST_WARP_THREADS; + if (result > 0) { + std::cout << "[HIP] Device warp size: " << result << std::endl; + } else { + std::cerr << "Failed to get device warp size! Aborting.\n"; + std::exit(1); } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } + return static_cast(result); + }(); + + // Add benchmarks + std::vector benchmarks; + add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, + size, false, device_warp_size); + add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)", benchmarks, + stream, size, false, device_warp_size); + add_benchmarks(benchmark_kinds::sort_keys, "segmented_sort(keys)", benchmarks, + stream, size, true, device_warp_size); + add_benchmarks(benchmark_kinds::sort_pairs, "segmented_sort(keys, values)", + benchmarks, stream, size, true, device_warp_size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_warp_reduce.cpp b/benchmark/benchmark_warp_reduce.cpp index 65b6e991..2f798d44 100644 --- a/benchmark/benchmark_warp_reduce.cpp +++ b/benchmark/benchmark_warp_reduce.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,272 +25,199 @@ // HIP API #include "hipcub/warp/warp_reduce.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template< - class T, - unsigned int WarpSize, - unsigned int Trials -> -__global__ -__launch_bounds__(64) -void warp_reduce_kernel(const T * d_input, T * d_output) -{ - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - auto value = d_input[i]; - - using wreduce_t = hipcub::WarpReduce; - __shared__ typename wreduce_t::TempStorage storage; - auto reduce_op = hipcub::Sum(); - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - value = wreduce_t(storage).Reduce(value, reduce_op); - } +template +__global__ __launch_bounds__(64) void warp_reduce_kernel(const T *d_input, + T *d_output) { + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + auto value = d_input[i]; + + using wreduce_t = hipcub::WarpReduce; + __shared__ typename wreduce_t::TempStorage storage; + auto reduce_op = hipcub::Sum(); +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + value = wreduce_t(storage).Reduce(value, reduce_op); + } - d_output[i] = value; + d_output[i] = value; } -template< - class T, - class Flag, - unsigned int WarpSize, - unsigned int Trials -> -__global__ -__launch_bounds__(64) -void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) -{ - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - auto value = d_input[i]; - auto flag = d_flags[i]; - - using wreduce_t = hipcub::WarpReduce; - __shared__ typename wreduce_t::TempStorage storage; - #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - value = wreduce_t(storage).HeadSegmentedSum(value, flag); - } +template +__global__ __launch_bounds__(64) void segmented_warp_reduce_kernel( + const T *d_input, Flag *d_flags, T *d_output) { + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + auto value = d_input[i]; + auto flag = d_flags[i]; + + using wreduce_t = hipcub::WarpReduce; + __shared__ typename wreduce_t::TempStorage storage; +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + value = wreduce_t(storage).HeadSegmentedSum(value, flag); + } - d_output[i] = value; + d_output[i] = value; } -template< - bool Segmented, - unsigned int WarpSize, - unsigned int BlockSize, - unsigned int Trials, - class T, - class Flag -> -inline -auto execute_warp_reduce_kernel(T* input, T* output, Flag* /* flags */, - size_t size, hipStream_t stream) - -> typename std::enable_if::type -{ - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_reduce_kernel), - dim3(size/BlockSize), dim3(BlockSize), 0, stream, - input, output - ); - HIP_CHECK(hipPeekAtLastError()); +template +inline auto execute_warp_reduce_kernel(T *input, T *output, Flag * /* flags */, + size_t size, hipStream_t stream) -> + typename std::enable_if::type { + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_reduce_kernel), + dim3(size / BlockSize), dim3(BlockSize), 0, stream, input, + output); + HIP_CHECK(hipPeekAtLastError()); } -template< - bool Segmented, - unsigned int WarpSize, - unsigned int BlockSize, - unsigned int Trials, - class T, - class Flag -> -inline -auto execute_warp_reduce_kernel(T* input, T* output, Flag* flags, - size_t size, hipStream_t stream) - -> typename std::enable_if::type -{ - hipLaunchKernelGGL( - HIP_KERNEL_NAME(segmented_warp_reduce_kernel), - dim3(size/BlockSize), dim3(BlockSize), 0, stream, - input, flags, output - ); - HIP_CHECK(hipPeekAtLastError()); +template +inline auto execute_warp_reduce_kernel(T *input, T *output, Flag *flags, + size_t size, hipStream_t stream) -> + typename std::enable_if::type { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(segmented_warp_reduce_kernel), + dim3(size / BlockSize), dim3(BlockSize), 0, stream, input, flags, output); + HIP_CHECK(hipPeekAtLastError()); } -template< - bool Segmented, - class T, - unsigned int WarpSize, - unsigned int BlockSize, - unsigned int Trials = 100 -> -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -{ - using flag_type = unsigned char; - - const auto size = BlockSize * ((N + BlockSize - 1)/BlockSize); - - std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); - std::vector flags = benchmark_utils::get_random_data(size, 0, 1); - T * d_input; - flag_type * d_flags; - T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flags, size * sizeof(flag_type))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_flags, flags.data(), - size * sizeof(flag_type), - hipMemcpyHostToDevice - ) - ); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { + using flag_type = unsigned char; + + const auto size = BlockSize * ((N + BlockSize - 1) / BlockSize); + + std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); + std::vector flags = + benchmark_utils::get_random_data(size, 0, 1); + T *d_input; + flag_type *d_flags; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_flags, size * sizeof(flag_type))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_flags, flags.data(), size * sizeof(flag_type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + execute_warp_reduce_kernel( + d_input, d_output, d_flags, size, stream); HIP_CHECK(hipDeviceSynchronize()); - for(auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - execute_warp_reduce_kernel( - d_input, d_output, d_flags, size, stream - ); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_flags)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_flags)); } -#define CREATE_BENCHMARK(T, WS, BS) \ - benchmark::RegisterBenchmark( \ - std::string("warp_reduce.sub_algorithm_name:" \ - + name \ - ).c_str(), \ - &run_benchmark, \ - stream, size \ - ) - +#define CREATE_BENCHMARK(T, WS, BS) \ + benchmark::RegisterBenchmark( \ + std::string("warp_reduce.sub_algorithm_name:" + \ + name) \ + .c_str(), \ + &run_benchmark, stream, size) // If warp size limit is 16 #define BENCHMARK_TYPE_WS16(type) \ - CREATE_BENCHMARK(type, 15, 32), \ - CREATE_BENCHMARK(type, 16, 32) - + CREATE_BENCHMARK(type, 15, 32), CREATE_BENCHMARK(type, 16, 32) // If warp size limit is 32 -#define BENCHMARK_TYPE_WS32(type) \ - BENCHMARK_TYPE_WS16(type), \ - CREATE_BENCHMARK(type, 31, 32), \ - CREATE_BENCHMARK(type, 32, 32), \ - CREATE_BENCHMARK(type, 32, 64) - +#define BENCHMARK_TYPE_WS32(type) \ + BENCHMARK_TYPE_WS16(type), CREATE_BENCHMARK(type, 31, 32), \ + CREATE_BENCHMARK(type, 32, 32), CREATE_BENCHMARK(type, 32, 64) // If warp size limit is 64 -#define BENCHMARK_TYPE_WS64(type) \ - BENCHMARK_TYPE_WS32(type), \ - CREATE_BENCHMARK(type, 37, 64), \ - CREATE_BENCHMARK(type, 61, 64), \ - CREATE_BENCHMARK(type, 64, 64) - - -template -void add_benchmarks(const std::string& name, - std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - std::vector bs = - { +#define BENCHMARK_TYPE_WS64(type) \ + BENCHMARK_TYPE_WS32(type), CREATE_BENCHMARK(type, 37, 64), \ + CREATE_BENCHMARK(type, 61, 64), CREATE_BENCHMARK(type, 64, 64) + +template +void add_benchmarks(const std::string &name, + std::vector &benchmarks, + hipStream_t stream, size_t size) { + std::vector bs = { #if HIPCUB_WARP_THREADS_MACRO == 16 - BENCHMARK_TYPE_WS16(int), - BENCHMARK_TYPE_WS16(float), - BENCHMARK_TYPE_WS16(double), - BENCHMARK_TYPE_WS16(int8_t), - BENCHMARK_TYPE_WS16(uint8_t) + BENCHMARK_TYPE_WS16(int), + BENCHMARK_TYPE_WS16(float), + BENCHMARK_TYPE_WS16(double), + BENCHMARK_TYPE_WS16(int8_t), + BENCHMARK_TYPE_WS16(uint8_t) #elif HIPCUB_WARP_THREADS_MACRO == 32 - BENCHMARK_TYPE_WS32(int), - BENCHMARK_TYPE_WS32(float), - BENCHMARK_TYPE_WS32(double), - BENCHMARK_TYPE_WS32(int8_t), - BENCHMARK_TYPE_WS32(uint8_t) + BENCHMARK_TYPE_WS32(int), + BENCHMARK_TYPE_WS32(float), + BENCHMARK_TYPE_WS32(double), + BENCHMARK_TYPE_WS32(int8_t), + BENCHMARK_TYPE_WS32(uint8_t) #else - BENCHMARK_TYPE_WS64(int), - BENCHMARK_TYPE_WS64(float), - BENCHMARK_TYPE_WS64(double), - BENCHMARK_TYPE_WS64(int8_t), - BENCHMARK_TYPE_WS64(uint8_t) + BENCHMARK_TYPE_WS64(int), + BENCHMARK_TYPE_WS64(float), + BENCHMARK_TYPE_WS64(double), + BENCHMARK_TYPE_WS64(int8_t), + BENCHMARK_TYPE_WS64(uint8_t) #endif - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_warp_reduce" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks("reduce", benchmarks, stream, size); - add_benchmarks("segmented_reduce", benchmarks, stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_warp_reduce" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks("reduce", benchmarks, stream, size); + add_benchmarks("segmented_reduce", benchmarks, stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp index f8003473..926f644d 100644 --- a/benchmark/benchmark_warp_scan.cpp +++ b/benchmark/benchmark_warp_scan.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -29,148 +29,116 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -enum class scan_type -{ - inclusive_scan, - exclusive_scan, - broadcast -}; +enum class scan_type { inclusive_scan, exclusive_scan, broadcast }; -template -__global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output, const T init) -{ - Runner::template run(input, output, init); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T *input, T *output, + const T init) { + Runner::template run(input, output, init); } -struct inclusive_scan -{ - template - __device__ static void run(const T* input, T* output, const T init) - { - (void)init; +struct inclusive_scan { + template + __device__ static void run(const T *input, T *output, const T init) { + (void)init; - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - auto value = input[i]; + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + auto value = input[i]; - using wscan_t = hipcub::WarpScan; - __shared__ typename wscan_t::TempStorage storage; - auto scan_op = hipcub::Sum(); + using wscan_t = hipcub::WarpScan; + __shared__ typename wscan_t::TempStorage storage; + auto scan_op = hipcub::Sum(); #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - wscan_t(storage).InclusiveScan(value, value, scan_op); - } - - output[i] = value; + for (unsigned int trial = 0; trial < Trials; trial++) { + wscan_t(storage).InclusiveScan(value, value, scan_op); } + + output[i] = value; + } }; -struct exclusive_scan -{ - template - __device__ static void run(const T* input, T* output, const T init) - { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - auto value = input[i]; - - using wscan_t = hipcub::WarpScan; - __shared__ typename wscan_t::TempStorage storage; - auto scan_op = hipcub::Sum(); -#pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - wscan_t(storage).ExclusiveScan(value, value, init, scan_op); - } +struct exclusive_scan { + template + __device__ static void run(const T *input, T *output, const T init) { + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + auto value = input[i]; - output[i] = value; + using wscan_t = hipcub::WarpScan; + __shared__ typename wscan_t::TempStorage storage; + auto scan_op = hipcub::Sum(); +#pragma nounroll + for (unsigned int trial = 0; trial < Trials; trial++) { + wscan_t(storage).ExclusiveScan(value, value, init, scan_op); } + + output[i] = value; + } }; -struct broadcast -{ - template - __device__ static void run(const T* input, T* output, const T init) - { - (void)init; +struct broadcast { + template + __device__ static void run(const T *input, T *output, const T init) { + (void)init; - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - auto value = input[i]; + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + auto value = input[i]; - using wscan_t = hipcub::WarpScan; - __shared__ typename wscan_t::TempStorage storage; + using wscan_t = hipcub::WarpScan; + __shared__ typename wscan_t::TempStorage storage; #pragma nounroll - for(unsigned int trial = 0; trial < Trials; trial++) - { - value = wscan_t(storage).Broadcast(value, 0); - } - - output[i] = value; + for (unsigned int trial = 0; trial < Trials; trial++) { + value = wscan_t(storage).Broadcast(value, 0); } + + output[i] = value; + } }; -template -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) -{ - // Make sure size is a multiple of BlockSize - size = BlockSize * ((size + BlockSize - 1)/BlockSize); - // Allocate and fill memory - std::vector input(size, 1.0f); - T * d_input; - T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t size) { + // Make sure size is a multiple of BlockSize + size = BlockSize * ((size + BlockSize - 1) / BlockSize); + // Allocate and fill memory + std::vector input(size, 1.0f); + T *d_input; + T *d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel), + dim3(size / BlockSize), dim3(BlockSize), 0, stream, d_input, d_output, + input[0]); + HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), - dim3(size / BlockSize), - dim3(BlockSize), - 0, - stream, - d_input, - d_output, - input[0]); - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); - state.SetItemsProcessed(state.iterations() * size * Trials); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); + state.SetItemsProcessed(state.iterations() * size * Trials); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK_IMPL(T, BS, WS, OP) \ - benchmark::RegisterBenchmark( \ - std::string("warp_scan.method_name:" \ - + method_name \ - ).c_str(), \ - &run_benchmark, \ - stream, \ - size \ - ) +#define CREATE_BENCHMARK_IMPL(T, BS, WS, OP) \ + benchmark::RegisterBenchmark( \ + std::string("warp_scan.method_name:" + \ + method_name) \ + .c_str(), \ + &run_benchmark, stream, size) #define CREATE_BENCHMARK(T, BS, WS) CREATE_BENCHMARK_IMPL(T, BS, WS, Benchmark) @@ -197,87 +165,82 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) CREATE_BENCHMARK(type, 256, 64) // clang-format on -template -void add_benchmarks(std::vector& benchmarks, - const std::string& method_name, - hipStream_t stream, - size_t size) -{ - using custom_double2 = benchmark_utils::custom_type; - using custom_int_double = benchmark_utils::custom_type; +template +void add_benchmarks(std::vector &benchmarks, + const std::string &method_name, hipStream_t stream, + size_t size) { + using custom_double2 = benchmark_utils::custom_type; + using custom_int_double = benchmark_utils::custom_type; - std::vector new_benchmarks = { + std::vector new_benchmarks = { #if HIPCUB_WARP_THREADS_MACRO == 16 - BENCHMARK_TYPE_WS16(int), - BENCHMARK_TYPE_WS16(float), - BENCHMARK_TYPE_WS16(double), - BENCHMARK_TYPE_WS16(int8_t), - BENCHMARK_TYPE_WS16(custom_double2), - BENCHMARK_TYPE_WS16(custom_int_double) + BENCHMARK_TYPE_WS16(int), + BENCHMARK_TYPE_WS16(float), + BENCHMARK_TYPE_WS16(double), + BENCHMARK_TYPE_WS16(int8_t), + BENCHMARK_TYPE_WS16(custom_double2), + BENCHMARK_TYPE_WS16(custom_int_double) #elif HIPCUB_WARP_THREADS_MACRO == 32 - BENCHMARK_TYPE_WS32(int), - BENCHMARK_TYPE_WS32(float), - BENCHMARK_TYPE_WS32(double), - BENCHMARK_TYPE_WS32(int8_t), - BENCHMARK_TYPE_WS32(custom_double2), - BENCHMARK_TYPE_WS32(custom_int_double) + BENCHMARK_TYPE_WS32(int), + BENCHMARK_TYPE_WS32(float), + BENCHMARK_TYPE_WS32(double), + BENCHMARK_TYPE_WS32(int8_t), + BENCHMARK_TYPE_WS32(custom_double2), + BENCHMARK_TYPE_WS32(custom_int_double) #else - BENCHMARK_TYPE_WS64(int), - BENCHMARK_TYPE_WS64(float), - BENCHMARK_TYPE_WS64(double), - BENCHMARK_TYPE_WS64(int8_t), - BENCHMARK_TYPE_WS64(custom_double2), - BENCHMARK_TYPE_WS64(custom_int_double) + BENCHMARK_TYPE_WS64(int), + BENCHMARK_TYPE_WS64(float), + BENCHMARK_TYPE_WS64(double), + BENCHMARK_TYPE_WS64(int8_t), + BENCHMARK_TYPE_WS64(custom_double2), + BENCHMARK_TYPE_WS64(custom_int_double) #endif - }; - benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); + }; + benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), + new_benchmarks.end()); } -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_warp_scan" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks(benchmarks, "inclusive_scan", stream, size); - add_benchmarks(benchmarks, "exclusive_scan", stream, size); - add_benchmarks(benchmarks, "broadcast", stream, size); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if(trials > 0) - { - for(auto& b : benchmarks) - { - b->Iterations(trials); - } +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_warp_scan" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks(benchmarks, "inclusive_scan", stream, size); + add_benchmarks(benchmarks, "exclusive_scan", stream, size); + add_benchmarks(benchmarks, "broadcast", stream, size); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_warp_store.cpp b/benchmark/benchmark_warp_store.cpp index 03f63e46..a73b4eb2 100644 --- a/benchmark/benchmark_warp_store.cpp +++ b/benchmark/benchmark_warp_store.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -31,237 +31,211 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__device__ auto warp_store_benchmark(T* d_output) - -> std::enable_if_t> -{ - T thread_data[ItemsPerThread]; - #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) - { - thread_data[i] = static_cast(i); - } - - using WarpStoreT = ::hipcub::WarpStore; - constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; - constexpr int tile_size = ItemsPerThread * LogicalWarpSize; - __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; - const unsigned warp_id = threadIdx.x / LogicalWarpSize; - const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id; - - WarpStoreT(temp_storage[warp_id]).Store(d_output + global_warp_id * tile_size, thread_data); +template +__device__ auto warp_store_benchmark(T *d_output) -> std::enable_if_t< + benchmark_utils::device_test_enabled_for_warp_size_v> { + T thread_data[ItemsPerThread]; +#pragma unroll + for (unsigned i = 0; i < ItemsPerThread; ++i) { + thread_data[i] = static_cast(i); + } + + using WarpStoreT = + ::hipcub::WarpStore; + constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; + constexpr int tile_size = ItemsPerThread * LogicalWarpSize; + __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; + const unsigned warp_id = threadIdx.x / LogicalWarpSize; + const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id; + + WarpStoreT(temp_storage[warp_id]) + .Store(d_output + global_warp_id * tile_size, thread_data); } -template -__device__ auto warp_store_benchmark(T* /*d_output*/) - -> std::enable_if_t> -{} +template +__device__ auto warp_store_benchmark(T * /*d_output*/) -> std::enable_if_t< + !benchmark_utils::device_test_enabled_for_warp_size_v> {} -template -__global__ __launch_bounds__(BlockSize) void warp_store_kernel(T* d_output) -{ - warp_store_benchmark(d_output); +template +__global__ __launch_bounds__(BlockSize) void warp_store_kernel(T *d_output) { + warp_store_benchmark( + d_output); } -template< - class T, - unsigned BlockSize, - unsigned ItemsPerThread, - unsigned LogicalWarpSize, - ::hipcub::WarpStoreAlgorithm Algorithm, - unsigned Trials = 100 -> -void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -{ - constexpr unsigned items_per_block = BlockSize * ItemsPerThread; - const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); +template +void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { + constexpr unsigned items_per_block = BlockSize * ItemsPerThread; + const unsigned size = + items_per_block * ((N + items_per_block - 1) / items_per_block); - T * d_output; - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + T *d_output; + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - for (auto _ : state) - { - auto start = std::chrono::high_resolution_clock::now(); + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < Trials; ++i) - { - warp_store_kernel - <<>>(d_output); - } - HIP_CHECK(hipPeekAtLastError()) - HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); + for (size_t i = 0; i < Trials; ++i) { + warp_store_kernel + <<>>( + d_output); } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipPeekAtLastError()) + HIP_CHECK(hipDeviceSynchronize()); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ -benchmark::RegisterBenchmark( \ - std::string("warp_store." \ - ).c_str(), \ - &run_benchmark, \ - stream, size \ -) - -int main(int argc, char *argv[]) -{ - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_warp_store" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks{ - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_VECTORIZE), +#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark( \ + std::string("warp_store.") \ + .c_str(), \ + &run_benchmark, stream, size) + +int main(int argc, char *argv[]) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_warp_store" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks{ + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_VECTORIZE), + // WARP_STORE_TRANSPOSE removed because of shared memory limit + // CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_VECTORIZE) + // WARP_STORE_TRANSPOSE removed because of shared memory limit + // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_TRANSPOSE) + }; + + if (::benchmark_utils::is_warp_size_supported(64)) { + std::vector additional_benchmarks{ + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), + // WARP_STORE_TRANSPOSE removed because of shared memory limit + // CREATE_BENCHMARK(double, 256, 16, 64, + // ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), // WARP_STORE_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_VECTORIZE) + // CREATE_BENCHMARK(double, 256, 32, 64, + // ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE) // WARP_STORE_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_TRANSPOSE) + // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_TRANSPOSE) }; - - if (::benchmark_utils::is_warp_size_supported(64)) - { - std::vector additional_benchmarks{ - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), - // WARP_STORE_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), - // WARP_STORE_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE) - // WARP_STORE_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_TRANSPOSE) - }; - benchmarks.insert( - benchmarks.end(), - additional_benchmarks.begin(), - additional_benchmarks.end() - ); - } - - // Use manual timing - for (auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) - { - for (auto& b : benchmarks) - { - b->Iterations(trials); - } + benchmarks.insert(benchmarks.end(), additional_benchmarks.begin(), + additional_benchmarks.end()); + } + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if (trials > 0) { + for (auto &b : benchmarks) { + b->Iterations(trials); } + } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } From 9e2b241e3f80ad023687807352d93430fc11c075 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Mon, 17 Jun 2024 11:07:19 -0600 Subject: [PATCH 34/46] ran correct .clang-format file --- .../benchmark_block_adjacent_difference.cpp | 627 +++++----- benchmark/benchmark_block_discontinuity.cpp | 437 +++---- benchmark/benchmark_block_exchange.cpp | 511 ++++---- benchmark/benchmark_block_histogram.cpp | 308 ++--- benchmark/benchmark_block_merge_sort.cpp | 370 +++--- benchmark/benchmark_block_radix_rank.cpp | 326 +++--- benchmark/benchmark_block_radix_sort.cpp | 490 ++++---- benchmark/benchmark_block_reduce.cpp | 315 ++--- .../benchmark_block_run_length_decode.cpp | 362 +++--- benchmark/benchmark_block_scan.cpp | 336 +++--- benchmark/benchmark_block_shuffle.cpp | 473 ++++---- .../benchmark_device_adjacent_difference.cpp | 326 +++--- benchmark/benchmark_device_batch_copy.cpp | 602 +++++----- benchmark/benchmark_device_batch_memcpy.cpp | 634 +++++----- benchmark/benchmark_device_histogram.cpp | 1042 +++++++++-------- benchmark/benchmark_device_memory.cpp | 691 ++++++----- benchmark/benchmark_device_merge_sort.cpp | 490 ++++---- benchmark/benchmark_device_partition.cpp | 688 ++++++----- benchmark/benchmark_device_radix_sort.cpp | 755 +++++++----- benchmark/benchmark_device_reduce.cpp | 282 ++--- benchmark/benchmark_device_reduce_by_key.cpp | 356 +++--- .../benchmark_device_run_length_encode.cpp | 556 +++++---- benchmark/benchmark_device_scan.cpp | 551 +++++---- .../benchmark_device_segmented_radix_sort.cpp | 788 +++++++------ .../benchmark_device_segmented_reduce.cpp | 413 ++++--- benchmark/benchmark_device_segmented_sort.cpp | 840 +++++++------ benchmark/benchmark_device_select.cpp | 894 +++++++------- benchmark/benchmark_device_spmv.cpp | 395 ++++--- benchmark/benchmark_utils.hpp | 669 ++++++----- benchmark/benchmark_warp_exchange.cpp | 540 +++++---- benchmark/benchmark_warp_load.cpp | 416 +++---- benchmark/benchmark_warp_merge_sort.cpp | 858 ++++++++------ benchmark/benchmark_warp_reduce.cpp | 348 +++--- benchmark/benchmark_warp_scan.cpp | 337 +++--- benchmark/benchmark_warp_store.cpp | 401 ++++--- 35 files changed, 10023 insertions(+), 8404 deletions(-) diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp index 70b28f63..9ffdfa1e 100644 --- a/benchmark/benchmark_block_adjacent_difference.cpp +++ b/benchmark/benchmark_block_adjacent_difference.cpp @@ -32,341 +32,388 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -template -__global__ __launch_bounds__(BlockSize) void kernel(Args... args) { - Benchmark::template run(args...); +template +__global__ __launch_bounds__(BlockSize) void kernel(Args... args) +{ + Benchmark::template run(args...); } -template struct minus { - HIPCUB_HOST_DEVICE inline constexpr T operator()(const T &a, - const T &b) const { - return a - b; - } +template +struct minus +{ + HIPCUB_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const + { + return a - b; + } }; -struct subtract_left { - template - __device__ static void run(const T *d_input, T *d_output, - unsigned int trials) { - const unsigned int lid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; +struct subtract_left +{ + template + __device__ static void run(const T* d_input, T* d_output, unsigned int trials) + { + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - hipcub::BlockAdjacentDifference adjacent_difference; + hipcub::BlockAdjacentDifference adjacent_difference; #pragma nounroll - for (unsigned int trial = 0; trial < trials; trial++) { - T output[ItemsPerThread]; - if (WithTile) { - adjacent_difference.SubtractLeft(input, output, minus{}, T(123)); - } else { - adjacent_difference.SubtractLeft(input, output, minus{}); - } - - for (unsigned int i = 0; i < ItemsPerThread; ++i) { - input[i] += output[i]; - } - - __syncthreads(); + for(unsigned int trial = 0; trial < trials; trial++) + { + T output[ItemsPerThread]; + if(WithTile) + { + adjacent_difference.SubtractLeft(input, output, minus{}, T(123)); + } else + { + adjacent_difference.SubtractLeft(input, output, minus{}); + } + + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + input[i] += output[i]; + } + + __syncthreads(); + } + + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } - - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); - } }; -struct subtract_left_partial_tile { - template - __device__ static void run(const T *d_input, const int *tile_sizes, - T *d_output, unsigned int trials) { - const unsigned int lid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; +struct subtract_left_partial_tile +{ + template + __device__ static void + run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials) + { + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - hipcub::BlockAdjacentDifference adjacent_difference; + hipcub::BlockAdjacentDifference adjacent_difference; - int tile_size = tile_sizes[blockIdx.x]; + int tile_size = tile_sizes[blockIdx.x]; - // Try to evenly distribute the length of tile_sizes between all the trials - const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; + // Try to evenly distribute the length of tile_sizes between all the trials + const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; #pragma nounroll - for (unsigned int trial = 0; trial < trials; trial++) { - T output[ItemsPerThread]; - - if (WithTile) { - adjacent_difference.SubtractLeftPartialTile(input, output, minus{}, - tile_size, T(123)); - } else { - adjacent_difference.SubtractLeftPartialTile(input, output, minus{}, - tile_size); - } - - for (unsigned int i = 0; i < ItemsPerThread; ++i) { - input[i] += output[i]; - } - - // Change the tile_size to even out the distribution - tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); - __syncthreads(); + for(unsigned int trial = 0; trial < trials; trial++) + { + T output[ItemsPerThread]; + + if(WithTile) + { + adjacent_difference.SubtractLeftPartialTile(input, + output, + minus{}, + tile_size, + T(123)); + } else + { + adjacent_difference.SubtractLeftPartialTile(input, output, minus{}, tile_size); + } + + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + input[i] += output[i]; + } + + // Change the tile_size to even out the distribution + tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); + __syncthreads(); + } + + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } - - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); - } }; -struct subtract_right { - template - __device__ static void run(const T *d_input, T *d_output, - unsigned int trials) { - const unsigned int lid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; +struct subtract_right +{ + template + __device__ static void run(const T* d_input, T* d_output, unsigned int trials) + { + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - hipcub::BlockAdjacentDifference adjacent_difference; + hipcub::BlockAdjacentDifference adjacent_difference; #pragma nounroll - for (unsigned int trial = 0; trial < trials; trial++) { - T output[ItemsPerThread]; - if (WithTile) { - adjacent_difference.SubtractRight(input, output, minus{}, T(123)); - } else { - adjacent_difference.SubtractRight(input, output, minus{}); - } - - for (unsigned int i = 0; i < ItemsPerThread; ++i) { - input[i] += output[i]; - } - - __syncthreads(); + for(unsigned int trial = 0; trial < trials; trial++) + { + T output[ItemsPerThread]; + if(WithTile) + { + adjacent_difference.SubtractRight(input, output, minus{}, T(123)); + } else + { + adjacent_difference.SubtractRight(input, output, minus{}); + } + + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + input[i] += output[i]; + } + + __syncthreads(); + } + + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } - - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); - } }; -struct subtract_right_partial_tile { - template - __device__ static void run(const T *d_input, const int *tile_sizes, - T *d_output, unsigned int trials) { - const unsigned int lid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; +struct subtract_right_partial_tile +{ + template + __device__ static void + run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials) + { + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - hipcub::BlockAdjacentDifference adjacent_difference; + hipcub::BlockAdjacentDifference adjacent_difference; - int tile_size = tile_sizes[blockIdx.x]; + int tile_size = tile_sizes[blockIdx.x]; - // Try to evenly distribute the length of tile_sizes between all the trials - const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; + // Try to evenly distribute the length of tile_sizes between all the trials + const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; #pragma nounroll - for (unsigned int trial = 0; trial < trials; trial++) { - T output[ItemsPerThread]; + for(unsigned int trial = 0; trial < trials; trial++) + { + T output[ItemsPerThread]; - adjacent_difference.SubtractRightPartialTile(input, output, minus{}, - tile_size); + adjacent_difference.SubtractRightPartialTile(input, output, minus{}, tile_size); - for (unsigned int i = 0; i < ItemsPerThread; ++i) { - input[i] += output[i]; - } + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + input[i] += output[i]; + } - // Change the tile_size to even out the distribution - tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); - __syncthreads(); - } + // Change the tile_size to even out the distribution + tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); + __syncthreads(); + } - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); - } + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); + } }; -template -auto run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) - -> std::enable_if_t< - !std::is_same::value && - !std::is_same::value> { - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto num_blocks = (N + items_per_block - 1) / items_per_block; - // Round up size to the next multiple of items_per_block - const auto size = num_blocks * items_per_block; - - const std::vector input = - benchmark_utils::get_random_data(size, T(0), T(10)); - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), - hipMemcpyHostToDevice)); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_output, - Trials); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); +template +auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) + -> std::enable_if_t::value + && !std::is_same::value> +{ + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto num_blocks = (N + items_per_block - 1) / items_per_block; + // Round up size to the next multiple of items_per_block + const auto size = num_blocks * items_per_block; + + const std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK( + hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(num_blocks), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + Trials); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -template -auto run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) - -> std::enable_if_t< - std::is_same::value || - std::is_same::value> { - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto num_blocks = (N + items_per_block - 1) / items_per_block; - // Round up size to the next multiple of items_per_block - const auto size = num_blocks * items_per_block; - - const std::vector input = - benchmark_utils::get_random_data(size, T(0), T(10)); - const std::vector tile_sizes = - benchmark_utils::get_random_data(num_blocks, 0, items_per_block); - - T *d_input; - int *d_tile_sizes; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); - HIP_CHECK( - hipMalloc(&d_tile_sizes, tile_sizes.size() * sizeof(tile_sizes[0]))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_tile_sizes, tile_sizes.data(), - tile_sizes.size() * sizeof(tile_sizes[0]), - hipMemcpyHostToDevice)); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_tile_sizes, - d_output, Trials); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_tile_sizes)); - HIP_CHECK(hipFree(d_output)); +template +auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) + -> std::enable_if_t::value + || std::is_same::value> +{ + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto num_blocks = (N + items_per_block - 1) / items_per_block; + // Round up size to the next multiple of items_per_block + const auto size = num_blocks * items_per_block; + + const std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); + const std::vector tile_sizes + = benchmark_utils::get_random_data(num_blocks, 0, items_per_block); + + T* d_input; + int* d_tile_sizes; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); + HIP_CHECK(hipMalloc(&d_tile_sizes, tile_sizes.size() * sizeof(tile_sizes[0]))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK( + hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_tile_sizes, + tile_sizes.data(), + tile_sizes.size() * sizeof(tile_sizes[0]), + hipMemcpyHostToDevice)); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(num_blocks), + dim3(BlockSize), + 0, + stream, + d_input, + d_tile_sizes, + d_output, + Trials); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_tile_sizes)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ - benchmark::RegisterBenchmark( \ - std::string("block_adjacent_difference.sub_algorithm_name:") + \ - name + \ - std::string("") \ - .c_str(), \ - &run_benchmark, stream, size) - -#define BENCHMARK_TYPE(type, block, with_tile) \ - CREATE_BENCHMARK(type, block, 1, with_tile), \ - CREATE_BENCHMARK(type, block, 3, with_tile), \ - CREATE_BENCHMARK(type, block, 4, with_tile), \ - CREATE_BENCHMARK(type, block, 8, with_tile), \ - CREATE_BENCHMARK(type, block, 16, with_tile), \ - CREATE_BENCHMARK(type, block, 32, with_tile) - -template -void add_benchmarks(const std::string &name, - std::vector &benchmarks, - hipStream_t stream, size_t size) { - std::vector bs = { - BENCHMARK_TYPE(int, 256, false), BENCHMARK_TYPE(float, 256, false), - BENCHMARK_TYPE(int8_t, 256, false), BENCHMARK_TYPE(long long, 256, false), - BENCHMARK_TYPE(double, 256, false)}; - - if (!std::is_same::value) { - bs.insert(bs.end(), - {BENCHMARK_TYPE(int, 256, true), BENCHMARK_TYPE(float, 256, true), - BENCHMARK_TYPE(int8_t, 256, true), - BENCHMARK_TYPE(long long, 256, true), - BENCHMARK_TYPE(double, 256, true)}); - } - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ + benchmark::RegisterBenchmark( \ + std::string("block_adjacent_difference.sub_algorithm_name:") \ + + name + std::string("").c_str(), \ + &run_benchmark, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, block, with_tile) \ + CREATE_BENCHMARK(type, block, 1, with_tile), CREATE_BENCHMARK(type, block, 3, with_tile), \ + CREATE_BENCHMARK(type, block, 4, with_tile), CREATE_BENCHMARK(type, block, 8, with_tile), \ + CREATE_BENCHMARK(type, block, 16, with_tile), CREATE_BENCHMARK(type, block, 32, with_tile) + +template +void add_benchmarks(const std::string& name, + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + std::vector bs = {BENCHMARK_TYPE(int, 256, false), + BENCHMARK_TYPE(float, 256, false), + BENCHMARK_TYPE(int8_t, 256, false), + BENCHMARK_TYPE(long long, 256, false), + BENCHMARK_TYPE(double, 256, false)}; + + if(!std::is_same::value) + { + bs.insert(bs.end(), + {BENCHMARK_TYPE(int, 256, true), + BENCHMARK_TYPE(float, 256, true), + BENCHMARK_TYPE(int8_t, 256, true), + BENCHMARK_TYPE(long long, 256, true), + BENCHMARK_TYPE(double, 256, true)}); + } + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_block_adjacent_difference" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks("SubtractLeft", benchmarks, stream, size); - add_benchmarks("SubtractRight", benchmarks, stream, size); - add_benchmarks("SubtractLeftPartialTile", - benchmarks, stream, size); - add_benchmarks("SubtractRightPartialTile", - benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_block_adjacent_difference" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks("SubtractLeft", benchmarks, stream, size); + add_benchmarks("SubtractRight", benchmarks, stream, size); + add_benchmarks("SubtractLeftPartialTile", benchmarks, stream, size); + add_benchmarks("SubtractRightPartialTile", + benchmarks, + stream, + size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } \ No newline at end of file diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp index 79a5aa33..24446c9a 100644 --- a/benchmark/benchmark_block_discontinuity.cpp +++ b/benchmark/benchmark_block_discontinuity.cpp @@ -33,228 +33,273 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -template struct custom_flag_op1 { - HIPCUB_HOST_DEVICE - bool operator()(const T &a, const T &b) const { return (a == b); } +template +struct custom_flag_op1 +{ + HIPCUB_HOST_DEVICE + bool operator()(const T& a, const T& b) const + { + return (a == b); + } }; -template -__global__ __launch_bounds__(BlockSize) void kernel(const T *d_input, - T *d_output) { - Runner::template run( - d_input, d_output); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T* d_input, T* d_output) +{ + Runner::template run(d_input, d_output); } -struct flag_heads { - template - __device__ static void run(const T *d_input, T *d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = - hipBlockIdx_x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); +struct flag_heads +{ + template + __device__ static void run(const T* d_input, T* d_output) + { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - hipcub::BlockDiscontinuity bdiscontinuity; - bool head_flags[ItemsPerThread]; - if (WithTile) { - bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality(), T(123)); - } else { - bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality()); - } - - for (unsigned int i = 0; i < ItemsPerThread; i++) { - input[i] += head_flags[i]; - } - __syncthreads(); + for(unsigned int trial = 0; trial < Trials; trial++) + { + hipcub::BlockDiscontinuity bdiscontinuity; + bool head_flags[ItemsPerThread]; + if(WithTile) + { + bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality(), T(123)); + } else + { + bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality()); + } + + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + input[i] += head_flags[i]; + } + __syncthreads(); + } + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); - } }; -struct flag_tails { - template - __device__ static void run(const T *d_input, T *d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = - hipBlockIdx_x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); +struct flag_tails +{ + template + __device__ static void run(const T* d_input, T* d_output) + { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - hipcub::BlockDiscontinuity bdiscontinuity; - bool tail_flags[ItemsPerThread]; - if (WithTile) { - bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality(), T(123)); - } else { - bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality()); - } - - for (unsigned int i = 0; i < ItemsPerThread; i++) { - input[i] += tail_flags[i]; - } - __syncthreads(); + for(unsigned int trial = 0; trial < Trials; trial++) + { + hipcub::BlockDiscontinuity bdiscontinuity; + bool tail_flags[ItemsPerThread]; + if(WithTile) + { + bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality(), T(123)); + } else + { + bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality()); + } + + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + input[i] += tail_flags[i]; + } + __syncthreads(); + } + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); - } }; -struct flag_heads_and_tails { - template - __device__ static void run(const T *d_input, T *d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = - hipBlockIdx_x * ItemsPerThread * BlockSize; - - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); +struct flag_heads_and_tails +{ + template + __device__ static void run(const T* d_input, T* d_output) + { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - hipcub::BlockDiscontinuity bdiscontinuity; - bool head_flags[ItemsPerThread]; - bool tail_flags[ItemsPerThread]; - if (WithTile) { - bdiscontinuity.FlagHeadsAndTails(head_flags, T(123), tail_flags, T(234), - input, hipcub::Equality()); - } else { - bdiscontinuity.FlagHeadsAndTails(head_flags, tail_flags, input, - hipcub::Equality()); - } - - for (unsigned int i = 0; i < ItemsPerThread; i++) { - input[i] += head_flags[i]; - input[i] += tail_flags[i]; - } - __syncthreads(); + for(unsigned int trial = 0; trial < Trials; trial++) + { + hipcub::BlockDiscontinuity bdiscontinuity; + bool head_flags[ItemsPerThread]; + bool tail_flags[ItemsPerThread]; + if(WithTile) + { + bdiscontinuity.FlagHeadsAndTails(head_flags, + T(123), + tail_flags, + T(234), + input, + hipcub::Equality()); + } else + { + bdiscontinuity.FlagHeadsAndTails(head_flags, tail_flags, input, hipcub::Equality()); + } + + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + input[i] += head_flags[i]; + input[i] += tail_flags[i]; + } + __syncthreads(); + } + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); - } }; -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - - std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel), - dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, - d_output); - HIP_CHECK(hipPeekAtLastError()); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ - benchmark::RegisterBenchmark( \ - std::string("block_discontinuity.sub_algorithm_name:" + \ - name + \ - ".") \ - .c_str(), \ - &run_benchmark, stream, size) - -#define BENCHMARK_TYPE(type, block, bool) \ - CREATE_BENCHMARK(type, block, 1, bool), \ - CREATE_BENCHMARK(type, block, 2, bool), \ - CREATE_BENCHMARK(type, block, 3, bool), \ - CREATE_BENCHMARK(type, block, 4, bool), \ - CREATE_BENCHMARK(type, block, 8, bool) - -template -void add_benchmarks(const std::string &name, - std::vector &benchmarks, - hipStream_t stream, size_t size) { - std::vector bs = { - BENCHMARK_TYPE(int, 256, false), - BENCHMARK_TYPE(int, 256, true), - BENCHMARK_TYPE(int8_t, 256, false), - BENCHMARK_TYPE(int8_t, 256, true), - BENCHMARK_TYPE(uint8_t, 256, false), - BENCHMARK_TYPE(uint8_t, 256, true), - BENCHMARK_TYPE(long long, 256, false), - BENCHMARK_TYPE(long long, 256, true), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ + benchmark::RegisterBenchmark( \ + std::string("block_discontinuity.sub_algorithm_name:" \ + + name + ".") \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, block, bool) \ + CREATE_BENCHMARK(type, block, 1, bool), CREATE_BENCHMARK(type, block, 2, bool), \ + CREATE_BENCHMARK(type, block, 3, bool), CREATE_BENCHMARK(type, block, 4, bool), \ + CREATE_BENCHMARK(type, block, 8, bool) + +template +void add_benchmarks(const std::string& name, + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + std::vector bs = { + BENCHMARK_TYPE(int, 256, false), + BENCHMARK_TYPE(int, 256, true), + BENCHMARK_TYPE(int8_t, 256, false), + BENCHMARK_TYPE(int8_t, 256, true), + BENCHMARK_TYPE(uint8_t, 256, false), + BENCHMARK_TYPE(uint8_t, 256, true), + BENCHMARK_TYPE(long long, 256, false), + BENCHMARK_TYPE(long long, 256, true), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_discontinuity" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks("flag_heads", benchmarks, stream, size); - add_benchmarks("flag_tails", benchmarks, stream, size); - add_benchmarks("flag_heads_and_tails", benchmarks, - stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_discontinuity" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks("flag_heads", benchmarks, stream, size); + add_benchmarks("flag_tails", benchmarks, stream, size); + add_benchmarks("flag_heads_and_tails", benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp index d91a2297..a36d041a 100644 --- a/benchmark/benchmark_block_exchange.cpp +++ b/benchmark/benchmark_block_exchange.cpp @@ -31,300 +31,311 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__global__ __launch_bounds__(BlockSize) void kernel(const T *d_input, - const unsigned int *d_ranks, - T *d_output) { - Runner::template run(d_input, d_ranks, - d_output); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T* d_input, + const unsigned int* d_ranks, + T* d_output) +{ + Runner::template run(d_input, d_ranks, d_output); } -struct blocked_to_striped { - template - __device__ static void run(const T *d_input, const unsigned int *, - T *d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = - hipBlockIdx_x * ItemsPerThread * BlockSize; +struct blocked_to_striped +{ + template + __device__ static void run(const T* d_input, const unsigned int*, T* d_output) + { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; - hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); + T input[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - hipcub::BlockExchange exchange; - exchange.BlockedToStriped(input, input); - __syncthreads(); // extra sync needed because of loop. In normal usage - // sync with be cared for by the load and store functions - // (outside the loop). + for(unsigned int trial = 0; trial < Trials; trial++) + { + hipcub::BlockExchange exchange; + exchange.BlockedToStriped(input, input); + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). + } + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); - } }; -struct striped_to_blocked { - template - __device__ static void run(const T *d_input, const unsigned int *, - T *d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = - hipBlockIdx_x * ItemsPerThread * BlockSize; +struct striped_to_blocked +{ + template + __device__ static void run(const T* d_input, const unsigned int*, T* d_output) + { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + T input[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - hipcub::BlockExchange exchange; - exchange.StripedToBlocked(input, input); - __syncthreads(); // extra sync needed because of loop. In normal usage - // sync with be cared for by the load and store functions - // (outside the loop). + for(unsigned int trial = 0; trial < Trials; trial++) + { + hipcub::BlockExchange exchange; + exchange.StripedToBlocked(input, input); + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). + } + hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); } - hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); - } }; -struct blocked_to_warp_striped { - template - __device__ static void run(const T *d_input, const unsigned int *, - T *d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = - hipBlockIdx_x * ItemsPerThread * BlockSize; +struct blocked_to_warp_striped +{ + template + __device__ static void run(const T* d_input, const unsigned int*, T* d_output) + { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; - hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); + T input[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - hipcub::BlockExchange exchange; - exchange.BlockedToWarpStriped(input, input); - __syncthreads(); // extra sync needed because of loop. In normal usage - // sync with be cared for by the load and store functions - // (outside the loop). + for(unsigned int trial = 0; trial < Trials; trial++) + { + hipcub::BlockExchange exchange; + exchange.BlockedToWarpStriped(input, input); + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). + } + hipcub::StoreDirectWarpStriped(lid, d_output + block_offset, input); } - hipcub::StoreDirectWarpStriped(lid, d_output + block_offset, input); - } }; -struct warp_striped_to_blocked { - template - __device__ static void run(const T *d_input, const unsigned int *, - T *d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = - hipBlockIdx_x * ItemsPerThread * BlockSize; +struct warp_striped_to_blocked +{ + template + __device__ static void run(const T* d_input, const unsigned int*, T* d_output) + { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; - hipcub::LoadDirectWarpStriped(lid, d_input + block_offset, input); + T input[ItemsPerThread]; + hipcub::LoadDirectWarpStriped(lid, d_input + block_offset, input); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - hipcub::BlockExchange exchange; - exchange.WarpStripedToBlocked(input, input); - __syncthreads(); // extra sync needed because of loop. In normal usage - // sync with be cared for by the load and store functions - // (outside the loop). + for(unsigned int trial = 0; trial < Trials; trial++) + { + hipcub::BlockExchange exchange; + exchange.WarpStripedToBlocked(input, input); + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). + } + hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); } - hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); - } }; -struct scatter_to_blocked { - template - __device__ static void run(const T *d_input, const unsigned int *d_ranks, - T *d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = - hipBlockIdx_x * ItemsPerThread * BlockSize; +struct scatter_to_blocked +{ + template + __device__ static void run(const T* d_input, const unsigned int* d_ranks, T* d_output) + { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; - unsigned int ranks[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); + T input[ItemsPerThread]; + unsigned int ranks[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - hipcub::BlockExchange exchange; - exchange.ScatterToBlocked(input, input, ranks); - __syncthreads(); // extra sync needed because of loop. In normal usage - // sync with be cared for by the load and store functions - // (outside the loop). + for(unsigned int trial = 0; trial < Trials; trial++) + { + hipcub::BlockExchange exchange; + exchange.ScatterToBlocked(input, input, ranks); + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). + } + hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); } - hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); - } }; -struct scatter_to_striped { - template - __device__ static void run(const T *d_input, const unsigned int *d_ranks, - T *d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = - hipBlockIdx_x * ItemsPerThread * BlockSize; +struct scatter_to_striped +{ + template + __device__ static void run(const T* d_input, const unsigned int* d_ranks, T* d_output) + { + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; - unsigned int ranks[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); + T input[ItemsPerThread]; + unsigned int ranks[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, d_input + block_offset, input); + hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - hipcub::BlockExchange exchange; - exchange.ScatterToStriped(input, input, ranks); - __syncthreads(); // extra sync needed because of loop. In normal usage - // sync with be cared for by the load and store functions - // (outside the loop). + for(unsigned int trial = 0; trial < Trials; trial++) + { + hipcub::BlockExchange exchange; + exchange.ScatterToStriped(input, input, ranks); + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). + } + hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } - hipcub::StoreDirectStriped(lid, d_output + block_offset, input); - } }; -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - - std::vector input(size); - // Fill input - for (size_t i = 0; i < size; i++) { - input[i] = T(i); - } - std::vector ranks(size); - // Fill ranks (for scatter operations) - std::mt19937 gen; - for (size_t bi = 0; bi < size / items_per_block; bi++) { - auto block_ranks = ranks.begin() + bi * items_per_block; - std::iota(block_ranks, block_ranks + items_per_block, 0); - std::shuffle(block_ranks, block_ranks + items_per_block, gen); - } - T *d_input; - unsigned int *d_ranks; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_ranks, size * sizeof(unsigned int))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_ranks, ranks.data(), size * sizeof(unsigned int), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel), - dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, - d_ranks, d_output); - HIP_CHECK(hipPeekAtLastError()); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input(size); + // Fill input + for(size_t i = 0; i < size; i++) + { + input[i] = T(i); + } + std::vector ranks(size); + // Fill ranks (for scatter operations) + std::mt19937 gen; + for(size_t bi = 0; bi < size / items_per_block; bi++) + { + auto block_ranks = ranks.begin() + bi * items_per_block; + std::iota(block_ranks, block_ranks + items_per_block, 0); + std::shuffle(block_ranks, block_ranks + items_per_block, gen); + } + T* d_input; + unsigned int* d_ranks; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_ranks, size * sizeof(unsigned int))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_ranks, ranks.data(), size * sizeof(unsigned int), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_ranks)); - HIP_CHECK(hipFree(d_output)); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_ranks, + d_output); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_ranks)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - std::string("block_exchange.sub_algorithm_name:" + \ - name) \ - .c_str(), \ - &run_benchmark, stream, size) - -#define BENCHMARK_TYPE(type, block) \ - CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ - CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ - CREATE_BENCHMARK(type, block, 7), CREATE_BENCHMARK(type, block, 8) - -template -void add_benchmarks(const std::string &name, - std::vector &benchmarks, - hipStream_t stream, size_t size) { - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = { - BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(int8_t, 256), - BENCHMARK_TYPE(long long, 256), - BENCHMARK_TYPE(custom_float2, 256), - BENCHMARK_TYPE(custom_double2, 256), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_exchange.sub_algorithm_name:" \ + + name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, block) \ + CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ + CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ + CREATE_BENCHMARK(type, block, 7), CREATE_BENCHMARK(type, block, 8) + +template +void add_benchmarks(const std::string& name, + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(int8_t, 256), + BENCHMARK_TYPE(long long, 256), + BENCHMARK_TYPE(custom_float2, 256), + BENCHMARK_TYPE(custom_double2, 256), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_exchange" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks("blocked_to_striped", benchmarks, stream, - size); - add_benchmarks("striped_to_blocked", benchmarks, stream, - size); - add_benchmarks("blocked_to_warp_striped", benchmarks, - stream, size); - add_benchmarks("warp_striped_to_blocked", benchmarks, - stream, size); - add_benchmarks("scatter_to_blocked", benchmarks, stream, - size); - add_benchmarks("scatter_to_striped", benchmarks, stream, - size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_exchange" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks("blocked_to_striped", benchmarks, stream, size); + add_benchmarks("striped_to_blocked", benchmarks, stream, size); + add_benchmarks("blocked_to_warp_striped", benchmarks, stream, size); + add_benchmarks("warp_striped_to_blocked", benchmarks, stream, size); + add_benchmarks("scatter_to_blocked", benchmarks, stream, size); + add_benchmarks("scatter_to_striped", benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp index b9fe9f54..122ccc36 100644 --- a/benchmark/benchmark_block_histogram.cpp +++ b/benchmark/benchmark_block_histogram.cpp @@ -29,167 +29,187 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -template -__global__ __launch_bounds__(BlockSize) void kernel(const T *input, T *output) { - Runner::template run(input, - output); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) +{ + Runner::template run(input, output); } -template struct histogram { - template - __device__ static void run(const T *input, T *output) { - const unsigned int index = - ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - unsigned int global_offset = hipBlockIdx_x * BinSize; - - T values[ItemsPerThread]; - for (unsigned int k = 0; k < ItemsPerThread; k++) { - values[k] = input[index + k]; - } - - using bhistogram_t = hipcub::BlockHistogram; - __shared__ T histogram[BinSize]; - __shared__ typename bhistogram_t::TempStorage storage; +template +struct histogram +{ + template + __device__ static void run(const T* input, T* output) + { + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + unsigned int global_offset = hipBlockIdx_x * BinSize; + + T values[ItemsPerThread]; + for(unsigned int k = 0; k < ItemsPerThread; k++) + { + values[k] = input[index + k]; + } + + using bhistogram_t + = hipcub::BlockHistogram; + __shared__ T histogram[BinSize]; + __shared__ typename bhistogram_t::TempStorage storage; #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - bhistogram_t(storage).Histogram(values, histogram); - } + for(unsigned int trial = 0; trial < Trials; trial++) + { + bhistogram_t(storage).Histogram(values, histogram); + } #pragma unroll - for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) { - if (offset + hipThreadIdx_x < BinSize) { - output[global_offset + hipThreadIdx_x] = - histogram[offset + hipThreadIdx_x]; - global_offset += BlockSize; - } + for(unsigned int offset = 0; offset < BinSize; offset += BlockSize) + { + if(offset + hipThreadIdx_x < BinSize) + { + output[global_offset + hipThreadIdx_x] = histogram[offset + hipThreadIdx_x]; + global_offset += BlockSize; + } + } } - } }; -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { - // Make sure size is a multiple of BlockSize - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - const auto bin_size = BinSize * ((N + items_per_block - 1) / items_per_block); - // Allocate and fill memory - std::vector input(size, 0.0f); - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, bin_size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel), - dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, - d_output); - HIP_CHECK(hipPeekAtLastError()); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + // Make sure size is a multiple of BlockSize + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); + const auto bin_size = BinSize * ((N + items_per_block - 1) / items_per_block); + // Allocate and fill memory + std::vector input(size, 0.0f); + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, bin_size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); - state.SetItemsProcessed(state.iterations() * size * Trials); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); + state.SetItemsProcessed(state.iterations() * size * Trials); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - std::string("block_histogram.method_name:" + method_name) \ - .c_str(), \ - &run_benchmark, stream, size) - -#define BENCHMARK_TYPE(type, block) \ - CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ - CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ - CREATE_BENCHMARK(type, block, 8), CREATE_BENCHMARK(type, block, 16) - -template -void add_benchmarks(std::vector &benchmarks, - const std::string &method_name, - const std::string &algorithm_name, hipStream_t stream, - size_t size) { - std::vector new_benchmarks = { - BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 320), - BENCHMARK_TYPE(int, 512), - - BENCHMARK_TYPE(unsigned long long, 256), - BENCHMARK_TYPE(unsigned long long, 320)}; - benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), - new_benchmarks.end()); +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_histogram.method_name:" + method_name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, block) \ + CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ + CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ + CREATE_BENCHMARK(type, block, 8), CREATE_BENCHMARK(type, block, 16) + +template +void add_benchmarks(std::vector& benchmarks, + const std::string& method_name, + const std::string& algorithm_name, + hipStream_t stream, + size_t size) +{ + std::vector new_benchmarks + = {BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(int, 320), + BENCHMARK_TYPE(int, 512), + + BENCHMARK_TYPE(unsigned long long, 256), + BENCHMARK_TYPE(unsigned long long, 320)}; + benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_histogram" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - // using_atomic - using histogram_a_t = - histogram; - add_benchmarks(benchmarks, "histogram", "using_atomic", stream, - size); - // using_sort - using histogram_s_t = - histogram; - add_benchmarks(benchmarks, "histogram", "using_sort", stream, - size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_histogram" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + // using_atomic + using histogram_a_t = histogram; + add_benchmarks(benchmarks, "histogram", "using_atomic", stream, size); + // using_sort + using histogram_s_t = histogram; + add_benchmarks(benchmarks, "histogram", "using_sort", stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_merge_sort.cpp b/benchmark/benchmark_block_merge_sort.cpp index 2c6b62aa..62ffbdfa 100644 --- a/benchmark/benchmark_block_merge_sort.cpp +++ b/benchmark/benchmark_block_merge_sort.cpp @@ -32,184 +32,232 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -enum class benchmark_kinds { sort_keys, sort_pairs }; - -template -__global__ -__launch_bounds__(BlockSize) void sort_keys_kernel(const T *input, T *output, - CompareOp compare_op) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T keys[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, input + block_offset, keys); +enum class benchmark_kinds +{ + sort_keys, + sort_pairs +}; + +template +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, + T* output, + CompareOp compare_op) +{ + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + + T keys[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, input + block_offset, keys); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - hipcub::BlockMergeSort sort; - sort.Sort(keys, compare_op); - } + for(unsigned int trial = 0; trial < Trials; trial++) + { + hipcub::BlockMergeSort sort; + sort.Sort(keys, compare_op); + } - hipcub::StoreDirectStriped(lid, output + block_offset, keys); + hipcub::StoreDirectStriped(lid, output + block_offset, keys); } -template -__global__ -__launch_bounds__(BlockSize) void sort_pairs_kernel(const T *input, T *output, - CompareOp compare_op) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T keys[ItemsPerThread]; - T values[ItemsPerThread]; - hipcub::LoadDirectStriped(lid, input + block_offset, keys); - - for (unsigned int i = 0; i < ItemsPerThread; i++) { - values[i] = keys[i] + T(1); - } +template +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, + T* output, + CompareOp compare_op) +{ + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + + T keys[ItemsPerThread]; + T values[ItemsPerThread]; + hipcub::LoadDirectStriped(lid, input + block_offset, keys); + + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + values[i] = keys[i] + T(1); + } #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - hipcub::BlockMergeSort sort; - sort.Sort(keys, values, compare_op); - } - - for (unsigned int i = 0; i < ItemsPerThread; i++) { - keys[i] += values[i]; - } - hipcub::StoreDirectStriped(lid, output + block_offset, keys); + for(unsigned int trial = 0; trial < Trials; trial++) + { + hipcub::BlockMergeSort sort; + sort.Sort(keys, values, compare_op); + } + + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + keys[i] += values[i]; + } + hipcub::StoreDirectStriped(lid, output + block_offset, keys); } -template -void run_benchmark(benchmark::State &state, benchmark_kinds benchmark_kind, - hipStream_t stream, size_t N) { - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - - std::vector input; - if (std::is_floating_point::value) { - input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); - } else { - input = benchmark_utils::get_random_data( - size, std::numeric_limits::min(), std::numeric_limits::max()); - } - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - if (benchmark_kind == benchmark_kinds::sort_keys) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_keys_kernel), - dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, - d_output, CompareOp()); - } else if (benchmark_kind == benchmark_kinds::sort_pairs) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_pairs_kernel), - dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, - d_output, CompareOp()); +template +void run_benchmark(benchmark::State& state, + benchmark_kinds benchmark_kind, + hipStream_t stream, + size_t N) +{ + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input; + if(std::is_floating_point::value) + { + input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); + } else + { + input = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); } - HIP_CHECK(hipPeekAtLastError()); + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + if(benchmark_kind == benchmark_kinds::sort_keys) + { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sort_keys_kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + CompareOp()); + } else if(benchmark_kind == benchmark_kinds::sort_pairs) + { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sort_pairs_kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + CompareOp()); + } + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - std::string("block_merge_sort.sub_algorithm_name:" + \ - name) \ - .c_str(), \ - &run_benchmark, benchmark_kind, stream, size) - -#define BENCHMARK_TYPE(type, block) \ - CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ - CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ - CREATE_BENCHMARK(type, block, 8) - -void add_benchmarks(benchmark_kinds benchmark_kind, const std::string &name, - std::vector &benchmarks, - hipStream_t stream, size_t size) { - std::vector bs = { - BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(int, 128), - BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 512), - - BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(int8_t, 128), - BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(int8_t, 512), - - BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(uint8_t, 128), - BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(uint8_t, 512), - - BENCHMARK_TYPE(long long, 64), BENCHMARK_TYPE(long long, 128), - BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(long long, 512)}; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_merge_sort.sub_algorithm_name:" \ + + name) \ + .c_str(), \ + &run_benchmark, \ + benchmark_kind, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, block) \ + CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ + CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ + CREATE_BENCHMARK(type, block, 8) + +void add_benchmarks(benchmark_kinds benchmark_kind, + const std::string& name, + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + std::vector bs = {BENCHMARK_TYPE(int, 64), + BENCHMARK_TYPE(int, 128), + BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(int, 512), + + BENCHMARK_TYPE(int8_t, 64), + BENCHMARK_TYPE(int8_t, 128), + BENCHMARK_TYPE(int8_t, 256), + BENCHMARK_TYPE(int8_t, 512), + + BENCHMARK_TYPE(uint8_t, 64), + BENCHMARK_TYPE(uint8_t, 128), + BENCHMARK_TYPE(uint8_t, 256), + BENCHMARK_TYPE(uint8_t, 512), + + BENCHMARK_TYPE(long long, 64), + BENCHMARK_TYPE(long long, 128), + BENCHMARK_TYPE(long long, 256), + BENCHMARK_TYPE(long long, 512)}; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_merge_sort" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, - size); - add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)", benchmarks, - stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_merge_sort" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, size); + add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)", benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_radix_rank.cpp b/benchmark/benchmark_block_radix_rank.cpp index e9d1f474..8578b75c 100644 --- a/benchmark/benchmark_block_radix_rank.cpp +++ b/benchmark/benchmark_block_radix_rank.cpp @@ -33,116 +33,137 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -enum class RadixRankAlgorithm { - RADIX_RANK_BASIC, - RADIX_RANK_MEMOIZE, - RADIX_RANK_MATCH, +enum class RadixRankAlgorithm +{ + RADIX_RANK_BASIC, + RADIX_RANK_MEMOIZE, + RADIX_RANK_MATCH, }; -template -__global__ __launch_bounds__(BlockSize) void rank_kernel(const T *keys_input, - int *ranks_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - - T keys[ItemsPerThread]; - hipcub::LoadDirectBlocked(lid, keys_input + block_offset, keys); - - using KeyTraits = hipcub::Traits; - using UnsignedBits = typename KeyTraits::UnsignedBits; - using DigitExtractor = hipcub::BFEDigitExtractor; - - UnsignedBits(&unsigned_keys)[ItemsPerThread] = - reinterpret_cast(keys); - - using RankType = std::conditional_t< - BenchmarkKind == RadixRankAlgorithm::RADIX_RANK_MATCH, - hipcub::BlockRadixRankMatch, - hipcub::BlockRadixRank>; +template +__global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_input, int* ranks_output) +{ + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + + T keys[ItemsPerThread]; + hipcub::LoadDirectBlocked(lid, keys_input + block_offset, keys); + + using KeyTraits = hipcub::Traits; + using UnsignedBits = typename KeyTraits::UnsignedBits; + using DigitExtractor = hipcub::BFEDigitExtractor; + + UnsignedBits(&unsigned_keys)[ItemsPerThread] + = reinterpret_cast(keys); + + using RankType = std::conditional_t< + BenchmarkKind == RadixRankAlgorithm::RADIX_RANK_MATCH, + hipcub::BlockRadixRankMatch, + hipcub::BlockRadixRank>; #pragma unroll - for (unsigned int key = 0; key < ItemsPerThread; key++) { - unsigned_keys[key] = KeyTraits::TwiddleIn(unsigned_keys[key]); - } + for(unsigned int key = 0; key < ItemsPerThread; key++) + { + unsigned_keys[key] = KeyTraits::TwiddleIn(unsigned_keys[key]); + } - int ranks[ItemsPerThread]; + int ranks[ItemsPerThread]; #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - __shared__ typename RankType::TempStorage storage; - RankType rank(storage); - unsigned begin_bit = 0; - const unsigned end_bit = sizeof(T) * 8; - - while (begin_bit < end_bit) { - const unsigned pass_bits = min(RadixBits, end_bit - begin_bit); - DigitExtractor digit_extractor(begin_bit, pass_bits); - - rank.RankKeys(unsigned_keys, ranks, digit_extractor); - begin_bit += RadixBits; + for(unsigned int trial = 0; trial < Trials; trial++) + { + __shared__ typename RankType::TempStorage storage; + RankType rank(storage); + unsigned begin_bit = 0; + const unsigned end_bit = sizeof(T) * 8; + + while(begin_bit < end_bit) + { + const unsigned pass_bits = min(RadixBits, end_bit - begin_bit); + DigitExtractor digit_extractor(begin_bit, pass_bits); + + rank.RankKeys(unsigned_keys, ranks, digit_extractor); + begin_bit += RadixBits; + } } - } - hipcub::StoreDirectBlocked(lid, ranks_output + block_offset, ranks); + hipcub::StoreDirectBlocked(lid, ranks_output + block_offset, ranks); } -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - - std::vector input; - if (std::is_floating_point::value) { - input = benchmark_utils::get_random_data(size, static_cast(-1000), - static_cast(1000)); - } else { - input = benchmark_utils::get_random_data( - size, std::numeric_limits::min(), std::numeric_limits::max()); - } - T *d_input; - int *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(int))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(rank_kernel), - dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, - d_output); - HIP_CHECK(hipPeekAtLastError()); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + const unsigned int size = items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input; + if(std::is_floating_point::value) + { + input = benchmark_utils::get_random_data(size, + static_cast(-1000), + static_cast(1000)); + } else + { + input = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); + } + T* d_input; + int* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(int))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + rank_kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, KIND, BS, IPT) \ - benchmark::RegisterBenchmark(std::string("block_radix_rank." + \ - name) \ - .c_str(), \ - &run_benchmark, stream, size) +#define CREATE_BENCHMARK(T, KIND, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_radix_rank." \ + + name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) // clang-format off #define CREATE_BENCHMARK_KINDS(type, block, ipt) \ @@ -158,62 +179,71 @@ void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { CREATE_BENCHMARK_KINDS(type, block, 32) // clang-format on -void add_benchmarks(const std::string &name, - std::vector &benchmarks, - hipStream_t stream, size_t size) { - std::vector bs = { - BENCHMARK_TYPE(int, 128), BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(int, 512), - - BENCHMARK_TYPE(uint8_t, 128), BENCHMARK_TYPE(uint8_t, 256), - BENCHMARK_TYPE(uint8_t, 512), - - BENCHMARK_TYPE(long long, 128), BENCHMARK_TYPE(long long, 256), - BENCHMARK_TYPE(long long, 512), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +void add_benchmarks(const std::string& name, + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + std::vector bs = { + BENCHMARK_TYPE(int, 128), + BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(int, 512), + + BENCHMARK_TYPE(uint8_t, 128), + BENCHMARK_TYPE(uint8_t, 256), + BENCHMARK_TYPE(uint8_t, 512), + + BENCHMARK_TYPE(long long, 128), + BENCHMARK_TYPE(long long, 256), + BENCHMARK_TYPE(long long, 512), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_block_radix_rank" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks("rank", benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_block_radix_rank" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks("rank", benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp index d31fbbac..7413214e 100644 --- a/benchmark/benchmark_block_radix_sort.cpp +++ b/benchmark/benchmark_block_radix_sort.cpp @@ -31,191 +31,218 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -enum class benchmark_kinds { sort_keys, sort_pairs }; - -struct helper_blocked_blocked { - template - HIPCUB_DEVICE static void load(int linear_id, InputIteratorT block_iter, - T (&items)[ItemsPerThread]) { - hipcub::LoadDirectStriped(linear_id, block_iter, items); - } - - template - HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread]) { - hipcub::BlockRadixSort sort; - sort.Sort(keys); - } - - template - HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread], - T (&values)[ItemsPerThread]) { - hipcub::BlockRadixSort sort; - sort.Sort(keys, values); - } - - template - HIPCUB_DEVICE static void - sort(benchmark_utils::custom_type (&keys)[ItemsPerThread]) { - using custom_t = benchmark_utils::custom_type; - hipcub::BlockRadixSort sort; - sort.Sort(keys, benchmark_utils::custom_type_decomposer{}); - } - - template - HIPCUB_DEVICE static void - sort(benchmark_utils::custom_type (&keys)[ItemsPerThread], - benchmark_utils::custom_type (&values)[ItemsPerThread]) { - using custom_t = benchmark_utils::custom_type; - hipcub::BlockRadixSort sort; - sort.Sort(keys, values, - benchmark_utils::custom_type_decomposer{}); - } +enum class benchmark_kinds +{ + sort_keys, + sort_pairs }; -struct helper_blocked_striped { - template - HIPCUB_DEVICE static void load(int linear_id, InputIteratorT block_iter, - T (&items)[ItemsPerThread]) { - hipcub::LoadDirectBlocked(linear_id, block_iter, items); - } - - template - HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread]) { - hipcub::BlockRadixSort sort; - sort.SortBlockedToStriped(keys); - } - - template - HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread], - T (&values)[ItemsPerThread]) { - hipcub::BlockRadixSort sort; - sort.SortBlockedToStriped(keys, values); - } - - template - HIPCUB_DEVICE static void - sort(benchmark_utils::custom_type (&keys)[ItemsPerThread]) { - using custom_t = benchmark_utils::custom_type; - hipcub::BlockRadixSort sort; - sort.SortBlockedToStriped( - keys, benchmark_utils::custom_type_decomposer{}); - } - - template - HIPCUB_DEVICE static void - sort(benchmark_utils::custom_type (&keys)[ItemsPerThread], - benchmark_utils::custom_type (&values)[ItemsPerThread]) { - using custom_t = benchmark_utils::custom_type; - hipcub::BlockRadixSort sort; - sort.SortBlockedToStriped( - keys, values, benchmark_utils::custom_type_decomposer{}); - } +struct helper_blocked_blocked +{ + template + HIPCUB_DEVICE static void + load(int linear_id, InputIteratorT block_iter, T (&items)[ItemsPerThread]) + { + hipcub::LoadDirectStriped(linear_id, block_iter, items); + } + + template + HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread]) + { + hipcub::BlockRadixSort sort; + sort.Sort(keys); + } + + template + HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread], T (&values)[ItemsPerThread]) + { + hipcub::BlockRadixSort sort; + sort.Sort(keys, values); + } + + template + HIPCUB_DEVICE static void sort(benchmark_utils::custom_type (&keys)[ItemsPerThread]) + { + using custom_t = benchmark_utils::custom_type; + hipcub::BlockRadixSort sort; + sort.Sort(keys, benchmark_utils::custom_type_decomposer{}); + } + + template + HIPCUB_DEVICE static void sort(benchmark_utils::custom_type (&keys)[ItemsPerThread], + benchmark_utils::custom_type (&values)[ItemsPerThread]) + { + using custom_t = benchmark_utils::custom_type; + hipcub::BlockRadixSort sort; + sort.Sort(keys, values, benchmark_utils::custom_type_decomposer{}); + } }; -template -__global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T *input, - T *output) { - const unsigned int lid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; +struct helper_blocked_striped +{ + template + HIPCUB_DEVICE static void + load(int linear_id, InputIteratorT block_iter, T (&items)[ItemsPerThread]) + { + hipcub::LoadDirectBlocked(linear_id, block_iter, items); + } - T keys[ItemsPerThread]; - Helper::template load(lid, input + block_offset, keys); + template + HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread]) + { + hipcub::BlockRadixSort sort; + sort.SortBlockedToStriped(keys); + } -#pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - Helper::template sort(keys); - } + template + HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread], T (&values)[ItemsPerThread]) + { + hipcub::BlockRadixSort sort; + sort.SortBlockedToStriped(keys, values); + } - hipcub::StoreDirectStriped(lid, output + block_offset, keys); -} + template + HIPCUB_DEVICE static void sort(benchmark_utils::custom_type (&keys)[ItemsPerThread]) + { + using custom_t = benchmark_utils::custom_type; + hipcub::BlockRadixSort sort; + sort.SortBlockedToStriped(keys, benchmark_utils::custom_type_decomposer{}); + } -template -__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T *input, - T *output) { - const unsigned int lid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; + template + HIPCUB_DEVICE static void sort(benchmark_utils::custom_type (&keys)[ItemsPerThread], + benchmark_utils::custom_type (&values)[ItemsPerThread]) + { + using custom_t = benchmark_utils::custom_type; + hipcub::BlockRadixSort sort; + sort.SortBlockedToStriped(keys, + values, + benchmark_utils::custom_type_decomposer{}); + } +}; - T keys[ItemsPerThread]; - T values[ItemsPerThread]; - Helper::template load(lid, input + block_offset, keys); +template +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, T* output) +{ + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; - for (unsigned int i = 0; i < ItemsPerThread; i++) { - values[i] = keys[i] + T(1); - } + T keys[ItemsPerThread]; + Helper::template load(lid, input + block_offset, keys); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - Helper::template sort(keys, values); - } + for(unsigned int trial = 0; trial < Trials; trial++) + { + Helper::template sort(keys); + } - for (unsigned int i = 0; i < ItemsPerThread; i++) { - keys[i] += values[i]; - } + hipcub::StoreDirectStriped(lid, output + block_offset, keys); +} + +template +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, T* output) +{ + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; + + T keys[ItemsPerThread]; + T values[ItemsPerThread]; + Helper::template load(lid, input + block_offset, keys); + + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + values[i] = keys[i] + T(1); + } - hipcub::StoreDirectStriped(lid, output + block_offset, keys); +#pragma nounroll + for(unsigned int trial = 0; trial < Trials; trial++) + { + Helper::template sort(keys, values); + } + + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + keys[i] += values[i]; + } + + hipcub::StoreDirectStriped(lid, output + block_offset, keys); } -template -void run_benchmark(benchmark::State &state, benchmark_kinds benchmark_kind, - hipStream_t stream, size_t N) { - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - - std::vector input; - if (std::is_floating_point::value) { - input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); - } else { - input = benchmark_utils::get_random_data( - size, std::numeric_limits::min(), std::numeric_limits::max()); - } - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - if (benchmark_kind == benchmark_kinds::sort_keys) { - sort_keys_kernel - <<>>( - d_input, d_output); - } else if (benchmark_kind == benchmark_kinds::sort_pairs) { - sort_pairs_kernel - <<>>( - d_input, d_output); +template +void run_benchmark(benchmark::State& state, + benchmark_kinds benchmark_kind, + hipStream_t stream, + size_t N) +{ + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input; + if(std::is_floating_point::value) + { + input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); + } else + { + input = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); } - HIP_CHECK(hipPeekAtLastError()); + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + if(benchmark_kind == benchmark_kinds::sort_keys) + { + sort_keys_kernel + <<>>(d_input, d_output); + } else if(benchmark_kind == benchmark_kinds::sort_pairs) + { + sort_pairs_kernel + <<>>(d_input, d_output); + } + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - std::string("block_radix_sort.sub_algorithm_name:" + \ - name) \ - .c_str(), \ - &run_benchmark, benchmark_kind, stream, size) +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_radix_sort.sub_algorithm_name:" \ + + name) \ + .c_str(), \ + &run_benchmark, \ + benchmark_kind, \ + stream, \ + size) // clang-format off #define BENCHMARK_TYPE(type, block) \ @@ -225,57 +252,61 @@ void run_benchmark(benchmark::State &state, benchmark_kinds benchmark_kind, CREATE_BENCHMARK(type, block, 8) // clang-format on -template -void add_benchmarks(benchmark_kinds benchmark_kind, const std::string &name, - std::vector &benchmarks, - hipStream_t stream, size_t size) { - using custom_int_t = benchmark_utils::custom_type; - - std::vector bs = { - BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(int, 128), - BENCHMARK_TYPE(int, 192), BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(int, 320), BENCHMARK_TYPE(int, 512), - - BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(int8_t, 128), - BENCHMARK_TYPE(int8_t, 192), BENCHMARK_TYPE(int8_t, 256), - BENCHMARK_TYPE(int8_t, 320), BENCHMARK_TYPE(int8_t, 512), - - BENCHMARK_TYPE(long long, 64), BENCHMARK_TYPE(long long, 128), - BENCHMARK_TYPE(long long, 192), BENCHMARK_TYPE(long long, 256), - BENCHMARK_TYPE(long long, 320), BENCHMARK_TYPE(long long, 512), - - BENCHMARK_TYPE(custom_int_t, 64), BENCHMARK_TYPE(custom_int_t, 128), - BENCHMARK_TYPE(custom_int_t, 192), BENCHMARK_TYPE(custom_int_t, 256), - BENCHMARK_TYPE(custom_int_t, 320), BENCHMARK_TYPE(custom_int_t, 512), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +template +void add_benchmarks(benchmark_kinds benchmark_kind, + const std::string& name, + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + using custom_int_t = benchmark_utils::custom_type; + + std::vector bs = { + BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(int, 128), + BENCHMARK_TYPE(int, 192), BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(int, 320), BENCHMARK_TYPE(int, 512), + + BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(int8_t, 128), + BENCHMARK_TYPE(int8_t, 192), BENCHMARK_TYPE(int8_t, 256), + BENCHMARK_TYPE(int8_t, 320), BENCHMARK_TYPE(int8_t, 512), + + BENCHMARK_TYPE(long long, 64), BENCHMARK_TYPE(long long, 128), + BENCHMARK_TYPE(long long, 192), BENCHMARK_TYPE(long long, 256), + BENCHMARK_TYPE(long long, 320), BENCHMARK_TYPE(long long, 512), + + BENCHMARK_TYPE(custom_int_t, 64), BENCHMARK_TYPE(custom_int_t, 128), + BENCHMARK_TYPE(custom_int_t, 192), BENCHMARK_TYPE(custom_int_t, 256), + BENCHMARK_TYPE(custom_int_t, 320), BENCHMARK_TYPE(custom_int_t, 512), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_radix_sort" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - // clang-format off +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_radix_sort" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + // clang-format off add_benchmarks( benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, size); add_benchmarks( @@ -284,22 +315,25 @@ int main(int argc, char *argv[]) { benchmark_kinds::sort_keys, "sort_to_striped(keys)", benchmarks, stream, size); add_benchmarks( benchmark_kinds::sort_pairs, "sort_to_striped(keys, values)", benchmarks, stream, size); - // clang-format on - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); + // clang-format on + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp index 72b9ed7c..2d0aba92 100644 --- a/benchmark/benchmark_block_reduce.cpp +++ b/benchmark/benchmark_block_reduce.cpp @@ -30,167 +30,194 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__global__ __launch_bounds__(BlockSize) void kernel(const T *input, T *output) { - Runner::template run(input, output); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) +{ + Runner::template run(input, output); } -template struct reduce { - template - __device__ static void run(const T *input, T *output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; +template +struct reduce +{ + template + __device__ static void run(const T* input, T* output) + { + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - T values[ItemsPerThread]; - T reduced_value; - for (unsigned int k = 0; k < ItemsPerThread; k++) { - values[k] = input[i * ItemsPerThread + k]; - } + T values[ItemsPerThread]; + T reduced_value; + for(unsigned int k = 0; k < ItemsPerThread; k++) + { + values[k] = input[i * ItemsPerThread + k]; + } - using breduce_t = hipcub::BlockReduce; - __shared__ typename breduce_t::TempStorage storage; + using breduce_t = hipcub::BlockReduce; + __shared__ typename breduce_t::TempStorage storage; #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - reduced_value = breduce_t(storage).Reduce(values, hipcub::Sum()); - values[0] = reduced_value; - } - - if (hipThreadIdx_x == 0) { - output[hipBlockIdx_x] = reduced_value; + for(unsigned int trial = 0; trial < Trials; trial++) + { + reduced_value = breduce_t(storage).Reduce(values, hipcub::Sum()); + values[0] = reduced_value; + } + + if(hipThreadIdx_x == 0) + { + output[hipBlockIdx_x] = reduced_value; + } } - } }; -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { - // Make sure size is a multiple of BlockSize - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - // Allocate and fill memory - std::vector input(size, T(1)); - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel), - dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, - d_output); - HIP_CHECK(hipPeekAtLastError()); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + // Make sure size is a multiple of BlockSize + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); + // Allocate and fill memory + std::vector input(size, T(1)); + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); - state.SetItemsProcessed(state.iterations() * size * Trials); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); + state.SetItemsProcessed(state.iterations() * size * Trials); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - std::string("block_reduce.method_name:" + method_name) \ - .c_str(), \ - &run_benchmark, stream, size) - -#define BENCHMARK_TYPE(type, block) \ - CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ - CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ - CREATE_BENCHMARK(type, block, 8), CREATE_BENCHMARK(type, block, 11), \ - CREATE_BENCHMARK(type, block, 16) - -template -void add_benchmarks(std::vector &benchmarks, - const std::string &method_name, - const std::string &algorithm_name, hipStream_t stream, - size_t size) { - - std::vector new_benchmarks = { - // When block size is less than or equal to warp size - BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(float, 64), - BENCHMARK_TYPE(double, 64), BENCHMARK_TYPE(int8_t, 64), - BENCHMARK_TYPE(uint8_t, 64), - - BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(float, 256), - BENCHMARK_TYPE(double, 256), BENCHMARK_TYPE(int8_t, 256), - BENCHMARK_TYPE(uint8_t, 256), - }; - benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), - new_benchmarks.end()); +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_reduce.method_name:" + method_name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, block) \ + CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ + CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ + CREATE_BENCHMARK(type, block, 8), CREATE_BENCHMARK(type, block, 11), \ + CREATE_BENCHMARK(type, block, 16) + +template +void add_benchmarks(std::vector& benchmarks, + const std::string& method_name, + const std::string& algorithm_name, + hipStream_t stream, + size_t size) +{ + + std::vector new_benchmarks = { + // When block size is less than or equal to warp size + BENCHMARK_TYPE(int, 64), + BENCHMARK_TYPE(float, 64), + BENCHMARK_TYPE(double, 64), + BENCHMARK_TYPE(int8_t, 64), + BENCHMARK_TYPE(uint8_t, 64), + + BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(float, 256), + BENCHMARK_TYPE(double, 256), + BENCHMARK_TYPE(int8_t, 256), + BENCHMARK_TYPE(uint8_t, 256), + }; + benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_reduce" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - // using_warp_scan - using reduce_uwr_t = - reduce; - add_benchmarks(benchmarks, "reduce", - "BLOCK_REDUCE_WARP_REDUCTIONS", stream, size); - // raking reduce - using reduce_rr_t = reduce; - add_benchmarks(benchmarks, "reduce", "BLOCK_REDUCE_RAKING", - stream, size); - // raking reduce commutative only - using reduce_rrco_t = reduce< - hipcub::BlockReduceAlgorithm::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY>; - add_benchmarks(benchmarks, "reduce", - "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY", stream, - size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_reduce" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + // using_warp_scan + using reduce_uwr_t = reduce; + add_benchmarks(benchmarks, + "reduce", + "BLOCK_REDUCE_WARP_REDUCTIONS", + stream, + size); + // raking reduce + using reduce_rr_t = reduce; + add_benchmarks(benchmarks, "reduce", "BLOCK_REDUCE_RAKING", stream, size); + // raking reduce commutative only + using reduce_rrco_t + = reduce; + add_benchmarks(benchmarks, + "reduce", + "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY", + stream, + size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp index 9f60a522..6769fd47 100644 --- a/benchmark/benchmark_block_run_length_decode.cpp +++ b/benchmark/benchmark_block_run_length_decode.cpp @@ -30,180 +30,208 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__global__ __launch_bounds__(BlockSize) void block_run_length_decode_kernel( - const ItemT *d_run_items, const OffsetT *d_run_offsets, - ItemT *d_decoded_items, bool enable_store = false) { - using BlockRunLengthDecodeT = - hipcub::BlockRunLengthDecode; - - ItemT run_items[RunsPerThread]; - OffsetT run_offsets[RunsPerThread]; - - const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; - hipcub::LoadDirectBlocked(global_thread_idx, d_run_items, run_items); - hipcub::LoadDirectBlocked(global_thread_idx, d_run_offsets, run_offsets); - - BlockRunLengthDecodeT block_run_length_decode(run_items, run_offsets); - - const OffsetT total_decoded_size = - d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] - - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; +template +__global__ + __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT* d_run_items, + const OffsetT* d_run_offsets, + ItemT* d_decoded_items, + bool enable_store = false) +{ + using BlockRunLengthDecodeT + = hipcub::BlockRunLengthDecode; + + ItemT run_items[RunsPerThread]; + OffsetT run_offsets[RunsPerThread]; + + const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; + hipcub::LoadDirectBlocked(global_thread_idx, d_run_items, run_items); + hipcub::LoadDirectBlocked(global_thread_idx, d_run_offsets, run_offsets); + + BlockRunLengthDecodeT block_run_length_decode(run_items, run_offsets); + + const OffsetT total_decoded_size + = d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] + - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; #pragma nounroll - for (unsigned i = 0; i < Trials; ++i) { - OffsetT decoded_window_offset = 0; - while (decoded_window_offset < total_decoded_size) { - ItemT decoded_items[DecodedItemsPerThread]; - block_run_length_decode.RunLengthDecode(decoded_items, - decoded_window_offset); - - if (enable_store) { - hipcub::StoreDirectBlocked(global_thread_idx, - d_decoded_items + decoded_window_offset, - decoded_items); - } - - decoded_window_offset += BlockSize * DecodedItemsPerThread; + for(unsigned i = 0; i < Trials; ++i) + { + OffsetT decoded_window_offset = 0; + while(decoded_window_offset < total_decoded_size) + { + ItemT decoded_items[DecodedItemsPerThread]; + block_run_length_decode.RunLengthDecode(decoded_items, decoded_window_offset); + + if(enable_store) + { + hipcub::StoreDirectBlocked(global_thread_idx, + d_decoded_items + decoded_window_offset, + decoded_items); + } + + decoded_window_offset += BlockSize * DecodedItemsPerThread; + } } - } } -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { - constexpr auto runs_per_block = BlockSize * RunsPerThread; - const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength); - const auto num_runs = - runs_per_block * - ((target_num_runs + runs_per_block - 1) / runs_per_block); - - std::vector run_items(num_runs); - std::vector run_offsets(num_runs + 1); - - std::default_random_engine prng(std::random_device{}()); - using ItemDistribution = - std::conditional_t::value, - std::uniform_int_distribution, - std::uniform_real_distribution>; - ItemDistribution run_item_dist(0, 100); - std::uniform_int_distribution run_length_dist(MinRunLength, - MaxRunLength); - - for (size_t i = 0; i < num_runs; ++i) { - run_items[i] = run_item_dist(prng); - } - for (size_t i = 1; i < num_runs + 1; ++i) { - const OffsetT next_run_length = run_length_dist(prng); - run_offsets[i] = run_offsets[i - 1] + next_run_length; - } - const OffsetT output_length = run_offsets.back(); - - ItemT *d_run_items{}; - HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT))); - HIP_CHECK(hipMemcpy(d_run_items, run_items.data(), - run_items.size() * sizeof(ItemT), hipMemcpyHostToDevice)); - - OffsetT *d_run_offsets{}; - HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT))); - HIP_CHECK(hipMemcpy(d_run_offsets, run_offsets.data(), - run_offsets.size() * sizeof(OffsetT), - hipMemcpyHostToDevice)); - - ItemT *d_output{}; - HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT))); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL(HIP_KERNEL_NAME(block_run_length_decode_kernel< - ItemT, OffsetT, BlockSize, RunsPerThread, - DecodedItemsPerThread, Trials>), - dim3(num_runs / runs_per_block), dim3(BlockSize), 0, - stream, d_run_items, d_run_offsets, d_output); - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * output_length * sizeof(ItemT) * - Trials); - state.SetItemsProcessed(state.iterations() * output_length * Trials); - - HIP_CHECK(hipFree(d_run_items)); - HIP_CHECK(hipFree(d_run_offsets)); - HIP_CHECK(hipFree(d_output)); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + constexpr auto runs_per_block = BlockSize * RunsPerThread; + const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength); + const auto num_runs + = runs_per_block * ((target_num_runs + runs_per_block - 1) / runs_per_block); + + std::vector run_items(num_runs); + std::vector run_offsets(num_runs + 1); + + std::default_random_engine prng(std::random_device{}()); + using ItemDistribution = std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution>; + ItemDistribution run_item_dist(0, 100); + std::uniform_int_distribution run_length_dist(MinRunLength, MaxRunLength); + + for(size_t i = 0; i < num_runs; ++i) + { + run_items[i] = run_item_dist(prng); + } + for(size_t i = 1; i < num_runs + 1; ++i) + { + const OffsetT next_run_length = run_length_dist(prng); + run_offsets[i] = run_offsets[i - 1] + next_run_length; + } + const OffsetT output_length = run_offsets.back(); + + ItemT* d_run_items{}; + HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT))); + HIP_CHECK(hipMemcpy(d_run_items, + run_items.data(), + run_items.size() * sizeof(ItemT), + hipMemcpyHostToDevice)); + + OffsetT* d_run_offsets{}; + HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT))); + HIP_CHECK(hipMemcpy(d_run_offsets, + run_offsets.data(), + run_offsets.size() * sizeof(OffsetT), + hipMemcpyHostToDevice)); + + ItemT* d_output{}; + HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT))); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(block_run_length_decode_kernel), + dim3(num_runs / runs_per_block), + dim3(BlockSize), + 0, + stream, + d_run_items, + d_run_offsets, + d_output); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * output_length * sizeof(ItemT) * Trials); + state.SetItemsProcessed(state.iterations() * output_length * Trials); + + HIP_CHECK(hipFree(d_run_items)); + HIP_CHECK(hipFree(d_run_offsets)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ - benchmark::RegisterBenchmark( \ - std::string("block_run_length_decode.") \ - .c_str(), \ - &run_benchmark, stream, size) - -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_run_length_decode" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks{ - CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4), - CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4), - CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4), - CREATE_BENCHMARK(int, int, 1, 100, 128, 2, 4), - CREATE_BENCHMARK(int, int, 1, 500, 128, 2, 4), - CREATE_BENCHMARK(int, int, 1, 1000, 128, 2, 4), - CREATE_BENCHMARK(int, int, 1, 5000, 128, 2, 4), - - CREATE_BENCHMARK(double, long long, 1, 5, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 10, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 50, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4)}; - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ + benchmark::RegisterBenchmark( \ + std::string("block_run_length_decode.") \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_run_length_decode" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks{ + CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 100, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 500, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 1000, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 5000, 128, 2, 4), + + CREATE_BENCHMARK(double, long long, 1, 5, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 10, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 50, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4)}; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp index 28c0264c..d453d321 100644 --- a/benchmark/benchmark_block_scan.cpp +++ b/benchmark/benchmark_block_scan.cpp @@ -29,114 +29,133 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__global__ __launch_bounds__(BlockSize) void kernel(const T *input, T *output, - const T init) { - Runner::template run(input, output, - init); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output, const T init) +{ + Runner::template run(input, output, init); } -template struct inclusive_scan { - template - __device__ static void run(const T *input, T *output, const T init) { - (void)init; - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; +template +struct inclusive_scan +{ + template + __device__ static void run(const T* input, T* output, const T init) + { + (void)init; + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - T values[ItemsPerThread]; - for (unsigned int k = 0; k < ItemsPerThread; k++) { - values[k] = input[i * ItemsPerThread + k]; - } + T values[ItemsPerThread]; + for(unsigned int k = 0; k < ItemsPerThread; k++) + { + values[k] = input[i * ItemsPerThread + k]; + } - using bscan_t = hipcub::BlockScan; - __shared__ typename bscan_t::TempStorage storage; + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage storage; #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - bscan_t(storage).InclusiveScan(values, values, hipcub::Sum()); - } - - for (unsigned int k = 0; k < ItemsPerThread; k++) { - output[i * ItemsPerThread + k] = values[k]; + for(unsigned int trial = 0; trial < Trials; trial++) + { + bscan_t(storage).InclusiveScan(values, values, hipcub::Sum()); + } + + for(unsigned int k = 0; k < ItemsPerThread; k++) + { + output[i * ItemsPerThread + k] = values[k]; + } } - } }; -template struct exclusive_scan { - template - __device__ static void run(const T *input, T *output, const T init) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; +template +struct exclusive_scan +{ + template + __device__ static void run(const T* input, T* output, const T init) + { + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - T values[ItemsPerThread]; - for (unsigned int k = 0; k < ItemsPerThread; k++) { - values[k] = input[i * ItemsPerThread + k]; - } + T values[ItemsPerThread]; + for(unsigned int k = 0; k < ItemsPerThread; k++) + { + values[k] = input[i * ItemsPerThread + k]; + } - using bscan_t = hipcub::BlockScan; - __shared__ typename bscan_t::TempStorage storage; + using bscan_t = hipcub::BlockScan; + __shared__ typename bscan_t::TempStorage storage; #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - bscan_t(storage).ExclusiveScan(values, values, init, hipcub::Sum()); - } - - for (unsigned int k = 0; k < ItemsPerThread; k++) { - output[i * ItemsPerThread + k] = values[k]; + for(unsigned int trial = 0; trial < Trials; trial++) + { + bscan_t(storage).ExclusiveScan(values, values, init, hipcub::Sum()); + } + + for(unsigned int k = 0; k < ItemsPerThread; k++) + { + output[i * ItemsPerThread + k] = values[k]; + } } - } }; -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { - // Make sure size is a multiple of BlockSize - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - // Allocate and fill memory - std::vector input(size, T(1)); - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel), - dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, - d_output, input[0]); - HIP_CHECK(hipPeekAtLastError()); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + // Make sure size is a multiple of BlockSize + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); + // Allocate and fill memory + std::vector input(size, T(1)); + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); - state.SetItemsProcessed(state.iterations() * size * Trials); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + input[0]); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); + state.SetItemsProcessed(state.iterations() * size * Trials); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_scan.method_name:") + \ - method_name) \ - .c_str(), \ - &run_benchmark, stream, size) +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark((std::string("block_scan.method_name:") \ + + method_name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) // clang-format off #define BENCHMARK_TYPE(type, block) \ @@ -148,62 +167,64 @@ void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { CREATE_BENCHMARK(type, block, 16) // clang-format on -template -void add_benchmarks(std::vector &benchmarks, - const std::string &method_name, - const std::string &algorithm_name, hipStream_t stream, - size_t size) { - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - std::vector new_benchmarks = { - // When block size is less than or equal to warp size - BENCHMARK_TYPE(int, 64), - BENCHMARK_TYPE(float, 64), - BENCHMARK_TYPE(double, 64), - BENCHMARK_TYPE(uint8_t, 64), - - BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(float, 256), - BENCHMARK_TYPE(double, 256), - BENCHMARK_TYPE(uint8_t, 256), - - CREATE_BENCHMARK(custom_float2, 256, 1), - CREATE_BENCHMARK(custom_float2, 256, 4), - CREATE_BENCHMARK(custom_float2, 256, 8), - - CREATE_BENCHMARK(custom_double2, 256, 1), - CREATE_BENCHMARK(custom_double2, 256, 4), - CREATE_BENCHMARK(custom_double2, 256, 8), - }; - benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), - new_benchmarks.end()); +template +void add_benchmarks(std::vector& benchmarks, + const std::string& method_name, + const std::string& algorithm_name, + hipStream_t stream, + size_t size) +{ + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + std::vector new_benchmarks = { + // When block size is less than or equal to warp size + BENCHMARK_TYPE(int, 64), + BENCHMARK_TYPE(float, 64), + BENCHMARK_TYPE(double, 64), + BENCHMARK_TYPE(uint8_t, 64), + + BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(float, 256), + BENCHMARK_TYPE(double, 256), + BENCHMARK_TYPE(uint8_t, 256), + + CREATE_BENCHMARK(custom_float2, 256, 1), + CREATE_BENCHMARK(custom_float2, 256, 4), + CREATE_BENCHMARK(custom_float2, 256, 8), + + CREATE_BENCHMARK(custom_double2, 256, 1), + CREATE_BENCHMARK(custom_double2, 256, 4), + CREATE_BENCHMARK(custom_double2, 256, 8), + }; + benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_scan" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - // clang-format off +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_scan" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + // clang-format off add_benchmarks>( benchmarks, "inclusive_scan", "BLOCK_SCAN_RAKING", stream, size); add_benchmarks>( @@ -216,22 +237,25 @@ int main(int argc, char *argv[]) { benchmarks, "exclusive_scan", "BLOCK_SCAN_RAKING_MEMOIZE", stream, size); add_benchmarks>( benchmarks, "exclusive_scan", "BLOCK_SCAN_WARP_SCANS", stream, size); - // clang-format on - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); + // clang-format on + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_block_shuffle.cpp b/benchmark/benchmark_block_shuffle.cpp index e7f4fd13..4ba9fb0e 100644 --- a/benchmark/benchmark_block_shuffle.cpp +++ b/benchmark/benchmark_block_shuffle.cpp @@ -28,270 +28,309 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__global__ __launch_bounds__(BlockSize) void kernel(const T *input, T *output) { - Runner::template run(input, output); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) +{ + Runner::template run(input, output); } -struct offset { - template - __device__ static void run(const T *input, T *output) { - const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; +struct offset +{ + template + __device__ static void run(const T* input, T* output) + { + const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; - T value = input[tid]; + T value = input[tid]; - using bshuffle_t = hipcub::BlockShuffle; - __shared__ typename bshuffle_t::TempStorage storage; + using bshuffle_t = hipcub::BlockShuffle; + __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - bshuffle_t(storage).Offset(value, value, 1); + for(unsigned int trial = 0; trial < Trials; trial++) + { + bshuffle_t(storage).Offset(value, value, 1); - // sync is required because of loop since - // temporary storage is accessed next iteration - __syncthreads(); - } + // sync is required because of loop since + // temporary storage is accessed next iteration + __syncthreads(); + } - output[tid] = value; - } + output[tid] = value; + } - static constexpr bool uses_ipt = false; + static constexpr bool uses_ipt = false; }; -struct rotate { - template - __device__ static void run(const T *input, T *output) { - const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; +struct rotate +{ + template + __device__ static void run(const T* input, T* output) + { + const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; - T value = input[tid]; + T value = input[tid]; - using bshuffle_t = hipcub::BlockShuffle; - __shared__ typename bshuffle_t::TempStorage storage; + using bshuffle_t = hipcub::BlockShuffle; + __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - bshuffle_t(storage).Rotate(value, value, 1); + for(unsigned int trial = 0; trial < Trials; trial++) + { + bshuffle_t(storage).Rotate(value, value, 1); - // sync is required because of loop since - // temporary storage is accessed next iteration - __syncthreads(); - } + // sync is required because of loop since + // temporary storage is accessed next iteration + __syncthreads(); + } - output[tid] = value; - } + output[tid] = value; + } - static constexpr bool uses_ipt = false; + static constexpr bool uses_ipt = false; }; -struct up { - template - __device__ static void run(const T *input, T *output) { - const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; +struct up +{ + template + __device__ static void run(const T* input, T* output) + { + const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; - T values[ItemsPerThread]; - for (unsigned int i = 0; i < ItemsPerThread; i++) { - values[i] = input[ItemsPerThread * tid + i]; - } + T values[ItemsPerThread]; + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + values[i] = input[ItemsPerThread * tid + i]; + } - using bshuffle_t = hipcub::BlockShuffle; - __shared__ typename bshuffle_t::TempStorage storage; + using bshuffle_t = hipcub::BlockShuffle; + __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - bshuffle_t(storage).Up(values, values); - - // sync is required because of loop since - // temporary storage is accessed next iteration - __syncthreads(); - } - - for (unsigned int i = 0; i < ItemsPerThread; i++) { - output[ItemsPerThread * tid + i] = values[i]; + for(unsigned int trial = 0; trial < Trials; trial++) + { + bshuffle_t(storage).Up(values, values); + + // sync is required because of loop since + // temporary storage is accessed next iteration + __syncthreads(); + } + + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + output[ItemsPerThread * tid + i] = values[i]; + } } - } - static constexpr bool uses_ipt = true; + static constexpr bool uses_ipt = true; }; -struct down { - template - __device__ static void run(const T *input, T *output) { - const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; +struct down +{ + template + __device__ static void run(const T* input, T* output) + { + const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; - T values[ItemsPerThread]; - for (unsigned int i = 0; i < ItemsPerThread; i++) { - values[i] = input[ItemsPerThread * tid + i]; - } + T values[ItemsPerThread]; + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + values[i] = input[ItemsPerThread * tid + i]; + } - using bshuffle_t = hipcub::BlockShuffle; - __shared__ typename bshuffle_t::TempStorage storage; + using bshuffle_t = hipcub::BlockShuffle; + __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - bshuffle_t(storage).Down(values, values); - - // sync is required because of loop since - // temporary storage is accessed next iteration - __syncthreads(); - } - - for (unsigned int i = 0; i < ItemsPerThread; i++) { - output[ItemsPerThread * tid + i] = values[i]; + for(unsigned int trial = 0; trial < Trials; trial++) + { + bshuffle_t(storage).Down(values, values); + + // sync is required because of loop since + // temporary storage is accessed next iteration + __syncthreads(); + } + + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + output[ItemsPerThread * tid + i] = values[i]; + } } - } - static constexpr bool uses_ipt = true; + static constexpr bool uses_ipt = true; }; -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - - std::vector input(size, T(1)); - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel), - dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, - d_output); - HIP_CHECK(hipPeekAtLastError()); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input(size, T(1)); + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK_IPT(BS, IPT) \ - benchmark::RegisterBenchmark( \ - ("block_shuffle.sub_algorithm_name:" + \ - name) \ - .c_str(), \ - &run_benchmark, stream, size) - -#define CREATE_BENCHMARK(BS) \ - benchmark::RegisterBenchmark( \ - ("block_shuffle.sub_algorithm_name:" + name) \ - .c_str(), \ - &run_benchmark, stream, size) - -template = true> -void add_benchmarks_type( - const std::string &name, - std::vector &benchmarks, - hipStream_t stream, size_t size, const std::string &type_name) { - std::vector bs = { - CREATE_BENCHMARK_IPT(256, 1), CREATE_BENCHMARK_IPT(256, 3), - CREATE_BENCHMARK_IPT(256, 4), CREATE_BENCHMARK_IPT(256, 8), - CREATE_BENCHMARK_IPT(256, 16), CREATE_BENCHMARK_IPT(256, 32), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_BENCHMARK_IPT(BS, IPT) \ + benchmark::RegisterBenchmark( \ + ("block_shuffle.sub_algorithm_name:" + name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +#define CREATE_BENCHMARK(BS) \ + benchmark::RegisterBenchmark(("block_shuffle.sub_algorithm_name:" + name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +template = true> +void add_benchmarks_type(const std::string& name, + std::vector& benchmarks, + hipStream_t stream, + size_t size, + const std::string& type_name) +{ + std::vector bs = { + CREATE_BENCHMARK_IPT(256, 1), + CREATE_BENCHMARK_IPT(256, 3), + CREATE_BENCHMARK_IPT(256, 4), + CREATE_BENCHMARK_IPT(256, 8), + CREATE_BENCHMARK_IPT(256, 16), + CREATE_BENCHMARK_IPT(256, 32), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -template = true> -void add_benchmarks_type( - const std::string &name, - std::vector &benchmarks, - hipStream_t stream, size_t size, const std::string &type_name) { - std::vector bs = { - CREATE_BENCHMARK(256), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +template = true> +void add_benchmarks_type(const std::string& name, + std::vector& benchmarks, + hipStream_t stream, + size_t size, + const std::string& type_name) +{ + std::vector bs = { + CREATE_BENCHMARK(256), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_BENCHMARKS(T) \ - add_benchmarks_type(name, benchmarks, stream, size, #T) - -template -void add_benchmarks(const std::string &name, - std::vector &benchmarks, - hipStream_t stream, size_t size) { - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - CREATE_BENCHMARKS(int); - CREATE_BENCHMARKS(float); - CREATE_BENCHMARKS(double); - CREATE_BENCHMARKS(int8_t); - CREATE_BENCHMARKS(long long); - CREATE_BENCHMARKS(custom_float2); - CREATE_BENCHMARKS(custom_double2); +#define CREATE_BENCHMARKS(T) add_benchmarks_type(name, benchmarks, stream, size, #T) + +template +void add_benchmarks(const std::string& name, + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + CREATE_BENCHMARKS(int); + CREATE_BENCHMARKS(float); + CREATE_BENCHMARKS(double); + CREATE_BENCHMARKS(int8_t); + CREATE_BENCHMARKS(long long); + CREATE_BENCHMARKS(custom_float2); + CREATE_BENCHMARKS(custom_double2); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_block_shuffle" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks("offset", benchmarks, stream, size); - add_benchmarks("rotate", benchmarks, stream, size); - add_benchmarks("up", benchmarks, stream, size); - add_benchmarks("down", benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_shuffle" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks("offset", benchmarks, stream, size); + add_benchmarks("rotate", benchmarks, stream, size); + add_benchmarks("up", benchmarks, stream, size); + add_benchmarks("down", benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_adjacent_difference.cpp b/benchmark/benchmark_device_adjacent_difference.cpp index 9d1a087f..07551637 100644 --- a/benchmark/benchmark_device_adjacent_difference.cpp +++ b/benchmark/benchmark_device_adjacent_difference.cpp @@ -39,138 +39,167 @@ #include #include -namespace { +namespace +{ #ifndef DEFAULT_N constexpr std::size_t DEFAULT_N = 1024 * 1024 * 128; #endif -constexpr unsigned int batch_size = 10; +constexpr unsigned int batch_size = 10; constexpr unsigned int warmup_size = 5; -template +template auto dispatch_adjacent_difference(std::true_type /*left*/, std::true_type /*copy*/, - void *const temporary_storage, - std::size_t &storage_size, - const InputIt input, const OutputIt output, - Args &&... args) { - return ::hipcub::DeviceAdjacentDifference::SubtractLeftCopy( - temporary_storage, storage_size, input, output, - std::forward(args)...); + void* const temporary_storage, + std::size_t& storage_size, + const InputIt input, + const OutputIt output, + Args&&... args) +{ + return ::hipcub::DeviceAdjacentDifference::SubtractLeftCopy(temporary_storage, + storage_size, + input, + output, + std::forward(args)...); } -template +template auto dispatch_adjacent_difference(std::false_type /*left*/, std::true_type /*copy*/, - void *const temporary_storage, - std::size_t &storage_size, - const InputIt input, const OutputIt output, - Args &&... args) { - return ::hipcub::DeviceAdjacentDifference::SubtractRightCopy( - temporary_storage, storage_size, input, output, - std::forward(args)...); + void* const temporary_storage, + std::size_t& storage_size, + const InputIt input, + const OutputIt output, + Args&&... args) +{ + return ::hipcub::DeviceAdjacentDifference::SubtractRightCopy(temporary_storage, + storage_size, + input, + output, + std::forward(args)...); } -template +template auto dispatch_adjacent_difference(std::true_type /*left*/, std::false_type /*copy*/, - void *const temporary_storage, - std::size_t &storage_size, + void* const temporary_storage, + std::size_t& storage_size, const InputIt input, - const OutputIt /*output*/, Args &&... args) { - return ::hipcub::DeviceAdjacentDifference::SubtractLeft( - temporary_storage, storage_size, input, std::forward(args)...); + const OutputIt /*output*/, + Args&&... args) +{ + return ::hipcub::DeviceAdjacentDifference::SubtractLeft(temporary_storage, + storage_size, + input, + std::forward(args)...); } -template +template auto dispatch_adjacent_difference(std::false_type /*left*/, std::false_type /*copy*/, - void *const temporary_storage, - std::size_t &storage_size, + void* const temporary_storage, + std::size_t& storage_size, const InputIt input, - const OutputIt /*output*/, Args &&... args) { - return ::hipcub::DeviceAdjacentDifference::SubtractRight( - temporary_storage, storage_size, input, std::forward(args)...); + const OutputIt /*output*/, + Args&&... args) +{ + return ::hipcub::DeviceAdjacentDifference::SubtractRight(temporary_storage, + storage_size, + input, + std::forward(args)...); } -template -void run_benchmark(benchmark::State &state, const std::size_t size, - const hipStream_t stream) { - using output_type = T; - - // Generate data - const std::vector input = - benchmark_utils::get_random_data(size, 1, 100); - - T *d_input; - output_type *d_output = nullptr; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); - HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), - hipMemcpyHostToDevice)); - - if (copy) { - HIP_CHECK(hipMalloc(&d_output, size * sizeof(output_type))); - } - - static constexpr std::integral_constant left_tag; - static constexpr std::integral_constant copy_tag; - - // Allocate temporary storage - std::size_t temp_storage_size{}; - void *d_temp_storage = nullptr; - - const auto launch = [&] { - return dispatch_adjacent_difference(left_tag, copy_tag, d_temp_storage, - temp_storage_size, d_input, d_output, - size, hipcub::Sum{}, stream); - }; - HIP_CHECK(launch()); - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size)); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { +template +void run_benchmark(benchmark::State& state, const std::size_t size, const hipStream_t stream) +{ + using output_type = T; + + // Generate data + const std::vector input = benchmark_utils::get_random_data(size, 1, 100); + + T* d_input; + output_type* d_output = nullptr; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); + HIP_CHECK( + hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); + + if(copy) + { + HIP_CHECK(hipMalloc(&d_output, size * sizeof(output_type))); + } + + static constexpr std::integral_constant left_tag; + static constexpr std::integral_constant copy_tag; + + // Allocate temporary storage + std::size_t temp_storage_size{}; + void* d_temp_storage = nullptr; + + const auto launch = [&] + { + return dispatch_adjacent_difference(left_tag, + copy_tag, + d_temp_storage, + temp_storage_size, + d_input, + d_output, + size, + hipcub::Sum{}, + stream); + }; HIP_CHECK(launch()); - } - HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size)); - // Run - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(launch()); + } + HIP_CHECK(hipDeviceSynchronize()); + + // Run + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(launch()); + } + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(launch()); + hipFree(d_input); + if(copy) + { + hipFree(d_output); } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - hipFree(d_input); - if (copy) { - hipFree(d_output); - } - hipFree(d_temp_storage); + hipFree(d_temp_storage); } } // namespace using namespace std::string_literals; -#define CREATE_BENCHMARK(T, left, copy) \ - benchmark::RegisterBenchmark( \ - std::string("device_adjacent_difference" \ - "." \ - "sub_algorithm_name:Subtract" + \ - std::string(left ? "Left" : "Right") + \ - std::string(copy ? "Copy" : "")) \ - .c_str(), \ - &run_benchmark, size, stream) +#define CREATE_BENCHMARK(T, left, copy) \ + benchmark::RegisterBenchmark(std::string("device_adjacent_difference" \ + "." \ + "sub_algorithm_name:Subtract" \ + + std::string(left ? "Left" : "Right") \ + + std::string(copy ? "Copy" : "")) \ + .c_str(), \ + &run_benchmark, \ + size, \ + stream) // clang-format off #define CREATE_BENCHMARKS(T) \ @@ -180,56 +209,63 @@ using namespace std::string_literals; CREATE_BENCHMARK(T, false, true) // clang-format on -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP - const hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_device_adjacent_difference" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - // Add benchmarks - const std::vector benchmarks = { - CREATE_BENCHMARKS(int), CREATE_BENCHMARKS(std::int64_t), - - CREATE_BENCHMARKS(uint8_t), - - CREATE_BENCHMARKS(float), CREATE_BENCHMARKS(double), - - CREATE_BENCHMARKS(custom_float2), CREATE_BENCHMARKS(custom_double2), - }; - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + // HIP + const hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_adjacent_difference" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + // Add benchmarks + const std::vector benchmarks = { + CREATE_BENCHMARKS(int), + CREATE_BENCHMARKS(std::int64_t), + + CREATE_BENCHMARKS(uint8_t), + + CREATE_BENCHMARKS(float), + CREATE_BENCHMARKS(double), + + CREATE_BENCHMARKS(custom_float2), + CREATE_BENCHMARKS(custom_double2), + }; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); - return 0; + return 0; } diff --git a/benchmark/benchmark_device_batch_copy.cpp b/benchmark/benchmark_device_batch_copy.cpp index a3c2bffa..ff2ccae1 100644 --- a/benchmark/benchmark_device_batch_copy.cpp +++ b/benchmark/benchmark_device_batch_copy.cpp @@ -42,7 +42,7 @@ #include constexpr uint32_t warmup_size = 5; -constexpr int32_t max_size = 1024 * 1024; +constexpr int32_t max_size = 1024 * 1024; constexpr int32_t wlev_min_size = 128; constexpr int32_t blev_min_size = 1024; @@ -69,296 +69,338 @@ constexpr int32_t blev_min_size = 1024; // ┌───┬───┬───┬───┬───┬───┬───┬───┐ // │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' // └───┴───┴───┴───┴───┴───┴───┴───┘ -template -std::vector shuffled_exclusive_scan(const std::vector &input, - RandomGenerator &rng) { - const auto n = input.size(); - assert(n > 0); - - std::vector result(n); - std::vector permute(n); - - std::iota(permute.begin(), permute.end(), 0); - std::shuffle(permute.begin(), permute.end(), rng); - - for (T i = 0, sum = 0; i < n; ++i) { - result[permute[i]] = sum; - sum += input[permute[i]]; - } +template +std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) +{ + const auto n = input.size(); + assert(n > 0); + + std::vector result(n); + std::vector permute(n); + + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); + + for(T i = 0, sum = 0; i < n; ++i) + { + result[permute[i]] = sum; + sum += input[permute[i]]; + } - return result; + return result; } using offset_type = size_t; -template struct BatchCopyData { - size_t total_num_elements = 0; - ValueType *d_input = nullptr; - ValueType *d_output = nullptr; - ValueType **d_buffer_srcs = nullptr; - ValueType **d_buffer_dsts = nullptr; - BufferSizeType *d_buffer_sizes = nullptr; - - BatchCopyData() = default; - BatchCopyData(const BatchCopyData &) = delete; - - BatchCopyData(BatchCopyData &&other) - : total_num_elements{std::exchange(other.total_num_elements, 0)}, - d_input{std::exchange(other.d_input, nullptr)}, d_output{std::exchange( - other.d_output, - nullptr)}, - d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)}, - d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)}, - d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} {} - - BatchCopyData &operator=(BatchCopyData &&other) { - total_num_elements = std::exchange(other.total_num_elements, 0); - d_input = std::exchange(other.d_input, nullptr); - d_output = std::exchange(other.d_output, nullptr); - d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); - d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); - d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); - return *this; - }; - - BatchCopyData &operator=(const BatchCopyData &) = delete; - - size_t total_num_bytes() const { - return total_num_elements * sizeof(ValueType); - } - - ~BatchCopyData() { - HIP_CHECK(hipFree(d_buffer_sizes)); - HIP_CHECK(hipFree(d_buffer_srcs)); - HIP_CHECK(hipFree(d_buffer_dsts)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_input)); - } +template +struct BatchCopyData +{ + size_t total_num_elements = 0; + ValueType* d_input = nullptr; + ValueType* d_output = nullptr; + ValueType** d_buffer_srcs = nullptr; + ValueType** d_buffer_dsts = nullptr; + BufferSizeType* d_buffer_sizes = nullptr; + + BatchCopyData() = default; + BatchCopyData(const BatchCopyData&) = delete; + + BatchCopyData(BatchCopyData&& other) + : total_num_elements{std::exchange(other.total_num_elements, 0)} + , d_input{std::exchange(other.d_input, nullptr)} + , d_output{std::exchange(other.d_output, nullptr)} + , d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)} + , d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)} + , d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} + {} + + BatchCopyData& operator=(BatchCopyData&& other) + { + total_num_elements = std::exchange(other.total_num_elements, 0); + d_input = std::exchange(other.d_input, nullptr); + d_output = std::exchange(other.d_output, nullptr); + d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); + d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); + d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); + return *this; + }; + + BatchCopyData& operator=(const BatchCopyData&) = delete; + + size_t total_num_bytes() const + { + return total_num_elements * sizeof(ValueType); + } + + ~BatchCopyData() + { + HIP_CHECK(hipFree(d_buffer_sizes)); + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + } }; -template -BatchCopyData -prepare_data(const int32_t num_tlev_buffers = 1024, - const int32_t num_wlev_buffers = 1024, - const int32_t num_blev_buffers = 1024) { - const bool shuffle_buffers = false; - - BatchCopyData result; - const size_t num_buffers = - num_tlev_buffers + num_wlev_buffers + num_blev_buffers; - - constexpr int32_t wlev_min_elems = - benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); - constexpr int32_t blev_min_elems = - benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); - constexpr int32_t max_elems = max_size / sizeof(ValueType); - - // Generate data - std::mt19937_64 rng(std::random_device{}()); - - // Number of elements in each buffer. - std::vector h_buffer_num_elements(num_buffers); - - auto iter = h_buffer_num_elements.begin(); - - iter = benchmark_utils::generate_random_data_n(iter, num_tlev_buffers, 1, - wlev_min_elems - 1, rng); - iter = benchmark_utils::generate_random_data_n( - iter, num_wlev_buffers, wlev_min_elems, blev_min_elems - 1, rng); - iter = benchmark_utils::generate_random_data_n( - iter, num_blev_buffers, blev_min_elems, max_elems, rng); - - // Shuffle the sizes so that size classes aren't clustered - std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); - - result.total_num_elements = std::accumulate( - h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); - - // Generate data. - std::independent_bits_engine bits_engine{rng}; - - const size_t num_ints = - benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); - auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); - - std::for_each( - reinterpret_cast(h_input.get()), - reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), - [&bits_engine](uint64_t &elem) { - ::new (&elem) uint64_t{bits_engine()}; - }); - - HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); - HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); - - HIP_CHECK( - hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType *))); - HIP_CHECK( - hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType *))); - HIP_CHECK( - hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); - - // Generate the source and shuffled destination offsets. - std::vector src_offsets; - std::vector dst_offsets; - - if (shuffle_buffers) { - src_offsets = - shuffled_exclusive_scan(h_buffer_num_elements, rng); - dst_offsets = - shuffled_exclusive_scan(h_buffer_num_elements, rng); - } else { - src_offsets = std::vector(num_buffers); - dst_offsets = std::vector(num_buffers); - - // Consecutive offsets (no shuffling). - // src/dst offsets first element is 0, so skip that! - std::partial_sum(h_buffer_num_elements.begin(), - h_buffer_num_elements.end() - 1, src_offsets.begin() + 1); - std::partial_sum(h_buffer_num_elements.begin(), - h_buffer_num_elements.end() - 1, dst_offsets.begin() + 1); - } - - // Generate the source and destination pointers. - std::vector h_buffer_srcs(num_buffers); - std::vector h_buffer_dsts(num_buffers); - - for (size_t i = 0; i < num_buffers; ++i) { - h_buffer_srcs[i] = result.d_input + src_offsets[i]; - h_buffer_dsts[i] = result.d_output + dst_offsets[i]; - } - - // Prepare the batch copy. - HIP_CHECK(hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(result.d_buffer_srcs, h_buffer_srcs.data(), - h_buffer_srcs.size() * sizeof(ValueType *), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(result.d_buffer_dsts, h_buffer_dsts.data(), - h_buffer_dsts.size() * sizeof(ValueType *), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(result.d_buffer_sizes, h_buffer_num_elements.data(), - h_buffer_num_elements.size() * sizeof(BufferSizeType), - hipMemcpyHostToDevice)); - - return result; +template +BatchCopyData prepare_data(const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const bool shuffle_buffers = false; + + BatchCopyData result; + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + constexpr int32_t wlev_min_elems + = benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); + constexpr int32_t blev_min_elems + = benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); + constexpr int32_t max_elems = max_size / sizeof(ValueType); + + // Generate data + std::mt19937_64 rng(std::random_device{}()); + + // Number of elements in each buffer. + std::vector h_buffer_num_elements(num_buffers); + + auto iter = h_buffer_num_elements.begin(); + + iter = benchmark_utils::generate_random_data_n(iter, + num_tlev_buffers, + 1, + wlev_min_elems - 1, + rng); + iter = benchmark_utils::generate_random_data_n(iter, + num_wlev_buffers, + wlev_min_elems, + blev_min_elems - 1, + rng); + iter = benchmark_utils::generate_random_data_n(iter, + num_blev_buffers, + blev_min_elems, + max_elems, + rng); + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + result.total_num_elements + = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); + + // Generate data. + std::independent_bits_engine bits_engine{rng}; + + const size_t num_ints + = benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); + auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); + + std::for_each(reinterpret_cast(h_input.get()), + reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), + [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); + + HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); + HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); + + HIP_CHECK(hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); + + // Generate the source and shuffled destination offsets. + std::vector src_offsets; + std::vector dst_offsets; + + if(shuffle_buffers) + { + src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + } else + { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + // Consecutive offsets (no shuffling). + // src/dst offsets first element is 0, so skip that! + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + src_offsets.begin() + 1); + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + dst_offsets.begin() + 1); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_srcs[i] = result.d_input + src_offsets[i]; + h_buffer_dsts[i] = result.d_output + dst_offsets[i]; + } + + // Prepare the batch copy. + HIP_CHECK( + hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_srcs, + h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_dsts, + h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_sizes, + h_buffer_num_elements.data(), + h_buffer_num_elements.size() * sizeof(BufferSizeType), + hipMemcpyHostToDevice)); + + return result; } -template -void run_benchmark(benchmark::State &state, hipStream_t stream, - const int32_t num_tlev_buffers = 1024, - const int32_t num_wlev_buffers = 1024, - const int32_t num_blev_buffers = 1024) { - const size_t num_buffers = - num_tlev_buffers + num_wlev_buffers + num_blev_buffers; - - size_t temp_storage_bytes = 0; - BatchCopyData data; - HIP_CHECK(hipcub::DeviceCopy::Batched(nullptr, temp_storage_bytes, - data.d_buffer_srcs, data.d_buffer_dsts, - data.d_buffer_sizes, num_buffers)); - - void *d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); - - data = prepare_data( - num_tlev_buffers, num_wlev_buffers, num_blev_buffers); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(hipcub::DeviceCopy::Batched( - d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, - data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - // HIP events creation - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - - for (auto _ : state) { - // Record start event - HIP_CHECK(hipEventRecord(start, stream)); - - HIP_CHECK(hipcub::DeviceCopy::Batched( - d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, - data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream)); - - // Record stop event and wait until it completes - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed_mseconds; - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); - state.SetIterationTime(elapsed_mseconds / 1000); - } - state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); - state.SetItemsProcessed(state.iterations() * data.total_num_elements); - - HIP_CHECK(hipFree(d_temp_storage)); +template +void run_benchmark(benchmark::State& state, + hipStream_t stream, + const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + size_t temp_storage_bytes = 0; + BatchCopyData data; + HIP_CHECK(hipcub::DeviceCopy::Batched(nullptr, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers)); + + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + data = prepare_data(num_tlev_buffers, + num_wlev_buffers, + num_blev_buffers); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(hipcub::DeviceCopy::Batched(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + HIP_CHECK(hipcub::DeviceCopy::Batched(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); + state.SetItemsProcessed(state.iterations() * data.total_num_elements); + + HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ - benchmark::RegisterBenchmark( \ - std::string("device_batch_copy" \ - ".") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_benchmark, T>( \ - state, stream, num_tlev, num_wlev, num_blev); \ - }) - -#define BENCHMARK_TYPE(item_size, item_alignment) \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) - -int32_t main(int32_t argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", 1024, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.set_optional("name_format", "name_format", "human", - "either: json,human,txt"); - - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int32_t trials = parser.get("trials"); - - // HIP - hipStream_t stream = hipStreamDefault; // default - - // Benchmark info - benchmark::AddCustomContext("size", std::to_string(size)); - - // Add benchmarks - std::vector benchmarks; - - benchmarks = {BENCHMARK_TYPE(1, 1), BENCHMARK_TYPE(1, 2), - BENCHMARK_TYPE(1, 4), BENCHMARK_TYPE(1, 8), - BENCHMARK_TYPE(2, 2), BENCHMARK_TYPE(4, 4), - BENCHMARK_TYPE(8, 8)}; - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ + benchmark::RegisterBenchmark( \ + std::string("device_batch_copy" \ + ".") \ + .c_str(), \ + [=](benchmark::State& state) \ + { \ + run_benchmark, T>(state, \ + stream, \ + num_tlev, \ + num_wlev, \ + num_blev); \ + }) + +#define BENCHMARK_TYPE(item_size, item_alignment) \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) + +int32_t main(int32_t argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", 1024, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", + "name_format", + "human", + "either: json,human,txt"); + + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int32_t trials = parser.get("trials"); + + // HIP + hipStream_t stream = hipStreamDefault; // default + + // Benchmark info + benchmark::AddCustomContext("size", std::to_string(size)); + + // Add benchmarks + std::vector benchmarks; + + benchmarks = {BENCHMARK_TYPE(1, 1), + BENCHMARK_TYPE(1, 2), + BENCHMARK_TYPE(1, 4), + BENCHMARK_TYPE(1, 8), + BENCHMARK_TYPE(2, 2), + BENCHMARK_TYPE(4, 4), + BENCHMARK_TYPE(8, 8)}; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp index beb98c99..f0f38be2 100644 --- a/benchmark/benchmark_device_batch_memcpy.cpp +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -30,8 +30,8 @@ #include "hipcub/hipcub.hpp" #ifdef __HIP_PLATFORM_AMD__ -// Only include this on AMD as it contains specialized config information -#include + // Only include this on AMD as it contains specialized config information + #include #endif #include @@ -45,7 +45,7 @@ #include constexpr uint32_t warmup_size = 5; -constexpr int32_t max_size = 1024 * 1024; +constexpr int32_t max_size = 1024 * 1024; constexpr int32_t wlev_min_size = 128; constexpr int32_t blev_min_size = 1024; @@ -72,310 +72,352 @@ constexpr int32_t blev_min_size = 1024; // ┌───┬───┬───┬───┬───┬───┬───┬───┐ // │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' // └───┴───┴───┴───┴───┴───┴───┴───┘ -template -std::vector shuffled_exclusive_scan(const std::vector &input, - RandomGenerator &rng) { - const auto n = input.size(); - assert(n > 0); - - std::vector result(n); - std::vector permute(n); - - std::iota(permute.begin(), permute.end(), 0); - std::shuffle(permute.begin(), permute.end(), rng); - - for (T i = 0, sum = 0; i < n; ++i) { - result[permute[i]] = sum; - sum += input[permute[i]]; - } +template +std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) +{ + const auto n = input.size(); + assert(n > 0); + + std::vector result(n); + std::vector permute(n); + + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); + + for(T i = 0, sum = 0; i < n; ++i) + { + result[permute[i]] = sum; + sum += input[permute[i]]; + } - return result; + return result; } using offset_type = size_t; -template struct BatchMemcpyData { - size_t total_num_elements = 0; - ValueType *d_input = nullptr; - ValueType *d_output = nullptr; - ValueType **d_buffer_srcs = nullptr; - ValueType **d_buffer_dsts = nullptr; - BufferSizeType *d_buffer_sizes = nullptr; - - BatchMemcpyData() = default; - BatchMemcpyData(const BatchMemcpyData &) = delete; - - BatchMemcpyData(BatchMemcpyData &&other) - : total_num_elements{std::exchange(other.total_num_elements, 0)}, - d_input{std::exchange(other.d_input, nullptr)}, d_output{std::exchange( - other.d_output, - nullptr)}, - d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)}, - d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)}, - d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} {} - - BatchMemcpyData &operator=(BatchMemcpyData &&other) { - total_num_elements = std::exchange(other.total_num_elements, 0); - d_input = std::exchange(other.d_input, nullptr); - d_output = std::exchange(other.d_output, nullptr); - d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); - d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); - d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); - return *this; - }; - - BatchMemcpyData &operator=(const BatchMemcpyData &) = delete; - - size_t total_num_bytes() const { - return total_num_elements * sizeof(ValueType); - } - - ~BatchMemcpyData() { - HIP_CHECK(hipFree(d_buffer_sizes)); - HIP_CHECK(hipFree(d_buffer_srcs)); - HIP_CHECK(hipFree(d_buffer_dsts)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_input)); - } +template +struct BatchMemcpyData +{ + size_t total_num_elements = 0; + ValueType* d_input = nullptr; + ValueType* d_output = nullptr; + ValueType** d_buffer_srcs = nullptr; + ValueType** d_buffer_dsts = nullptr; + BufferSizeType* d_buffer_sizes = nullptr; + + BatchMemcpyData() = default; + BatchMemcpyData(const BatchMemcpyData&) = delete; + + BatchMemcpyData(BatchMemcpyData&& other) + : total_num_elements{std::exchange(other.total_num_elements, 0)} + , d_input{std::exchange(other.d_input, nullptr)} + , d_output{std::exchange(other.d_output, nullptr)} + , d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)} + , d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)} + , d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} + {} + + BatchMemcpyData& operator=(BatchMemcpyData&& other) + { + total_num_elements = std::exchange(other.total_num_elements, 0); + d_input = std::exchange(other.d_input, nullptr); + d_output = std::exchange(other.d_output, nullptr); + d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); + d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); + d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); + return *this; + }; + + BatchMemcpyData& operator=(const BatchMemcpyData&) = delete; + + size_t total_num_bytes() const + { + return total_num_elements * sizeof(ValueType); + } + + ~BatchMemcpyData() + { + HIP_CHECK(hipFree(d_buffer_sizes)); + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + } }; -template -BatchMemcpyData -prepare_data(const int32_t num_tlev_buffers = 1024, - const int32_t num_wlev_buffers = 1024, - const int32_t num_blev_buffers = 1024) { - const bool shuffle_buffers = false; - - BatchMemcpyData result; - const size_t num_buffers = - num_tlev_buffers + num_wlev_buffers + num_blev_buffers; - - constexpr int32_t wlev_min_elems = - benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); - constexpr int32_t blev_min_elems = - benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); - constexpr int32_t max_elems = max_size / sizeof(ValueType); - - // Generate data - std::mt19937_64 rng(std::random_device{}()); - - // Number of elements in each buffer. - std::vector h_buffer_num_elements(num_buffers); - - auto iter = h_buffer_num_elements.begin(); - - iter = benchmark_utils::generate_random_data_n(iter, num_tlev_buffers, 1, - wlev_min_elems - 1, rng); - iter = benchmark_utils::generate_random_data_n( - iter, num_wlev_buffers, wlev_min_elems, blev_min_elems - 1, rng); - iter = benchmark_utils::generate_random_data_n( - iter, num_blev_buffers, blev_min_elems, max_elems, rng); - - // Shuffle the sizes so that size classes aren't clustered - std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); - - // Get the byte size of each buffer - std::vector h_buffer_num_bytes(num_buffers); - for (size_t i = 0; i < num_buffers; ++i) { - h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(ValueType); - } - - result.total_num_elements = std::accumulate( - h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); - - // Generate data. - std::independent_bits_engine bits_engine{rng}; - - const size_t num_ints = - benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); - auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); - - std::for_each( - reinterpret_cast(h_input.get()), - reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), - [&bits_engine](uint64_t &elem) { - ::new (&elem) uint64_t{bits_engine()}; - }); - - HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); - HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); - - HIP_CHECK( - hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType *))); - HIP_CHECK( - hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType *))); - HIP_CHECK( - hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); - - // Generate the source and shuffled destination offsets. - std::vector src_offsets; - std::vector dst_offsets; - - if (shuffle_buffers) { - src_offsets = - shuffled_exclusive_scan(h_buffer_num_elements, rng); - dst_offsets = - shuffled_exclusive_scan(h_buffer_num_elements, rng); - } else { - src_offsets = std::vector(num_buffers); - dst_offsets = std::vector(num_buffers); - - // Consecutive offsets (no shuffling). - // src/dst offsets first element is 0, so skip that! - std::partial_sum(h_buffer_num_elements.begin(), - h_buffer_num_elements.end() - 1, src_offsets.begin() + 1); - std::partial_sum(h_buffer_num_elements.begin(), - h_buffer_num_elements.end() - 1, dst_offsets.begin() + 1); - } - - // Generate the source and destination pointers. - std::vector h_buffer_srcs(num_buffers); - std::vector h_buffer_dsts(num_buffers); - - for (size_t i = 0; i < num_buffers; ++i) { - h_buffer_srcs[i] = result.d_input + src_offsets[i]; - h_buffer_dsts[i] = result.d_output + dst_offsets[i]; - } - - // Prepare the batch memcpy. - HIP_CHECK(hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(result.d_buffer_srcs, h_buffer_srcs.data(), - h_buffer_srcs.size() * sizeof(ValueType *), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(result.d_buffer_dsts, h_buffer_dsts.data(), - h_buffer_dsts.size() * sizeof(ValueType *), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(result.d_buffer_sizes, h_buffer_num_bytes.data(), - h_buffer_num_bytes.size() * sizeof(BufferSizeType), - hipMemcpyHostToDevice)); - - return result; +template +BatchMemcpyData prepare_data(const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const bool shuffle_buffers = false; + + BatchMemcpyData result; + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + constexpr int32_t wlev_min_elems + = benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); + constexpr int32_t blev_min_elems + = benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); + constexpr int32_t max_elems = max_size / sizeof(ValueType); + + // Generate data + std::mt19937_64 rng(std::random_device{}()); + + // Number of elements in each buffer. + std::vector h_buffer_num_elements(num_buffers); + + auto iter = h_buffer_num_elements.begin(); + + iter = benchmark_utils::generate_random_data_n(iter, + num_tlev_buffers, + 1, + wlev_min_elems - 1, + rng); + iter = benchmark_utils::generate_random_data_n(iter, + num_wlev_buffers, + wlev_min_elems, + blev_min_elems - 1, + rng); + iter = benchmark_utils::generate_random_data_n(iter, + num_blev_buffers, + blev_min_elems, + max_elems, + rng); + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + // Get the byte size of each buffer + std::vector h_buffer_num_bytes(num_buffers); + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(ValueType); + } + + result.total_num_elements + = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); + + // Generate data. + std::independent_bits_engine bits_engine{rng}; + + const size_t num_ints + = benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); + auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); + + std::for_each(reinterpret_cast(h_input.get()), + reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), + [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); + + HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); + HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); + + HIP_CHECK(hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); + + // Generate the source and shuffled destination offsets. + std::vector src_offsets; + std::vector dst_offsets; + + if(shuffle_buffers) + { + src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + } else + { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + // Consecutive offsets (no shuffling). + // src/dst offsets first element is 0, so skip that! + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + src_offsets.begin() + 1); + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + dst_offsets.begin() + 1); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_srcs[i] = result.d_input + src_offsets[i]; + h_buffer_dsts[i] = result.d_output + dst_offsets[i]; + } + + // Prepare the batch memcpy. + HIP_CHECK( + hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_srcs, + h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_dsts, + h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_sizes, + h_buffer_num_bytes.data(), + h_buffer_num_bytes.size() * sizeof(BufferSizeType), + hipMemcpyHostToDevice)); + + return result; } -template -void run_benchmark(benchmark::State &state, hipStream_t stream, - const int32_t num_tlev_buffers = 1024, - const int32_t num_wlev_buffers = 1024, - const int32_t num_blev_buffers = 1024) { - const size_t num_buffers = - num_tlev_buffers + num_wlev_buffers + num_blev_buffers; - - size_t temp_storage_bytes = 0; - BatchMemcpyData data; - HIP_CHECK(hipcub::DeviceMemcpy::Batched( - nullptr, temp_storage_bytes, data.d_buffer_srcs, data.d_buffer_dsts, - data.d_buffer_sizes, num_buffers)); - - void *d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); - - data = prepare_data( - num_tlev_buffers, num_wlev_buffers, num_blev_buffers); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(hipcub::DeviceMemcpy::Batched( - d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, - data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - // HIP events creation - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - - for (auto _ : state) { - // Record start event - HIP_CHECK(hipEventRecord(start, stream)); - - HIP_CHECK(hipcub::DeviceMemcpy::Batched( - d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, - data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream)); - - // Record stop event and wait until it completes - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed_mseconds; - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); - state.SetIterationTime(elapsed_mseconds / 1000); - } - state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); - state.SetItemsProcessed(state.iterations() * data.total_num_elements); - - HIP_CHECK(hipFree(d_temp_storage)); +template +void run_benchmark(benchmark::State& state, + hipStream_t stream, + const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + size_t temp_storage_bytes = 0; + BatchMemcpyData data; + HIP_CHECK(hipcub::DeviceMemcpy::Batched(nullptr, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers)); + + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + data = prepare_data(num_tlev_buffers, + num_wlev_buffers, + num_blev_buffers); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); + state.SetItemsProcessed(state.iterations() * data.total_num_elements); + + HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ - benchmark::RegisterBenchmark( \ - std::string("device_batch_memcpy.") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_benchmark, T>( \ - state, stream, num_tlev, num_wlev, num_blev); \ - }) - -#define BENCHMARK_TYPE(item_size, item_alignment) \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ - CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) - -int32_t main(int32_t argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", 1024, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.set_optional("name_format", "name_format", "human", - "either: json,human,txt"); - - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int32_t trials = parser.get("trials"); - - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_device_adjacent_difference" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // HIP - hipStream_t stream = hipStreamDefault; // default - - // Benchmark info - benchmark::AddCustomContext("size", std::to_string(size)); - - // Add benchmarks - std::vector benchmarks; - - benchmarks = {BENCHMARK_TYPE(1, 1), BENCHMARK_TYPE(1, 2), - BENCHMARK_TYPE(1, 4), BENCHMARK_TYPE(1, 8), - BENCHMARK_TYPE(2, 2), BENCHMARK_TYPE(4, 4), - BENCHMARK_TYPE(8, 8)}; - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ + benchmark::RegisterBenchmark( \ + std::string("device_batch_memcpy.") \ + .c_str(), \ + [=](benchmark::State& state) \ + { \ + run_benchmark, T>(state, \ + stream, \ + num_tlev, \ + num_wlev, \ + num_blev); \ + }) + +#define BENCHMARK_TYPE(item_size, item_alignment) \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) + +int32_t main(int32_t argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", 1024, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", + "name_format", + "human", + "either: json,human,txt"); + + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int32_t trials = parser.get("trials"); + + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_adjacent_difference" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // HIP + hipStream_t stream = hipStreamDefault; // default + + // Benchmark info + benchmark::AddCustomContext("size", std::to_string(size)); + + // Add benchmarks + std::vector benchmarks; + + benchmarks = {BENCHMARK_TYPE(1, 1), + BENCHMARK_TYPE(1, 2), + BENCHMARK_TYPE(1, 4), + BENCHMARK_TYPE(1, 8), + BENCHMARK_TYPE(2, 2), + BENCHMARK_TYPE(4, 4), + BENCHMARK_TYPE(8, 8)}; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_histogram.cpp b/benchmark/benchmark_device_histogram.cpp index c8cb32b7..a5019e4b 100644 --- a/benchmark/benchmark_device_histogram.cpp +++ b/benchmark/benchmark_device_histogram.cpp @@ -23,7 +23,7 @@ // CUB's implementation of DeviceRunLengthEncode has unused parameters, // disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ -#pragma GCC diagnostic ignored "-Wunused-parameter" + #pragma GCC diagnostic ignored "-Wunused-parameter" #endif #include "common_benchmark_header.hpp" @@ -36,515 +36,623 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template -std::vector generate(size_t size, int entropy_reduction, - long long lower_level, long long upper_level) { - if (entropy_reduction >= 5) { - return std::vector(size, (lower_level + upper_level) / 2); - } - - const size_t max_random_size = 1024 * 1024; - - std::random_device rd; - std::default_random_engine gen(rd()); - std::vector data(size); - std::generate(data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { - // Reduce entropy by applying bitwise AND to random bits - // "An Improved Supercomputer Sorting Benchmark", 1992 - // Kurt Thearling & Stephen Smith - auto v = gen(); - for (int e = 0; e < entropy_reduction; e++) { - v &= gen(); - } - return T(lower_level + v % (upper_level - lower_level)); - }); - for (size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), - data.begin() + i); - } - return data; +template +std::vector + generate(size_t size, int entropy_reduction, long long lower_level, long long upper_level) +{ + if(entropy_reduction >= 5) + { + return std::vector(size, (lower_level + upper_level) / 2); + } + + const size_t max_random_size = 1024 * 1024; + + std::random_device rd; + std::default_random_engine gen(rd()); + std::vector data(size); + std::generate(data.begin(), + data.begin() + std::min(size, max_random_size), + [&]() + { + // Reduce entropy by applying bitwise AND to random bits + // "An Improved Supercomputer Sorting Benchmark", 1992 + // Kurt Thearling & Stephen Smith + auto v = gen(); + for(int e = 0; e < entropy_reduction; e++) + { + v &= gen(); + } + return T(lower_level + v % (upper_level - lower_level)); + }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + } + return data; } -int get_entropy_percents(int entropy_reduction) { - switch (entropy_reduction) { - case 0: - return 100; - case 1: - return 81; - case 2: - return 54; - case 3: - return 33; - case 4: - return 20; - default: - return 0; - } +int get_entropy_percents(int entropy_reduction) +{ + switch(entropy_reduction) + { + case 0: return 100; + case 1: return 81; + case 2: return 54; + case 3: return 33; + case 4: return 20; + default: return 0; + } } const int entropy_reductions[] = {0, 2, 4, 6}; -template -void run_even_benchmark(benchmark::State &state, size_t bins, size_t scale, - int entropy_reduction, hipStream_t stream, - size_t size) { - using counter_type = unsigned int; - - const T lower_level = 0; - // casting for compilation with CUB backend because - // there is no casting from size_t (aka unsigned long) to __half - const T upper_level = static_cast(bins * scale); - - // Generate data - std::vector input = - generate(size, entropy_reduction, lower_level, upper_level); - - T *d_input; - counter_type *d_histogram; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(hipcub::DeviceHistogram::HistogramEven( - d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, - bins + 1, lower_level, upper_level, int(size), stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(hipcub::DeviceHistogram::HistogramEven( - d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, - bins + 1, lower_level, upper_level, int(size), stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(hipcub::DeviceHistogram::HistogramEven( - d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, - bins + 1, lower_level, upper_level, int(size), stream)); +template +void run_even_benchmark(benchmark::State& state, + size_t bins, + size_t scale, + int entropy_reduction, + hipStream_t stream, + size_t size) +{ + using counter_type = unsigned int; + + const T lower_level = 0; + // casting for compilation with CUB backend because + // there is no casting from size_t (aka unsigned long) to __half + const T upper_level = static_cast(bins * scale); + + // Generate data + std::vector input = generate(size, entropy_reduction, lower_level, upper_level); + + T* d_input; + counter_type* d_histogram; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(hipcub::DeviceHistogram::HistogramEven(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + bins + 1, + lower_level, + upper_level, + int(size), + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(hipcub::DeviceHistogram::HistogramEven(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + bins + 1, + lower_level, + upper_level, + int(size), + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_histogram)); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(hipcub::DeviceHistogram::HistogramEven(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + bins + 1, + lower_level, + upper_level, + int(size), + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_histogram)); } -template -void run_multi_even_benchmark(benchmark::State &state, size_t bins, - size_t scale, int entropy_reduction, - hipStream_t stream, size_t size) { - using counter_type = unsigned int; - - int num_levels[ActiveChannels]; - int lower_level[ActiveChannels]; - int upper_level[ActiveChannels]; - for (unsigned int channel = 0; channel < ActiveChannels; channel++) { - lower_level[channel] = 0; - upper_level[channel] = bins * scale; - num_levels[channel] = bins + 1; - } - - // Generate data - std::vector input = generate(size * Channels, entropy_reduction, - lower_level[0], upper_level[0]); - - T *d_input; - counter_type *d_histogram[ActiveChannels]; - HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); - for (unsigned int channel = 0; channel < ActiveChannels; channel++) { - HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type))); - } - HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), - hipMemcpyHostToDevice)); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK( - (hipcub::DeviceHistogram::MultiHistogramEven( - d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, - num_levels, lower_level, upper_level, int(size), stream))); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - (hipcub::DeviceHistogram::MultiHistogramEven( - d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, - num_levels, lower_level, upper_level, int(size), stream))); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(( - hipcub::DeviceHistogram::MultiHistogramEven( - d_temporary_storage, temporary_storage_bytes, d_input, - d_histogram, num_levels, lower_level, upper_level, int(size), - stream))); +template +void run_multi_even_benchmark(benchmark::State& state, + size_t bins, + size_t scale, + int entropy_reduction, + hipStream_t stream, + size_t size) +{ + using counter_type = unsigned int; + + int num_levels[ActiveChannels]; + int lower_level[ActiveChannels]; + int upper_level[ActiveChannels]; + for(unsigned int channel = 0; channel < ActiveChannels; channel++) + { + lower_level[channel] = 0; + upper_level[channel] = bins * scale; + num_levels[channel] = bins + 1; + } + + // Generate data + std::vector input + = generate(size * Channels, entropy_reduction, lower_level[0], upper_level[0]); + + T* d_input; + counter_type* d_histogram[ActiveChannels]; + HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); + for(unsigned int channel = 0; channel < ActiveChannels; channel++) + { + HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type))); + } + HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramEven( + d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + num_levels, + lower_level, + upper_level, + int(size), + stream))); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramEven( + d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + num_levels, + lower_level, + upper_level, + int(size), + stream))); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * - sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_input)); - for (unsigned int channel = 0; channel < ActiveChannels; channel++) { - HIP_CHECK(hipFree(d_histogram[channel])); - } + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramEven( + d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + num_levels, + lower_level, + upper_level, + int(size), + stream))); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_input)); + for(unsigned int channel = 0; channel < ActiveChannels; channel++) + { + HIP_CHECK(hipFree(d_histogram[channel])); + } } -template -void run_range_benchmark(benchmark::State &state, size_t bins, - hipStream_t stream, size_t size) { - using counter_type = unsigned int; - - // Generate data - std::vector input = benchmark_utils::get_random_data(size, 0, bins); - - std::vector levels(bins + 1); - std::iota(levels.begin(), levels.end(), static_cast(0)); - - T *d_input; - T *d_levels; - counter_type *d_histogram; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(T))); - HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_levels, levels.data(), (bins + 1) * sizeof(T), - hipMemcpyHostToDevice)); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(hipcub::DeviceHistogram::HistogramRange( - d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, - bins + 1, d_levels, int(size), stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(hipcub::DeviceHistogram::HistogramRange( - d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, - bins + 1, d_levels, int(size), stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(hipcub::DeviceHistogram::HistogramRange( - d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, - bins + 1, d_levels, int(size), stream)); +template +void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t stream, size_t size) +{ + using counter_type = unsigned int; + + // Generate data + std::vector input = benchmark_utils::get_random_data(size, 0, bins); + + std::vector levels(bins + 1); + std::iota(levels.begin(), levels.end(), static_cast(0)); + + T* d_input; + T* d_levels; + counter_type* d_histogram; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(T))); + HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_levels, levels.data(), (bins + 1) * sizeof(T), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(hipcub::DeviceHistogram::HistogramRange(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + bins + 1, + d_levels, + int(size), + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(hipcub::DeviceHistogram::HistogramRange(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + bins + 1, + d_levels, + int(size), + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_levels)); - HIP_CHECK(hipFree(d_histogram)); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(hipcub::DeviceHistogram::HistogramRange(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + bins + 1, + d_levels, + int(size), + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_levels)); + HIP_CHECK(hipFree(d_histogram)); } -template -void run_multi_range_benchmark(benchmark::State &state, size_t bins, - hipStream_t stream, size_t size) { - using counter_type = unsigned int; - - // Number of levels for a single channel - const int num_levels_channel = bins + 1; - int num_levels[ActiveChannels]; - std::vector levels[ActiveChannels]; - for (unsigned int channel = 0; channel < ActiveChannels; channel++) { - levels[channel].resize(num_levels_channel); - std::iota(levels[channel].begin(), levels[channel].end(), - static_cast(0)); - num_levels[channel] = num_levels_channel; - } - - // Generate data - std::vector input = - benchmark_utils::get_random_data(size * Channels, 0, bins); - - T *d_input; - T *d_levels[ActiveChannels]; - counter_type *d_histogram[ActiveChannels]; - HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); - for (unsigned int channel = 0; channel < ActiveChannels; channel++) { - HIP_CHECK(hipMalloc(&d_levels[channel], num_levels_channel * sizeof(T))); - HIP_CHECK(hipMalloc(&d_histogram[channel], size * sizeof(counter_type))); - } - - HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), - hipMemcpyHostToDevice)); - for (unsigned int channel = 0; channel < ActiveChannels; channel++) { - HIP_CHECK(hipMemcpy(d_levels[channel], levels[channel].data(), - num_levels_channel * sizeof(T), hipMemcpyHostToDevice)); - } - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK( - (hipcub::DeviceHistogram::MultiHistogramRange( - d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, - num_levels, d_levels, int(size), stream))); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - (hipcub::DeviceHistogram::MultiHistogramRange( - d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, - num_levels, d_levels, int(size), stream))); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange( - d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, - num_levels, d_levels, int(size), stream))); +template +void run_multi_range_benchmark(benchmark::State& state, + size_t bins, + hipStream_t stream, + size_t size) +{ + using counter_type = unsigned int; + + // Number of levels for a single channel + const int num_levels_channel = bins + 1; + int num_levels[ActiveChannels]; + std::vector levels[ActiveChannels]; + for(unsigned int channel = 0; channel < ActiveChannels; channel++) + { + levels[channel].resize(num_levels_channel); + std::iota(levels[channel].begin(), levels[channel].end(), static_cast(0)); + num_levels[channel] = num_levels_channel; } + + // Generate data + std::vector input = benchmark_utils::get_random_data(size * Channels, 0, bins); + + T* d_input; + T* d_levels[ActiveChannels]; + counter_type* d_histogram[ActiveChannels]; + HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); + for(unsigned int channel = 0; channel < ActiveChannels; channel++) + { + HIP_CHECK(hipMalloc(&d_levels[channel], num_levels_channel * sizeof(T))); + HIP_CHECK(hipMalloc(&d_histogram[channel], size * sizeof(counter_type))); + } + + HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice)); + for(unsigned int channel = 0; channel < ActiveChannels; channel++) + { + HIP_CHECK(hipMemcpy(d_levels[channel], + levels[channel].data(), + num_levels_channel * sizeof(T), + hipMemcpyHostToDevice)); + } + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange( + d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + num_levels, + d_levels, + int(size), + stream))); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * - sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_input)); - for (unsigned int channel = 0; channel < ActiveChannels; channel++) { - HIP_CHECK(hipFree(d_levels[channel])); - HIP_CHECK(hipFree(d_histogram[channel])); - } + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange( + d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + num_levels, + d_levels, + int(size), + stream))); + } + HIP_CHECK(hipDeviceSynchronize()); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange( + d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + num_levels, + d_levels, + int(size), + stream))); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_input)); + for(unsigned int channel = 0; channel < ActiveChannels; channel++) + { + HIP_CHECK(hipFree(d_levels[channel])); + HIP_CHECK(hipFree(d_histogram[channel])); + } } -template struct num_limits { - static constexpr T max() { return std::numeric_limits::max(); }; +template +struct num_limits +{ + static constexpr T max() + { + return std::numeric_limits::max(); + }; }; -template <> struct num_limits<__half> { - static constexpr double max() { return 65504.0; }; +template<> +struct num_limits<__half> +{ + static constexpr double max() + { + return 65504.0; + }; }; -#define CREATE_EVEN_BENCHMARK(VECTOR, T, BINS, SCALE) \ - if (num_limits::max() > BINS * SCALE) { \ - VECTOR.push_back(benchmark::RegisterBenchmark( \ - std::string("device_histogram_even" \ - "." \ - "(entropy_percent:" + \ - std::to_string(get_entropy_percents(entropy_reduction)) + \ - "%,bin_count:" + std::to_string(BINS) + " bins)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_even_benchmark(state, BINS, SCALE, entropy_reduction, stream, \ - size); \ - })); \ - } - -#define BENCHMARK_TYPE(VECTOR, T) \ - CREATE_EVEN_BENCHMARK(VECTOR, T, 10, 1234); \ - CREATE_EVEN_BENCHMARK(VECTOR, T, 100, 1234); \ - CREATE_EVEN_BENCHMARK(VECTOR, T, 1000, 1234); \ - CREATE_EVEN_BENCHMARK(VECTOR, T, 16, 10); \ - CREATE_EVEN_BENCHMARK(VECTOR, T, 256, 10); \ - CREATE_EVEN_BENCHMARK(VECTOR, T, 65536, 1) - -void add_even_benchmarks( - std::vector &benchmarks, - hipStream_t stream, size_t size) { - for (int entropy_reduction : entropy_reductions) { - BENCHMARK_TYPE(benchmarks, long long); - BENCHMARK_TYPE(benchmarks, int); - BENCHMARK_TYPE(benchmarks, unsigned short); - BENCHMARK_TYPE(benchmarks, uint8_t); - BENCHMARK_TYPE(benchmarks, double); - BENCHMARK_TYPE(benchmarks, float); - // this limitation can be removed once - // https://github.com/NVIDIA/cub/issues/484 is fixed +#define CREATE_EVEN_BENCHMARK(VECTOR, T, BINS, SCALE) \ + if(num_limits::max() > BINS * SCALE) \ + { \ + VECTOR.push_back(benchmark::RegisterBenchmark( \ + std::string("device_histogram_even" \ + "." \ + "(entropy_percent:" \ + + std::to_string(get_entropy_percents(entropy_reduction)) \ + + "%,bin_count:" + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_even_benchmark(state, BINS, SCALE, entropy_reduction, stream, size); })); \ + } + +#define BENCHMARK_TYPE(VECTOR, T) \ + CREATE_EVEN_BENCHMARK(VECTOR, T, 10, 1234); \ + CREATE_EVEN_BENCHMARK(VECTOR, T, 100, 1234); \ + CREATE_EVEN_BENCHMARK(VECTOR, T, 1000, 1234); \ + CREATE_EVEN_BENCHMARK(VECTOR, T, 16, 10); \ + CREATE_EVEN_BENCHMARK(VECTOR, T, 256, 10); \ + CREATE_EVEN_BENCHMARK(VECTOR, T, 65536, 1) + +void add_even_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + for(int entropy_reduction : entropy_reductions) + { + BENCHMARK_TYPE(benchmarks, long long); + BENCHMARK_TYPE(benchmarks, int); + BENCHMARK_TYPE(benchmarks, unsigned short); + BENCHMARK_TYPE(benchmarks, uint8_t); + BENCHMARK_TYPE(benchmarks, double); + BENCHMARK_TYPE(benchmarks, float); + // this limitation can be removed once + // https://github.com/NVIDIA/cub/issues/484 is fixed #ifdef __HIP_PLATFORM_AMD__ - BENCHMARK_TYPE(benchmarks, __half); + BENCHMARK_TYPE(benchmarks, __half); #endif - }; + }; } -#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ - benchmark::RegisterBenchmark( \ - std::string("device_multi_histogram_even" \ - "." \ - "(entropy_percent:" + \ - std::to_string(get_entropy_percents(entropy_reduction)) + \ - "%,bin_count:" + std::to_string(BINS) + " bins)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_multi_even_benchmark( \ - state, BINS, SCALE, entropy_reduction, stream, size); \ - }) - -void add_multi_even_benchmarks( - std::vector &benchmarks, - hipStream_t stream, size_t size) { - for (int entropy_reduction : entropy_reductions) { - std::vector bs = { - CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 10, 1234), - CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 100, 1234), - - CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 16, 10), - CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 256, 1), - - CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 16, 10), - CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 256, 10), - CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 65536, 1), +#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ + benchmark::RegisterBenchmark( \ + std::string("device_multi_histogram_even" \ + "." \ + "(entropy_percent:" \ + + std::to_string(get_entropy_percents(entropy_reduction)) \ + + "%,bin_count:" + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { \ + run_multi_even_benchmark(state, \ + BINS, \ + SCALE, \ + entropy_reduction, \ + stream, \ + size); \ + }) + +void add_multi_even_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + for(int entropy_reduction : entropy_reductions) + { + std::vector bs = { + CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 10, 1234), + CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 100, 1234), + + CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 16, 10), + CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 256, 1), + + CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 16, 10), + CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 256, 10), + CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 65536, 1), + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); - }; } -#define CREATE_RANGE_BENCHMARK(T, BINS) \ - benchmark::RegisterBenchmark(std::string("device_histogram_range" \ - "." \ - "(bin_count:" + \ - std::to_string(BINS) + " bins)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_range_benchmark(state, BINS, stream, \ - size); \ - }) - -#define BENCHMARK_RANGE_TYPE(T) \ - CREATE_RANGE_BENCHMARK(T, 10), CREATE_RANGE_BENCHMARK(T, 100), \ - CREATE_RANGE_BENCHMARK(T, 1000), CREATE_RANGE_BENCHMARK(T, 10000), \ - CREATE_RANGE_BENCHMARK(T, 100000), CREATE_RANGE_BENCHMARK(T, 1000000) - -void add_range_benchmarks( - std::vector &benchmarks, - hipStream_t stream, size_t size) { - std::vector bs = { - BENCHMARK_RANGE_TYPE(float), BENCHMARK_RANGE_TYPE(double)}; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_RANGE_BENCHMARK(T, BINS) \ + benchmark::RegisterBenchmark(std::string("device_histogram_range" \ + "." \ + "(bin_count:" \ + + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_range_benchmark(state, BINS, stream, size); }) + +#define BENCHMARK_RANGE_TYPE(T) \ + CREATE_RANGE_BENCHMARK(T, 10), CREATE_RANGE_BENCHMARK(T, 100), \ + CREATE_RANGE_BENCHMARK(T, 1000), CREATE_RANGE_BENCHMARK(T, 10000), \ + CREATE_RANGE_BENCHMARK(T, 100000), CREATE_RANGE_BENCHMARK(T, 1000000) + +void add_range_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + std::vector bs + = {BENCHMARK_RANGE_TYPE(float), BENCHMARK_RANGE_TYPE(double)}; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ - benchmark::RegisterBenchmark( \ - std::string("device_multi_histogram_range" \ - ".(bin_count:" + \ - std::to_string(BINS) + " bins)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_multi_range_benchmark(state, BINS, \ - stream, size); \ - }) - -void add_multi_range_benchmarks( - std::vector &benchmarks, - hipStream_t stream, size_t size) { - std::vector bs = { - CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10), - CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100), - CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000), - CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10000), - CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100000), - CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000000), - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ + benchmark::RegisterBenchmark( \ + std::string("device_multi_histogram_range" \ + ".(bin_count:" \ + + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_multi_range_benchmark(state, BINS, stream, size); }) + +void add_multi_range_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + std::vector bs = { + CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10), + CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100), + CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000), + CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10000), + CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100000), + CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000000), + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_histogram" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_even_benchmarks(benchmarks, stream, size); - add_multi_even_benchmarks(benchmarks, stream, size); - add_range_benchmarks(benchmarks, stream, size); - add_multi_range_benchmarks(benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_histogram" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_even_benchmarks(benchmarks, stream, size); + add_multi_even_benchmarks(benchmarks, stream, size); + add_range_benchmarks(benchmarks, stream, size); + add_multi_range_benchmarks(benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } + } + + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index 0c16d653..220ad96a 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -26,347 +26,392 @@ #include "hipcub/block/block_scan.hpp" #include "hipcub/block/block_store.hpp" -enum memory_operation_method { - direct, - striped, - vectorize, - transpose, - warp_transpose +enum memory_operation_method +{ + direct, + striped, + vectorize, + transpose, + warp_transpose }; -enum kernel_operation { - no_operation, - block_scan, - custom_operation, - atomics_no_collision, - atomics_inter_block_collision, - atomics_inter_warp_collision, +enum kernel_operation +{ + no_operation, + block_scan, + custom_operation, + atomics_no_collision, + atomics_inter_block_collision, + atomics_inter_warp_collision, }; -struct empty_storage_type {}; +struct empty_storage_type +{}; -template +template struct operation; // no operation -template -struct operation { - typedef empty_storage_type storage_type; - - HIPCUB_DEVICE inline void operator()(storage_type & /*storage*/, - T (&)[ItemsPerThread], - T * = nullptr) const {} +template +struct operation +{ + typedef empty_storage_type storage_type; + + HIPCUB_DEVICE inline void + operator()(storage_type& /*storage*/, T (&)[ItemsPerThread], T* = nullptr) const + {} }; // custom operation -template -struct operation { - typedef empty_storage_type storage_type; - - HIPCUB_DEVICE inline void operator()(storage_type &storage, - T (&input)[ItemsPerThread], - T *global_mem_output = nullptr) const { - (void)storage; - (void)global_mem_output; +template +struct operation +{ + typedef empty_storage_type storage_type; + + HIPCUB_DEVICE inline void operator()(storage_type& storage, + T (&input)[ItemsPerThread], + T* global_mem_output = nullptr) const + { + (void)storage; + (void)global_mem_output; #pragma unroll - for (unsigned int i = 0; i < ItemsPerThread; i++) { - input[i] = input[i] + 666; - constexpr unsigned int repeats = 30; + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + input[i] = input[i] + 666; + constexpr unsigned int repeats = 30; #pragma unroll - for (unsigned int j = 0; j < repeats; j++) { - input[i] = input[i] * (input[j % ItemsPerThread]); - } + for(unsigned int j = 0; j < repeats; j++) + { + input[i] = input[i] * (input[j % ItemsPerThread]); + } + } } - } }; // block scan -template -struct operation { - typedef typename hipcub::BlockScan< - T, BlockSize, hipcub::BlockScanAlgorithm::BLOCK_SCAN_WARP_SCANS> - block_scan_type; - typedef typename block_scan_type::TempStorage storage_type; - - HIPCUB_DEVICE inline void operator()(storage_type &storage, - T (&input)[ItemsPerThread], - T *global_mem_output = nullptr) { - (void)global_mem_output; - - // sync before re-using shared memory from load - __syncthreads(); - block_scan_type(storage).InclusiveScan(input, input, hipcub::Sum()); - } +template +struct operation +{ + typedef + typename hipcub::BlockScan + block_scan_type; + typedef typename block_scan_type::TempStorage storage_type; + + HIPCUB_DEVICE inline void operator()(storage_type& storage, + T (&input)[ItemsPerThread], + T* global_mem_output = nullptr) + { + (void)global_mem_output; + + // sync before re-using shared memory from load + __syncthreads(); + block_scan_type(storage).InclusiveScan(input, input, hipcub::Sum()); + } }; // atomics_no_collision -template -struct operation { - typedef empty_storage_type storage_type; - - HIPCUB_DEVICE inline void operator()(storage_type &storage, - T (&input)[ItemsPerThread], - T *global_mem_output = nullptr) { - (void)storage; - (void)input; - - const unsigned int index = - threadIdx.x * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; +template +struct operation +{ + typedef empty_storage_type storage_type; + + HIPCUB_DEVICE inline void operator()(storage_type& storage, + T (&input)[ItemsPerThread], + T* global_mem_output = nullptr) + { + (void)storage; + (void)input; + + const unsigned int index + = threadIdx.x * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; #pragma unroll - for (unsigned int i = 0; i < ItemsPerThread; i++) { - atomicAdd(&global_mem_output[index + i], T(666)); + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + atomicAdd(&global_mem_output[index + i], T(666)); + } } - } }; // atomics_inter_block_collision -template -struct operation { - typedef empty_storage_type storage_type; - - HIPCUB_DEVICE inline void operator()(storage_type &storage, - T (&input)[ItemsPerThread], - T *global_mem_output = nullptr) { - (void)storage; - (void)input; - - const unsigned int index = (threadIdx.x % warpSize) * ItemsPerThread + - blockIdx.x * blockDim.x * ItemsPerThread; +template +struct operation +{ + typedef empty_storage_type storage_type; + + HIPCUB_DEVICE inline void operator()(storage_type& storage, + T (&input)[ItemsPerThread], + T* global_mem_output = nullptr) + { + (void)storage; + (void)input; + + const unsigned int index + = (threadIdx.x % warpSize) * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; #pragma unroll - for (unsigned int i = 0; i < ItemsPerThread; i++) { - atomicAdd(&global_mem_output[index + i], T(666)); + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + atomicAdd(&global_mem_output[index + i], T(666)); + } } - } }; // atomics_inter_block_collision -template -struct operation { - typedef empty_storage_type storage_type; - - HIPCUB_DEVICE inline void operator()(storage_type &storage, - T (&input)[ItemsPerThread], - T *global_mem_output = nullptr) { - (void)storage; - (void)input; - - const unsigned int index = threadIdx.x * ItemsPerThread; +template +struct operation +{ + typedef empty_storage_type storage_type; + + HIPCUB_DEVICE inline void operator()(storage_type& storage, + T (&input)[ItemsPerThread], + T* global_mem_output = nullptr) + { + (void)storage; + (void)input; + + const unsigned int index = threadIdx.x * ItemsPerThread; #pragma unroll - for (unsigned int i = 0; i < ItemsPerThread; i++) { - atomicAdd(&global_mem_output[index + i], T(666)); + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + atomicAdd(&global_mem_output[index + i], T(666)); + } } - } }; -template struct memory_operation {}; - -template <> struct memory_operation { - static constexpr hipcub::BlockLoadAlgorithm load_type = - hipcub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT; - static constexpr hipcub::BlockStoreAlgorithm store_type = - hipcub::BlockStoreAlgorithm::BLOCK_STORE_DIRECT; +template +struct memory_operation +{}; + +template<> +struct memory_operation +{ + static constexpr hipcub::BlockLoadAlgorithm load_type + = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT; + static constexpr hipcub::BlockStoreAlgorithm store_type + = hipcub::BlockStoreAlgorithm::BLOCK_STORE_DIRECT; }; -template <> struct memory_operation { - static constexpr hipcub::BlockLoadAlgorithm load_type = - hipcub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED; - static constexpr hipcub::BlockStoreAlgorithm store_type = - hipcub::BlockStoreAlgorithm::BLOCK_STORE_STRIPED; +template<> +struct memory_operation +{ + static constexpr hipcub::BlockLoadAlgorithm load_type + = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED; + static constexpr hipcub::BlockStoreAlgorithm store_type + = hipcub::BlockStoreAlgorithm::BLOCK_STORE_STRIPED; }; -template <> struct memory_operation { - static constexpr hipcub::BlockLoadAlgorithm load_type = - hipcub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE; - static constexpr hipcub::BlockStoreAlgorithm store_type = - hipcub::BlockStoreAlgorithm::BLOCK_STORE_VECTORIZE; +template<> +struct memory_operation +{ + static constexpr hipcub::BlockLoadAlgorithm load_type + = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE; + static constexpr hipcub::BlockStoreAlgorithm store_type + = hipcub::BlockStoreAlgorithm::BLOCK_STORE_VECTORIZE; }; -template <> struct memory_operation { - static constexpr hipcub::BlockLoadAlgorithm load_type = - hipcub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE; - static constexpr hipcub::BlockStoreAlgorithm store_type = - hipcub::BlockStoreAlgorithm::BLOCK_STORE_TRANSPOSE; +template<> +struct memory_operation +{ + static constexpr hipcub::BlockLoadAlgorithm load_type + = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE; + static constexpr hipcub::BlockStoreAlgorithm store_type + = hipcub::BlockStoreAlgorithm::BLOCK_STORE_TRANSPOSE; }; -template <> struct memory_operation { - static constexpr hipcub::BlockLoadAlgorithm load_type = - hipcub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr hipcub::BlockStoreAlgorithm store_type = - hipcub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE; +template<> +struct memory_operation +{ + static constexpr hipcub::BlockLoadAlgorithm load_type + = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE; + static constexpr hipcub::BlockStoreAlgorithm store_type + = hipcub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE; }; -template -__global__ __launch_bounds__(BlockSize) void operation_kernel(T *input, - T *output, - CustomOp op) { - typedef memory_operation mem_op; - typedef hipcub::BlockLoad - load_type; - typedef hipcub::BlockStore - store_type; - - __shared__ union { - typename load_type::TempStorage load; - typename store_type::TempStorage store; - typename CustomOp::storage_type operand; - } storage; - - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int offset = blockIdx.x * items_per_block; - - T items[ItemsPerThread]; - load_type(storage.load).Load(input + offset, items); - - op(storage.operand, items, output); - // sync before re-using shared memory from load or from operand - __syncthreads(); - store_type(storage.store).Store(output + offset, items); +template +__global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) +{ + typedef memory_operation mem_op; + typedef hipcub::BlockLoad load_type; + typedef hipcub::BlockStore store_type; + + __shared__ union + { + typename load_type::TempStorage load; + typename store_type::TempStorage store; + typename CustomOp::storage_type operand; + } storage; + + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + const unsigned int offset = blockIdx.x * items_per_block; + + T items[ItemsPerThread]; + load_type(storage.load).Load(input + offset, items); + + op(storage.operand, items, output); + // sync before re-using shared memory from load or from operand + __syncthreads(); + store_type(storage.store).Store(output + offset, items); } -template -void run_benchmark(benchmark::State &state, size_t size, - const hipStream_t stream) { - const size_t grid_size = size / (BlockSize * ItemsPerThread); - std::vector input; - if (std::is_floating_point::value) { - input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); - } else { - input = benchmark_utils::get_random_data( - size, std::numeric_limits::min(), std::numeric_limits::max()); - } - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); - HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - operation selected_operation; - - // Warm-up - for (size_t i = 0; i < 10; i++) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(operation_kernel), - dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output, - selected_operation); - } - HIP_CHECK(hipDeviceSynchronize()); - - // HIP events creation - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - - const unsigned int batch_size = 10; - for (auto _ : state) { - // Record start event - HIP_CHECK(hipEventRecord(start, stream)); - - for (size_t i = 0; i < batch_size; i++) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - operation_kernel), - dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output, - selected_operation); +template +void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream) +{ + const size_t grid_size = size / (BlockSize * ItemsPerThread); + std::vector input; + if(std::is_floating_point::value) + { + input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); + } else + { + input = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); + } + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + operation selected_operation; + + // Warm-up + for(size_t i = 0; i < 10; i++) + { + hipLaunchKernelGGL(HIP_KERNEL_NAME(operation_kernel), + dim3(grid_size), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + selected_operation); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + const unsigned int batch_size = 10; + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + for(size_t i = 0; i < batch_size; i++) + { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(operation_kernel), + dim3(grid_size), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + selected_operation); + } + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); } - // Record stop event and wait until it completes - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed_mseconds; - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); - state.SetIterationTime(elapsed_mseconds / 1000); - } - - // Destroy HIP events - HIP_CHECK(hipEventDestroy(start)); - HIP_CHECK(hipEventDestroy(stop)); + // Destroy HIP events + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -template -void run_benchmark_memcpy(benchmark::State &state, size_t size, - const hipStream_t stream) { - // Allocate device buffers - // Note: since this benchmark only tests memcpy performance between device - // buffers, we don't really need to copy data into these from the host - - // whatever happens to be in memory will suffice. - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); - HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); - - // Warm-up - for (size_t i = 0; i < 10; i++) { - HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), - hipMemcpyDeviceToDevice)); - } - HIP_CHECK(hipDeviceSynchronize()); - - // HIP events creation - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - - const unsigned int batch_size = 10; - for (auto _ : state) { - // Record start event - HIP_CHECK(hipEventRecord(start, stream)); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), - hipMemcpyDeviceToDevice)); +template +void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_t stream) +{ + // Allocate device buffers + // Note: since this benchmark only tests memcpy performance between device + // buffers, we don't really need to copy data into these from the host - + // whatever happens to be in memory will suffice. + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); + + // Warm-up + for(size_t i = 0; i < 10; i++) + { + HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), hipMemcpyDeviceToDevice)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + const unsigned int batch_size = 10; + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), hipMemcpyDeviceToDevice)); + } + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); } - // Record stop event and wait until it completes - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed_mseconds; - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); - state.SetIterationTime(elapsed_mseconds / 1000); - } - - // Destroy HIP events - HIP_CHECK(hipEventDestroy(start)); - HIP_CHECK(hipEventDestroy(stop)); + // Destroy HIP events + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BS, IPT) \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("device_memory.") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_benchmark(state, SIZE, stream); \ - })); - -#define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("device_memory_memcpy.") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_benchmark_memcpy(state, SIZE, stream); \ - })); +#define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BS, IPT) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_memory.") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_benchmark(state, SIZE, stream); })); + +#define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_memory_memcpy.").c_str(), \ + [=](benchmark::State& state) { run_benchmark_memcpy(state, SIZE, stream); })); // clang-format off #define CREATE_BENCHMARK_BLOCK_SIZE(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE) \ @@ -386,59 +431,65 @@ void run_benchmark_memcpy(benchmark::State &state, size_t size, CREATE_BENCHMARK_MEM_OP(warp_transpose, OP, TYPE, SIZE) // clang-format on -template constexpr unsigned int megabytes(unsigned int size) { - return (size * (1024 * 1024 / sizeof(T))); +template +constexpr unsigned int megabytes(unsigned int size) +{ + return (size * (1024 * 1024 / sizeof(T))); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); - // Parse argv - benchmark::Initialize(&argc, argv); - const int trials = parser.get("trials"); + // Parse argv + benchmark::Initialize(&argc, argv); + const int trials = parser.get("trials"); - std::cout << "benchmark_device_memory" << std::endl; + std::cout << "benchmark_device_memory" << std::endl; - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; - // Add benchmarks - std::vector benchmarks; + // Add benchmarks + std::vector benchmarks; - // Simple memory copy from device to device, not running a kernel - CREATE_BENCHMARK_MEMCPY(int, megabytes(128)) + // Simple memory copy from device to device, not running a kernel + CREATE_BENCHMARK_MEMCPY(int, megabytes(128)) - // clang-format off + // clang-format off CREATE_BENCHMARK(no_operation, int, megabytes(128)) CREATE_BENCHMARK(block_scan, int, megabytes(128)) CREATE_BENCHMARK(custom_operation, int, megabytes(128)) CREATE_BENCHMARK(atomics_no_collision, int, megabytes(128)) CREATE_BENCHMARK(atomics_inter_block_collision, int, megabytes(128)) CREATE_BENCHMARK(atomics_inter_warp_collision, int, megabytes(128)) - // clang-format on - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); + // clang-format on + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); - return 0; + return 0; } diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp index 4cebcbd1..506a8c04 100644 --- a/benchmark/benchmark_device_merge_sort.cpp +++ b/benchmark/benchmark_device_merge_sort.cpp @@ -30,249 +30,289 @@ const size_t DEFAULT_N = 32 << 20; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template std::vector generate_keys(size_t size) { - using key_type = Key; - - if (std::is_floating_point::value) { - return benchmark_utils::get_random_data( - size, static_cast(-1000), static_cast(1000), size); - } else { - return benchmark_utils::get_random_data( - size, std::numeric_limits::min(), - std::numeric_limits::max(), size); - } +template +std::vector generate_keys(size_t size) +{ + using key_type = Key; + + if(std::is_floating_point::value) + { + return benchmark_utils::get_random_data(size, + static_cast(-1000), + static_cast(1000), + size); + } else + { + return benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max(), + size); + } } -template -void run_sort_keys_benchmark(benchmark::State &state, hipStream_t stream, - size_t size) { - using key_type = Key; - auto compare_function = [] __device__(const key_type &a, const key_type &b) { - return a < b; - }; - - auto keys_input = generate_keys(size); - - key_type *d_keys_input; - key_type *d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), - hipMemcpyHostToDevice)); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy( - d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, - size, compare_function, stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy( - d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_keys_output, size, compare_function, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy( - d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_keys_output, size, compare_function, stream)); +template +void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size) +{ + using key_type = Key; + auto compare_function = [] __device__(const key_type& a, const key_type& b) { return a < b; }; + + auto keys_input = generate_keys(size); + + key_type* d_keys_input; + key_type* d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK( + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + compare_function, + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + compare_function, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - sizeof(key_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + compare_function, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); } -template -void run_sort_pairs_benchmark(benchmark::State &state, hipStream_t stream, - size_t size) { - using key_type = Key; - using value_type = Value; - auto compare_function = [] __device__(const key_type &a, const key_type &b) { - return a < b; - }; - - auto keys_input = generate_keys(size); - std::vector values_input(size); - for (size_t i = 0; i < size; i++) { - values_input[i] = value_type(i); - } - - key_type *d_keys_input; - key_type *d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), - hipMemcpyHostToDevice)); - - value_type *d_values_input; - value_type *d_values_output; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), - size * sizeof(value_type), hipMemcpyHostToDevice)); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy( - d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_values_input, d_keys_output, d_values_output, size, compare_function, - stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy( - d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_values_input, d_keys_output, d_values_output, size, compare_function, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy( - d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_values_input, d_keys_output, d_values_output, size, - compare_function, stream)); +template +void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size) +{ + using key_type = Key; + using value_type = Value; + auto compare_function = [] __device__(const key_type& a, const key_type& b) { return a < b; }; + + auto keys_input = generate_keys(size); + std::vector values_input(size); + for(size_t i = 0; i < size; i++) + { + values_input[i] = value_type(i); + } + + key_type* d_keys_input; + key_type* d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK( + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; + HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + size * sizeof(value_type), + hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_values_input, + d_keys_output, + d_values_output, + size, + compare_function, + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_values_input, + d_keys_output, + d_values_output, + size, + compare_function, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - (sizeof(key_type) + sizeof(value_type))); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); - HIP_CHECK(hipFree(d_values_input)); - HIP_CHECK(hipFree(d_values_output)); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_values_input, + d_keys_output, + d_values_output, + size, + compare_function, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(T) \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("device_merge_sort_sort_keys" \ - ".") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_keys_benchmark(state, stream, size); \ - })); - -#define CREATE_SORT_PAIRS_BENCHMARK(T, V) \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("device_merge_sort_sort_pairs<" \ - ",key_data_type:" #T ",value_data_type:" #V ">.") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_pairs_benchmark(state, stream, size); \ - })); - -void add_sort_keys_benchmarks( - std::vector &benchmarks, - hipStream_t stream, size_t size) { - CREATE_SORT_KEYS_BENCHMARK(int) - CREATE_SORT_KEYS_BENCHMARK(long long) - CREATE_SORT_KEYS_BENCHMARK(int8_t) - CREATE_SORT_KEYS_BENCHMARK(uint8_t) - CREATE_SORT_KEYS_BENCHMARK(short) +#define CREATE_SORT_KEYS_BENCHMARK(T) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_merge_sort_sort_keys" \ + ".") \ + .c_str(), \ + [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); })); + +#define CREATE_SORT_PAIRS_BENCHMARK(T, V) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_merge_sort_sort_pairs<" \ + ",key_data_type:" #T ",value_data_type:" #V ">.") \ + .c_str(), \ + [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size); })); + +void add_sort_keys_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + CREATE_SORT_KEYS_BENCHMARK(int) + CREATE_SORT_KEYS_BENCHMARK(long long) + CREATE_SORT_KEYS_BENCHMARK(int8_t) + CREATE_SORT_KEYS_BENCHMARK(uint8_t) + CREATE_SORT_KEYS_BENCHMARK(short) } -void add_sort_pairs_benchmarks( - std::vector &benchmarks, - hipStream_t stream, size_t size) { - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - using custom_char_double = benchmark_utils::custom_type; - using custom_double_char = benchmark_utils::custom_type; - - CREATE_SORT_PAIRS_BENCHMARK(int, float) - CREATE_SORT_PAIRS_BENCHMARK(int, double) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_char_double) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_double_char) - - CREATE_SORT_PAIRS_BENCHMARK(long long, float) - CREATE_SORT_PAIRS_BENCHMARK(long long, double) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_char_double) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double_char) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) - - CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) - CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) +void add_sort_pairs_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + using custom_char_double = benchmark_utils::custom_type; + using custom_double_char = benchmark_utils::custom_type; + + CREATE_SORT_PAIRS_BENCHMARK(int, float) + CREATE_SORT_PAIRS_BENCHMARK(int, double) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_char_double) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_double_char) + + CREATE_SORT_PAIRS_BENCHMARK(long long, float) + CREATE_SORT_PAIRS_BENCHMARK(long long, double) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_char_double) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double_char) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) + + CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) + CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_device_merge_sort" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_sort_keys_benchmarks(benchmarks, stream, size); - add_sort_pairs_benchmarks(benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_merge_sort" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_sort_keys_benchmarks(benchmarks, stream, size); + add_sort_pairs_benchmarks(benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_partition.cpp b/benchmark/benchmark_device_partition.cpp index 5f40b712..786fe139 100644 --- a/benchmark/benchmark_device_partition.cpp +++ b/benchmark/benchmark_device_partition.cpp @@ -32,343 +32,415 @@ constexpr size_t DEFAULT_N = 1024 * 1024 * 32; #endif -constexpr unsigned int batch_size = 10; +constexpr unsigned int batch_size = 10; constexpr unsigned int warmup_size = 5; -namespace { -template struct LessOp { - HIPCUB_HOST_DEVICE LessOp(const T &pivot) : pivot_{pivot} {} +namespace +{ +template +struct LessOp +{ + HIPCUB_HOST_DEVICE LessOp(const T& pivot) : pivot_{pivot} {} - HIPCUB_HOST_DEVICE bool operator()(const T &val) const { - return val < pivot_; - } + HIPCUB_HOST_DEVICE bool operator()(const T& val) const + { + return val < pivot_; + } private: - T pivot_; + T pivot_; }; } // namespace -template -void run_flagged(benchmark::State &state, const hipStream_t stream, - const T threshold, const size_t size) { - const auto select_op = LessOp{threshold}; - const auto input = benchmark_utils::get_random_data( - size, static_cast(0), static_cast(100)); - - std::vector flags(size); - for (unsigned int i = 0; i < size; i++) { - flags[i] = static_cast(select_op(input[i])); - } - - T *d_input = nullptr; - F *d_flags = nullptr; - T *d_output = nullptr; - unsigned int *d_num_selected_output = nullptr; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flags, input.size() * sizeof(F))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); - - // Allocate temporary storage - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - HIP_CHECK(hipcub::DevicePartition::Flagged( - nullptr, temp_storage_bytes, d_input, d_flags, d_output, - d_num_selected_output, static_cast(input.size()), stream)); - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); - - // Warm-up - HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(F), - hipMemcpyHostToDevice)); - for (unsigned int i = 0; i < warmup_size; ++i) { - HIP_CHECK(hipcub::DevicePartition::Flagged( - d_temp_storage, temp_storage_bytes, d_input, d_flags, d_output, - d_num_selected_output, static_cast(input.size()), stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - // Run benchmark - for (auto _ : state) { - namespace chrono = std::chrono; - using clock = chrono::high_resolution_clock; - - const auto start = clock::now(); - for (unsigned int i = 0; i < batch_size; ++i) { - HIP_CHECK(hipcub::DevicePartition::Flagged( - d_temp_storage, temp_storage_bytes, d_input, d_flags, d_output, - d_num_selected_output, static_cast(input.size()), stream)); +template +void run_flagged(benchmark::State& state, + const hipStream_t stream, + const T threshold, + const size_t size) +{ + const auto select_op = LessOp{threshold}; + const auto input + = benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); + + std::vector flags(size); + for(unsigned int i = 0; i < size; i++) + { + flags[i] = static_cast(select_op(input[i])); } - HIP_CHECK(hipDeviceSynchronize()); - const auto end = clock::now(); - using seconds_d = chrono::duration; - const auto elapsed_seconds = chrono::duration_cast(end - start); + T* d_input = nullptr; + F* d_flags = nullptr; + T* d_output = nullptr; + unsigned int* d_num_selected_output = nullptr; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_flags, input.size() * sizeof(F))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); + + // Allocate temporary storage + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + HIP_CHECK(hipcub::DevicePartition::Flagged(nullptr, + temp_storage_bytes, + d_input, + d_flags, + d_output, + d_num_selected_output, + static_cast(input.size()), + stream)); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Warm-up + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(F), hipMemcpyHostToDevice)); + for(unsigned int i = 0; i < warmup_size; ++i) + { + HIP_CHECK(hipcub::DevicePartition::Flagged(d_temp_storage, + temp_storage_bytes, + d_input, + d_flags, + d_output, + d_num_selected_output, + static_cast(input.size()), + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); - state.SetIterationTime(elapsed_seconds.count()); - } + // Run benchmark + for(auto _ : state) + { + namespace chrono = std::chrono; + using clock = chrono::high_resolution_clock; + + const auto start = clock::now(); + for(unsigned int i = 0; i < batch_size; ++i) + { + HIP_CHECK(hipcub::DevicePartition::Flagged(d_temp_storage, + temp_storage_bytes, + d_input, + d_flags, + d_output, + d_num_selected_output, + static_cast(input.size()), + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + const auto end = clock::now(); + using seconds_d = chrono::duration; + const auto elapsed_seconds = chrono::duration_cast(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } - state.SetItemsProcessed(state.iterations() * batch_size * input.size()); - state.SetBytesProcessed(static_cast( - state.iterations() * batch_size * input.size() * sizeof(input[0]))); + state.SetItemsProcessed(state.iterations() * batch_size * input.size()); + state.SetBytesProcessed( + static_cast(state.iterations() * batch_size * input.size() * sizeof(input[0]))); - HIP_CHECK(hipFree(d_temp_storage)); - HIP_CHECK(hipFree(d_num_selected_output)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_flags)); - HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_num_selected_output)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_flags)); + HIP_CHECK(hipFree(d_input)); } -template -void run_predicate(benchmark::State &state, const hipStream_t stream, - const T threshold, const size_t size) { - const auto input = benchmark_utils::get_random_data( - size, static_cast(0), static_cast(100)); - - T *d_input = nullptr; - T *d_output = nullptr; - unsigned int *d_num_selected_output = nullptr; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); - - const auto select_op = LessOp{threshold}; - - // Allocate temporary storage - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - HIP_CHECK(hipcub::DevicePartition::If( - nullptr, temp_storage_bytes, d_input, d_output, d_num_selected_output, - static_cast(input.size()), select_op, stream)); - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); - - // Warm-up - HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), - hipMemcpyHostToDevice)); - for (unsigned int i = 0; i < warmup_size; ++i) { - HIP_CHECK(hipcub::DevicePartition::If( - d_temp_storage, temp_storage_bytes, d_input, d_output, - d_num_selected_output, static_cast(input.size()), select_op, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - // Run benchmark - for (auto _ : state) { - namespace chrono = std::chrono; - using clock = chrono::high_resolution_clock; - - const auto start = clock::now(); - for (unsigned int i = 0; i < batch_size; ++i) { - HIP_CHECK(hipcub::DevicePartition::If( - d_temp_storage, temp_storage_bytes, d_input, d_output, - d_num_selected_output, static_cast(input.size()), select_op, - stream)); +template +void run_predicate(benchmark::State& state, + const hipStream_t stream, + const T threshold, + const size_t size) +{ + const auto input + = benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); + + T* d_input = nullptr; + T* d_output = nullptr; + unsigned int* d_num_selected_output = nullptr; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); + + const auto select_op = LessOp{threshold}; + + // Allocate temporary storage + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + HIP_CHECK(hipcub::DevicePartition::If(nullptr, + temp_storage_bytes, + d_input, + d_output, + d_num_selected_output, + static_cast(input.size()), + select_op, + stream)); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Warm-up + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + for(unsigned int i = 0; i < warmup_size; ++i) + { + HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, + temp_storage_bytes, + d_input, + d_output, + d_num_selected_output, + static_cast(input.size()), + select_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - const auto end = clock::now(); - using seconds_d = chrono::duration; - const auto elapsed_seconds = chrono::duration_cast(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } + // Run benchmark + for(auto _ : state) + { + namespace chrono = std::chrono; + using clock = chrono::high_resolution_clock; + + const auto start = clock::now(); + for(unsigned int i = 0; i < batch_size; ++i) + { + HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, + temp_storage_bytes, + d_input, + d_output, + d_num_selected_output, + static_cast(input.size()), + select_op, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + const auto end = clock::now(); + using seconds_d = chrono::duration; + const auto elapsed_seconds = chrono::duration_cast(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } - state.SetItemsProcessed(state.iterations() * batch_size * input.size()); - state.SetBytesProcessed(static_cast( - state.iterations() * batch_size * input.size() * sizeof(input[0]))); + state.SetItemsProcessed(state.iterations() * batch_size * input.size()); + state.SetBytesProcessed( + static_cast(state.iterations() * batch_size * input.size() * sizeof(input[0]))); - HIP_CHECK(hipFree(d_temp_storage)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_num_selected_output)); + HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_num_selected_output)); } -template -void run_threeway(benchmark::State &state, const hipStream_t stream, - const T small_threshold, const T large_threshold, - const size_t size) { - const auto input = benchmark_utils::get_random_data( - size, static_cast(0), static_cast(100)); - - T *d_input = nullptr; - T *d_first_output = nullptr; - T *d_second_output = nullptr; - T *d_unselected_output = nullptr; - unsigned int *d_num_selected_output = nullptr; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_first_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_second_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_unselected_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_num_selected_output, 2 * sizeof(unsigned int))); - - const auto select_first_part_op = LessOp{small_threshold}; - const auto select_second_part_op = LessOp{large_threshold}; - - // Allocate temporary storage - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - HIP_CHECK(hipcub::DevicePartition::If( - nullptr, temp_storage_bytes, d_input, d_first_output, d_second_output, - d_unselected_output, d_num_selected_output, - static_cast(input.size()), select_first_part_op, - select_second_part_op, stream)); - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); - - // Warm-up - HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), - hipMemcpyHostToDevice)); - for (unsigned int i = 0; i < warmup_size; ++i) { - HIP_CHECK(hipcub::DevicePartition::If( - d_temp_storage, temp_storage_bytes, d_input, d_first_output, - d_second_output, d_unselected_output, d_num_selected_output, - static_cast(input.size()), select_first_part_op, - select_second_part_op, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - // Run benchmark - for (auto _ : state) { - namespace chrono = std::chrono; - using clock = chrono::high_resolution_clock; - - const auto start = clock::now(); - for (unsigned int i = 0; i < batch_size; ++i) { - HIP_CHECK(hipcub::DevicePartition::If( - d_temp_storage, temp_storage_bytes, d_input, d_first_output, - d_second_output, d_unselected_output, d_num_selected_output, - static_cast(input.size()), select_first_part_op, - select_second_part_op, stream)); +template +void run_threeway(benchmark::State& state, + const hipStream_t stream, + const T small_threshold, + const T large_threshold, + const size_t size) +{ + const auto input + = benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); + + T* d_input = nullptr; + T* d_first_output = nullptr; + T* d_second_output = nullptr; + T* d_unselected_output = nullptr; + unsigned int* d_num_selected_output = nullptr; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_first_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_second_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_unselected_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_num_selected_output, 2 * sizeof(unsigned int))); + + const auto select_first_part_op = LessOp{small_threshold}; + const auto select_second_part_op = LessOp{large_threshold}; + + // Allocate temporary storage + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + HIP_CHECK(hipcub::DevicePartition::If(nullptr, + temp_storage_bytes, + d_input, + d_first_output, + d_second_output, + d_unselected_output, + d_num_selected_output, + static_cast(input.size()), + select_first_part_op, + select_second_part_op, + stream)); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Warm-up + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + for(unsigned int i = 0; i < warmup_size; ++i) + { + HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, + temp_storage_bytes, + d_input, + d_first_output, + d_second_output, + d_unselected_output, + d_num_selected_output, + static_cast(input.size()), + select_first_part_op, + select_second_part_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - const auto end = clock::now(); - using seconds_d = chrono::duration; - const auto elapsed_seconds = chrono::duration_cast(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } + // Run benchmark + for(auto _ : state) + { + namespace chrono = std::chrono; + using clock = chrono::high_resolution_clock; + + const auto start = clock::now(); + for(unsigned int i = 0; i < batch_size; ++i) + { + HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, + temp_storage_bytes, + d_input, + d_first_output, + d_second_output, + d_unselected_output, + d_num_selected_output, + static_cast(input.size()), + select_first_part_op, + select_second_part_op, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + const auto end = clock::now(); + using seconds_d = chrono::duration; + const auto elapsed_seconds = chrono::duration_cast(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } - state.SetItemsProcessed(state.iterations() * batch_size * input.size()); - state.SetBytesProcessed(static_cast( - state.iterations() * batch_size * input.size() * sizeof(input[0]))); + state.SetItemsProcessed(state.iterations() * batch_size * input.size()); + state.SetBytesProcessed( + static_cast(state.iterations() * batch_size * input.size() * sizeof(input[0]))); - HIP_CHECK(hipFree(d_temp_storage)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_first_output)); - HIP_CHECK(hipFree(d_second_output)); - HIP_CHECK(hipFree(d_unselected_output)); - HIP_CHECK(hipFree(d_num_selected_output)); + HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_first_output)); + HIP_CHECK(hipFree(d_second_output)); + HIP_CHECK(hipFree(d_unselected_output)); + HIP_CHECK(hipFree(d_num_selected_output)); } -#define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T) \ - benchmark::RegisterBenchmark( \ - std::string("device_parition_flagged.(split_threshold:" #SPLIT_T "%)") \ - .c_str(), \ - &run_flagged, stream, static_cast(SPLIT_T), size) - -#define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T) \ - benchmark::RegisterBenchmark( \ - std::string("device_parition_predicate.(split_threshold:" #SPLIT_T "%)") \ - .c_str(), \ - &run_predicate, stream, static_cast(SPLIT_T), size) - -#define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T) \ - benchmark::RegisterBenchmark( \ - std::string("device_parition_three_way" \ - ".(small_threshold:" #SMALL_T \ - "%,large_threshold:" #LARGE_T "%)") \ - .c_str(), \ - &run_threeway, stream, static_cast(SMALL_T), \ - static_cast(LARGE_T), size) - -#define BENCHMARK_FLAGGED_TYPE(type, flag_type) \ - CREATE_BENCHMARK_FLAGGED(type, flag_type, 33), \ - CREATE_BENCHMARK_FLAGGED(type, flag_type, 50), \ - CREATE_BENCHMARK_FLAGGED(type, flag_type, 60), \ - CREATE_BENCHMARK_FLAGGED(type, flag_type, 90) - -#define BENCHMARK_PREDICATE_TYPE(type) \ - CREATE_BENCHMARK_PREDICATE(type, 33), CREATE_BENCHMARK_PREDICATE(type, 50), \ - CREATE_BENCHMARK_PREDICATE(type, 60), \ - CREATE_BENCHMARK_PREDICATE(type, 90) - -#define BENCHMARK_THREEWAY_TYPE(type) \ - CREATE_BENCHMARK_THREEWAY(type, 33, 66), \ - CREATE_BENCHMARK_THREEWAY(type, 10, 66), \ - CREATE_BENCHMARK_THREEWAY(type, 50, 60), \ - CREATE_BENCHMARK_THREEWAY(type, 50, 90) - -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_partition" << std::endl; - - // HIP - const hipStream_t stream = 0; // default - { - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - } - - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - // Add benchmarks - std::vector benchmarks = { - BENCHMARK_FLAGGED_TYPE(int8_t, unsigned char), - BENCHMARK_FLAGGED_TYPE(int, unsigned char), - BENCHMARK_FLAGGED_TYPE(float, unsigned char), - BENCHMARK_FLAGGED_TYPE(long long, uint8_t), - BENCHMARK_FLAGGED_TYPE(double, int8_t), - BENCHMARK_FLAGGED_TYPE(custom_float2, int8_t), - BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), - - BENCHMARK_PREDICATE_TYPE(int8_t), - BENCHMARK_PREDICATE_TYPE(int), - BENCHMARK_PREDICATE_TYPE(float), - BENCHMARK_PREDICATE_TYPE(long long), - BENCHMARK_PREDICATE_TYPE(double), - BENCHMARK_PREDICATE_TYPE(custom_float2), - BENCHMARK_PREDICATE_TYPE(custom_double2), - - BENCHMARK_THREEWAY_TYPE(int8_t), - BENCHMARK_THREEWAY_TYPE(int), - BENCHMARK_THREEWAY_TYPE(float), - BENCHMARK_THREEWAY_TYPE(long long), - BENCHMARK_THREEWAY_TYPE(double), - BENCHMARK_THREEWAY_TYPE(custom_float2), - BENCHMARK_THREEWAY_TYPE(custom_double2), - }; - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +#define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T) \ + benchmark::RegisterBenchmark(std::string("device_parition_flagged.(split_threshold:" #SPLIT_T \ + "%)") \ + .c_str(), \ + &run_flagged, \ + stream, \ + static_cast(SPLIT_T), \ + size) + +#define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T) \ + benchmark::RegisterBenchmark( \ + std::string("device_parition_predicate.(split_threshold:" #SPLIT_T "%)") \ + .c_str(), \ + &run_predicate, \ + stream, \ + static_cast(SPLIT_T), \ + size) + +#define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T) \ + benchmark::RegisterBenchmark(std::string("device_parition_three_way" \ + ".(small_threshold:" #SMALL_T \ + "%,large_threshold:" #LARGE_T "%)") \ + .c_str(), \ + &run_threeway, \ + stream, \ + static_cast(SMALL_T), \ + static_cast(LARGE_T), \ + size) + +#define BENCHMARK_FLAGGED_TYPE(type, flag_type) \ + CREATE_BENCHMARK_FLAGGED(type, flag_type, 33), CREATE_BENCHMARK_FLAGGED(type, flag_type, 50), \ + CREATE_BENCHMARK_FLAGGED(type, flag_type, 60), \ + CREATE_BENCHMARK_FLAGGED(type, flag_type, 90) + +#define BENCHMARK_PREDICATE_TYPE(type) \ + CREATE_BENCHMARK_PREDICATE(type, 33), CREATE_BENCHMARK_PREDICATE(type, 50), \ + CREATE_BENCHMARK_PREDICATE(type, 60), CREATE_BENCHMARK_PREDICATE(type, 90) + +#define BENCHMARK_THREEWAY_TYPE(type) \ + CREATE_BENCHMARK_THREEWAY(type, 33, 66), CREATE_BENCHMARK_THREEWAY(type, 10, 66), \ + CREATE_BENCHMARK_THREEWAY(type, 50, 60), CREATE_BENCHMARK_THREEWAY(type, 50, 90) + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_partition" << std::endl; + + // HIP + const hipStream_t stream = 0; // default + { + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + } + + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + // Add benchmarks + std::vector benchmarks = { + BENCHMARK_FLAGGED_TYPE(int8_t, unsigned char), + BENCHMARK_FLAGGED_TYPE(int, unsigned char), + BENCHMARK_FLAGGED_TYPE(float, unsigned char), + BENCHMARK_FLAGGED_TYPE(long long, uint8_t), + BENCHMARK_FLAGGED_TYPE(double, int8_t), + BENCHMARK_FLAGGED_TYPE(custom_float2, int8_t), + BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), + + BENCHMARK_PREDICATE_TYPE(int8_t), + BENCHMARK_PREDICATE_TYPE(int), + BENCHMARK_PREDICATE_TYPE(float), + BENCHMARK_PREDICATE_TYPE(long long), + BENCHMARK_PREDICATE_TYPE(double), + BENCHMARK_PREDICATE_TYPE(custom_float2), + BENCHMARK_PREDICATE_TYPE(custom_double2), + + BENCHMARK_THREEWAY_TYPE(int8_t), + BENCHMARK_THREEWAY_TYPE(int), + BENCHMARK_THREEWAY_TYPE(float), + BENCHMARK_THREEWAY_TYPE(long long), + BENCHMARK_THREEWAY_TYPE(double), + BENCHMARK_THREEWAY_TYPE(custom_float2), + BENCHMARK_THREEWAY_TYPE(custom_double2), + }; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp index 776fc6a4..366e62d9 100644 --- a/benchmark/benchmark_device_radix_sort.cpp +++ b/benchmark/benchmark_device_radix_sort.cpp @@ -32,366 +32,477 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template std::vector generate_keys(size_t size) { - using key_type = Key; - - if (std::is_floating_point::value) { - return benchmark_utils::get_random_data(size, (key_type)-1000, - (key_type) + 1000, size); - } else { - return benchmark_utils::get_random_data( - size, std::numeric_limits::min(), - std::numeric_limits::max(), size); - } +template +std::vector generate_keys(size_t size) +{ + using key_type = Key; + + if(std::is_floating_point::value) + { + return benchmark_utils::get_random_data(size, + (key_type)-1000, + (key_type) + 1000, + size); + } else + { + return benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max(), + size); + } } -template -auto invoke_sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, - Key *d_keys_input, Key *d_keys_output, size_t size, +template +auto invoke_sort_keys(void* d_temp_storage, + size_t& temp_storage_bytes, + Key* d_keys_input, + Key* d_keys_output, + size_t size, hipStream_t stream) - -> std::enable_if_t::value, - hipError_t> { - return hipcub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, - d_keys_input, d_keys_output, size, 0, - sizeof(Key) * 8, stream); + -> std::enable_if_t::value, hipError_t> +{ + return hipcub::DeviceRadixSort::SortKeys(d_temp_storage, + temp_storage_bytes, + d_keys_input, + d_keys_output, + size, + 0, + sizeof(Key) * 8, + stream); } -template -auto invoke_sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, - Key *d_keys_input, Key *d_keys_output, size_t size, +template +auto invoke_sort_keys(void* d_temp_storage, + size_t& temp_storage_bytes, + Key* d_keys_input, + Key* d_keys_output, + size_t size, hipStream_t stream) - -> std::enable_if_t::value, - hipError_t> { - return hipcub::DeviceRadixSort::SortKeysDescending( - d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, size, 0, - sizeof(Key) * 8, stream); + -> std::enable_if_t::value, hipError_t> +{ + return hipcub::DeviceRadixSort::SortKeysDescending(d_temp_storage, + temp_storage_bytes, + d_keys_input, + d_keys_output, + size, + 0, + sizeof(Key) * 8, + stream); } -template -auto invoke_sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, - Key *d_keys_input, Key *d_keys_output, size_t size, +template +auto invoke_sort_keys(void* d_temp_storage, + size_t& temp_storage_bytes, + Key* d_keys_input, + Key* d_keys_output, + size_t size, hipStream_t stream) - -> std::enable_if_t::value, - hipError_t> { - return hipcub::DeviceRadixSort::SortKeys( - d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, size, - benchmark_utils::custom_type_decomposer{}, stream); + -> std::enable_if_t::value, hipError_t> +{ + return hipcub::DeviceRadixSort::SortKeys(d_temp_storage, + temp_storage_bytes, + d_keys_input, + d_keys_output, + size, + benchmark_utils::custom_type_decomposer{}, + stream); } -template -auto invoke_sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, - Key *d_keys_input, Key *d_keys_output, size_t size, +template +auto invoke_sort_keys(void* d_temp_storage, + size_t& temp_storage_bytes, + Key* d_keys_input, + Key* d_keys_output, + size_t size, hipStream_t stream) - -> std::enable_if_t< - Descending && benchmark_utils::is_custom_type::value, hipError_t> { - return hipcub::DeviceRadixSort::SortKeysDescending( - d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, size, - benchmark_utils::custom_type_decomposer{}, stream); + -> std::enable_if_t::value, hipError_t> +{ + return hipcub::DeviceRadixSort::SortKeysDescending( + d_temp_storage, + temp_storage_bytes, + d_keys_input, + d_keys_output, + size, + benchmark_utils::custom_type_decomposer{}, + stream); } -template -void run_sort_keys_benchmark(benchmark::State &state, hipStream_t stream, - size_t size, - std::shared_ptr> keys_input) { - using key_type = Key; - key_type *d_keys_input; - key_type *d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK(hipMemcpy(d_keys_input, keys_input->data(), size * sizeof(key_type), - hipMemcpyHostToDevice)); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(invoke_sort_keys(d_temporary_storage, - temporary_storage_bytes, d_keys_input, - d_keys_output, size, stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(invoke_sort_keys( - d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_keys_output, size, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(invoke_sort_keys( - d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_keys_output, size, stream)); +template +void run_sort_keys_benchmark(benchmark::State& state, + hipStream_t stream, + size_t size, + std::shared_ptr> keys_input) +{ + using key_type = Key; + key_type* d_keys_input; + key_type* d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_keys_input, + keys_input->data(), + size * sizeof(key_type), + hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(invoke_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(invoke_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - sizeof(key_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(invoke_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); } -template -auto invoke_sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, - Key *d_keys_input, Key *d_keys_output, - Value *d_values_input, Value *d_values_output, - size_t size, hipStream_t stream) - -> std::enable_if_t::value, - hipError_t> { - return hipcub::DeviceRadixSort::SortPairs( - d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, - d_values_input, d_values_output, size, 0, sizeof(Key) * 8, stream); +template +auto invoke_sort_pairs(void* d_temp_storage, + size_t& temp_storage_bytes, + Key* d_keys_input, + Key* d_keys_output, + Value* d_values_input, + Value* d_values_output, + size_t size, + hipStream_t stream) + -> std::enable_if_t::value, hipError_t> +{ + return hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + 0, + sizeof(Key) * 8, + stream); } -template -auto invoke_sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, - Key *d_keys_input, Key *d_keys_output, - Value *d_values_input, Value *d_values_output, - size_t size, hipStream_t stream) - -> std::enable_if_t::value, - hipError_t> { - return hipcub::DeviceRadixSort::SortPairsDescending( - d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, - d_values_input, d_values_output, size, 0, sizeof(Key) * 8, stream); +template +auto invoke_sort_pairs(void* d_temp_storage, + size_t& temp_storage_bytes, + Key* d_keys_input, + Key* d_keys_output, + Value* d_values_input, + Value* d_values_output, + size_t size, + hipStream_t stream) + -> std::enable_if_t::value, hipError_t> +{ + return hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + 0, + sizeof(Key) * 8, + stream); } -template -auto invoke_sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, - Key *d_keys_input, Key *d_keys_output, - Value *d_values_input, Value *d_values_output, - size_t size, hipStream_t stream) - -> std::enable_if_t::value, - hipError_t> { - return hipcub::DeviceRadixSort::SortPairs( - d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, - d_values_input, d_values_output, size, - benchmark_utils::custom_type_decomposer{}, stream); +template +auto invoke_sort_pairs(void* d_temp_storage, + size_t& temp_storage_bytes, + Key* d_keys_input, + Key* d_keys_output, + Value* d_values_input, + Value* d_values_output, + size_t size, + hipStream_t stream) + -> std::enable_if_t::value, hipError_t> +{ + return hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + benchmark_utils::custom_type_decomposer{}, + stream); } -template -auto invoke_sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, - Key *d_keys_input, Key *d_keys_output, - Value *d_values_input, Value *d_values_output, - size_t size, hipStream_t stream) - -> std::enable_if_t< - Descending && benchmark_utils::is_custom_type::value, hipError_t> { - return hipcub::DeviceRadixSort::SortPairsDescending( - d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, - d_values_input, d_values_output, size, - benchmark_utils::custom_type_decomposer{}, stream); +template +auto invoke_sort_pairs(void* d_temp_storage, + size_t& temp_storage_bytes, + Key* d_keys_input, + Key* d_keys_output, + Value* d_values_input, + Value* d_values_output, + size_t size, + hipStream_t stream) + -> std::enable_if_t::value, hipError_t> +{ + return hipcub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, + temp_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + benchmark_utils::custom_type_decomposer{}, + stream); } -template -void run_sort_pairs_benchmark(benchmark::State &state, hipStream_t stream, - size_t size, - std::shared_ptr> keys_input) { - using key_type = Key; - using value_type = Value; - std::vector values_input(size); - for (size_t i = 0; i < size; i++) { - values_input[i] = value_type(i); - } - - key_type *d_keys_input; - key_type *d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK(hipMemcpy(d_keys_input, keys_input->data(), size * sizeof(key_type), - hipMemcpyHostToDevice)); - - value_type *d_values_input; - value_type *d_values_output; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), - size * sizeof(value_type), hipMemcpyHostToDevice)); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(invoke_sort_pairs( - d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, - d_values_input, d_values_output, size, stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(invoke_sort_pairs( - d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_keys_output, d_values_input, d_values_output, size, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(invoke_sort_pairs( - d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_keys_output, d_values_input, d_values_output, size, stream)); +template +void run_sort_pairs_benchmark(benchmark::State& state, + hipStream_t stream, + size_t size, + std::shared_ptr> keys_input) +{ + using key_type = Key; + using value_type = Value; + std::vector values_input(size); + for(size_t i = 0; i < size; i++) + { + values_input[i] = value_type(i); + } + + key_type* d_keys_input; + key_type* d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_keys_input, + keys_input->data(), + size * sizeof(key_type), + hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; + HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + size * sizeof(value_type), + hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(invoke_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(invoke_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - (sizeof(key_type) + sizeof(value_type))); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); - HIP_CHECK(hipFree(d_values_input)); - HIP_CHECK(hipFree(d_values_output)); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(invoke_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(Key) \ - { \ - auto keys_input = \ - std::make_shared>(generate_keys(size)); \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("device_radix_sort_keys_ascending" \ - ".") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_keys_benchmark(state, stream, size, keys_input); \ - })); \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("device_radix_sort_keys_descending" \ - ".") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_keys_benchmark(state, stream, size, keys_input); \ - })); \ - } - -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ - { \ - auto keys_input = \ - std::make_shared>(generate_keys(size)); \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("device_radix_sort_pairs_ascending" \ - ".") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_pairs_benchmark(state, stream, size, \ - keys_input); \ - })); \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("device_radix_sort_pairs_descending" \ - ".") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_pairs_benchmark(state, stream, size, \ - keys_input); \ - })); \ - } - -void add_sort_keys_benchmarks( - std::vector &benchmarks, - hipStream_t stream, size_t size) { - using custom_int_t = benchmark_utils::custom_type; - CREATE_SORT_KEYS_BENCHMARK(int) - CREATE_SORT_KEYS_BENCHMARK(long long) - CREATE_SORT_KEYS_BENCHMARK(int8_t) - CREATE_SORT_KEYS_BENCHMARK(uint8_t) - CREATE_SORT_KEYS_BENCHMARK(short) - CREATE_SORT_KEYS_BENCHMARK(custom_int_t) +#define CREATE_SORT_KEYS_BENCHMARK(Key) \ + { \ + auto keys_input = std::make_shared>(generate_keys(size)); \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_radix_sort_keys_ascending" \ + ".") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, stream, size, keys_input); })); \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_radix_sort_keys_descending" \ + ".") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, stream, size, keys_input); })); \ + } + +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ + { \ + auto keys_input = std::make_shared>(generate_keys(size)); \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_radix_sort_pairs_ascending" \ + ".") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, stream, size, keys_input); })); \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_radix_sort_pairs_descending" \ + ".") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, stream, size, keys_input); })); \ + } + +void add_sort_keys_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + using custom_int_t = benchmark_utils::custom_type; + CREATE_SORT_KEYS_BENCHMARK(int) + CREATE_SORT_KEYS_BENCHMARK(long long) + CREATE_SORT_KEYS_BENCHMARK(int8_t) + CREATE_SORT_KEYS_BENCHMARK(uint8_t) + CREATE_SORT_KEYS_BENCHMARK(short) + CREATE_SORT_KEYS_BENCHMARK(custom_int_t) } -void add_sort_pairs_benchmarks( - std::vector &benchmarks, - hipStream_t stream, size_t size) { - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - using custom_char_double = benchmark_utils::custom_type; - using custom_double_char = benchmark_utils::custom_type; - using custom_int_t = benchmark_utils::custom_type; - - CREATE_SORT_PAIRS_BENCHMARK(int, float) - CREATE_SORT_PAIRS_BENCHMARK(int, double) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_char_double) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_double_char) - - CREATE_SORT_PAIRS_BENCHMARK(long long, float) - CREATE_SORT_PAIRS_BENCHMARK(long long, double) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_char_double) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double_char) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) - - CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) - CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) - - CREATE_SORT_PAIRS_BENCHMARK(custom_int_t, float) +void add_sort_pairs_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + using custom_char_double = benchmark_utils::custom_type; + using custom_double_char = benchmark_utils::custom_type; + using custom_int_t = benchmark_utils::custom_type; + + CREATE_SORT_PAIRS_BENCHMARK(int, float) + CREATE_SORT_PAIRS_BENCHMARK(int, double) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_char_double) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_double_char) + + CREATE_SORT_PAIRS_BENCHMARK(long long, float) + CREATE_SORT_PAIRS_BENCHMARK(long long, double) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_char_double) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double_char) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) + + CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) + CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) + + CREATE_SORT_PAIRS_BENCHMARK(custom_int_t, float) } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_radix_sort" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_sort_keys_benchmarks(benchmarks, stream, size); - add_sort_pairs_benchmarks(benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_radix_sort" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_sort_keys_benchmarks(benchmarks, stream, size); + add_sort_pairs_benchmarks(benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp index 89c8aa55..2a4d9df7 100644 --- a/benchmark/benchmark_device_reduce.cpp +++ b/benchmark/benchmark_device_reduce.cpp @@ -29,156 +29,170 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template -void run_benchmark(benchmark::State &state, size_t size, - const hipStream_t stream, ReduceKernel reduce) { - std::vector input = - benchmark_utils::get_random_data(size, T(0), T(1000)); - - T *d_input; - OutputT *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, sizeof(OutputT))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes = 0; - void *d_temp_storage = nullptr; - // Get size of d_temp_storage - HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, - size, stream)); - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, - size, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, - d_output, size, stream)); +template +void run_benchmark(benchmark::State& state, + size_t size, + const hipStream_t stream, + ReduceKernel reduce) +{ + std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); + + T* d_input; + OutputT* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, sizeof(OutputT))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes = 0; + void* d_temp_storage = nullptr; + // Get size of d_temp_storage + HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream)); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream)); } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_temp_storage)); -} + HIP_CHECK(hipDeviceSynchronize()); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK( + reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream)); + } + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); -template struct Benchmark; + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temp_storage)); +} -template struct Benchmark { - static void run(benchmark::State &state, size_t size, - const hipStream_t stream) { - hipError_t (*ptr_to_sum)(void *, size_t &, T *, T *, int, hipStream_t) = - &hipcub::DeviceReduce::Sum; - run_benchmark(state, size, stream, ptr_to_sum); - } +template +struct Benchmark; + +template +struct Benchmark +{ + static void run(benchmark::State& state, size_t size, const hipStream_t stream) + { + hipError_t (*ptr_to_sum)(void*, size_t&, T*, T*, int, hipStream_t) + = &hipcub::DeviceReduce::Sum; + run_benchmark(state, size, stream, ptr_to_sum); + } }; -template struct Benchmark { - static void run(benchmark::State &state, size_t size, - const hipStream_t stream) { - hipError_t (*ptr_to_min)(void *, size_t &, T *, T *, int, hipStream_t) = - &hipcub::DeviceReduce::Min; - run_benchmark(state, size, stream, ptr_to_min); - } +template +struct Benchmark +{ + static void run(benchmark::State& state, size_t size, const hipStream_t stream) + { + hipError_t (*ptr_to_min)(void*, size_t&, T*, T*, int, hipStream_t) + = &hipcub::DeviceReduce::Min; + run_benchmark(state, size, stream, ptr_to_min); + } }; -template struct Benchmark { - using Difference = int; - using Iterator = typename hipcub::ArgIndexInputIterator; - using KeyValue = typename Iterator::value_type; - - static void run(benchmark::State &state, size_t size, - const hipStream_t stream) { - hipError_t (*ptr_to_argmin)(void *, size_t &, T *, KeyValue *, int, - hipStream_t) = &hipcub::DeviceReduce::ArgMin; - run_benchmark(state, size, stream, ptr_to_argmin); - } +template +struct Benchmark +{ + using Difference = int; + using Iterator = typename hipcub::ArgIndexInputIterator; + using KeyValue = typename Iterator::value_type; + + static void run(benchmark::State& state, size_t size, const hipStream_t stream) + { + hipError_t (*ptr_to_argmin)(void*, size_t&, T*, KeyValue*, int, hipStream_t) + = &hipcub::DeviceReduce::ArgMin; + run_benchmark(state, size, stream, ptr_to_argmin); + } }; -#define CREATE_BENCHMARK(T, REDUCE_OP) \ - benchmark::RegisterBenchmark(std::string("device_reduce" \ - ".") \ - .c_str(), \ - &Benchmark::run, size, stream) - -#define CREATE_BENCHMARKS(REDUCE_OP) \ - CREATE_BENCHMARK(int, REDUCE_OP), CREATE_BENCHMARK(long long, REDUCE_OP), \ - CREATE_BENCHMARK(float, REDUCE_OP), CREATE_BENCHMARK(double, REDUCE_OP), \ - CREATE_BENCHMARK(int8_t, REDUCE_OP) - -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_reduce" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - using custom_double2 = benchmark_utils::custom_type; - - // Add benchmarks - std::vector benchmarks = { - CREATE_BENCHMARKS(hipcub::Sum), - CREATE_BENCHMARK(custom_double2, hipcub::Sum), - CREATE_BENCHMARKS(hipcub::Min), +#define CREATE_BENCHMARK(T, REDUCE_OP) \ + benchmark::RegisterBenchmark(std::string("device_reduce" \ + ".") \ + .c_str(), \ + &Benchmark::run, \ + size, \ + stream) + +#define CREATE_BENCHMARKS(REDUCE_OP) \ + CREATE_BENCHMARK(int, REDUCE_OP), CREATE_BENCHMARK(long long, REDUCE_OP), \ + CREATE_BENCHMARK(float, REDUCE_OP), CREATE_BENCHMARK(double, REDUCE_OP), \ + CREATE_BENCHMARK(int8_t, REDUCE_OP) + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_reduce" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + using custom_double2 = benchmark_utils::custom_type; + + // Add benchmarks + std::vector benchmarks = { + CREATE_BENCHMARKS(hipcub::Sum), + CREATE_BENCHMARK(custom_double2, hipcub::Sum), + CREATE_BENCHMARKS(hipcub::Min), #ifdef HIPCUB_ROCPRIM_API - CREATE_BENCHMARK(custom_double2, hipcub::Min), + CREATE_BENCHMARK(custom_double2, hipcub::Min), #endif - CREATE_BENCHMARKS(hipcub::ArgMin), + CREATE_BENCHMARKS(hipcub::ArgMin), #ifdef HIPCUB_ROCPRIM_API - CREATE_BENCHMARK(custom_double2, hipcub::ArgMin), + CREATE_BENCHMARK(custom_double2, hipcub::ArgMin), #endif - }; - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); + }; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); - return 0; + return 0; } diff --git a/benchmark/benchmark_device_reduce_by_key.cpp b/benchmark/benchmark_device_reduce_by_key.cpp index 77fbd7ff..54209e65 100644 --- a/benchmark/benchmark_device_reduce_by_key.cpp +++ b/benchmark/benchmark_device_reduce_by_key.cpp @@ -23,7 +23,7 @@ // CUB's implementation of single_pass_scan_operators has maybe uninitialized // parameters, disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif #include "common_benchmark_header.hpp" @@ -35,177 +35,213 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template -void run_benchmark(benchmark::State &state, size_t max_length, - hipStream_t stream, size_t size, BinaryFunction reduce_op) { - using key_type = Key; - using value_type = Value; - - // Generate data - std::vector keys_input(size); - - unsigned int unique_count = 0; - std::vector key_counts = - benchmark_utils::get_random_data(100000, 1, max_length); - size_t offset = 0; - while (offset < size) { - const size_t key_count = key_counts[unique_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); - for (size_t i = offset; i < end; i++) { - keys_input[i] = unique_count; +template +void run_benchmark(benchmark::State& state, + size_t max_length, + hipStream_t stream, + size_t size, + BinaryFunction reduce_op) +{ + using key_type = Key; + using value_type = Value; + + // Generate data + std::vector keys_input(size); + + unsigned int unique_count = 0; + std::vector key_counts + = benchmark_utils::get_random_data(100000, 1, max_length); + size_t offset = 0; + while(offset < size) + { + const size_t key_count = key_counts[unique_count % key_counts.size()]; + const size_t end = std::min(size, offset + key_count); + for(size_t i = offset; i < end; i++) + { + keys_input[i] = unique_count; + } + + unique_count++; + offset += key_count; } - unique_count++; - offset += key_count; - } - - std::vector values_input(size); - std::iota(values_input.begin(), values_input.end(), 0); - - key_type *d_keys_input; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), - hipMemcpyHostToDevice)); - - value_type *d_values_input; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), - size * sizeof(value_type), hipMemcpyHostToDevice)); - - key_type *d_unique_output; - value_type *d_aggregates_output; - unsigned int *d_unique_count_output; - HIP_CHECK(hipMalloc(&d_unique_output, unique_count * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_aggregates_output, unique_count * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(unsigned int))); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - - HIP_CHECK(hipcub::DeviceReduce::ReduceByKey( - nullptr, temporary_storage_bytes, d_keys_input, d_unique_output, - d_values_input, d_aggregates_output, d_unique_count_output, reduce_op, - size, stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(hipcub::DeviceReduce::ReduceByKey( - d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_unique_output, d_values_input, d_aggregates_output, - d_unique_count_output, reduce_op, size, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(hipcub::DeviceReduce::ReduceByKey( - d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_unique_output, d_values_input, d_aggregates_output, - d_unique_count_output, reduce_op, size, stream)); + std::vector values_input(size); + std::iota(values_input.begin(), values_input.end(), 0); + + key_type* d_keys_input; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK( + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + size * sizeof(value_type), + hipMemcpyHostToDevice)); + + key_type* d_unique_output; + value_type* d_aggregates_output; + unsigned int* d_unique_count_output; + HIP_CHECK(hipMalloc(&d_unique_output, unique_count * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_aggregates_output, unique_count * sizeof(value_type))); + HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(unsigned int))); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + + HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(nullptr, + temporary_storage_bytes, + d_keys_input, + d_unique_output, + d_values_input, + d_aggregates_output, + d_unique_count_output, + reduce_op, + size, + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_unique_output, + d_values_input, + d_aggregates_output, + d_unique_count_output, + reduce_op, + size, + stream)); } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - (sizeof(key_type) + sizeof(value_type))); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_values_input)); - HIP_CHECK(hipFree(d_unique_output)); - HIP_CHECK(hipFree(d_aggregates_output)); - HIP_CHECK(hipFree(d_unique_count_output)); + HIP_CHECK(hipDeviceSynchronize()); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_unique_output, + d_values_input, + d_aggregates_output, + d_unique_count_output, + reduce_op, + size, + stream)); + } + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_unique_output)); + HIP_CHECK(hipFree(d_aggregates_output)); + HIP_CHECK(hipFree(d_unique_count_output)); } -#define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \ - benchmark::RegisterBenchmark(std::string("device_reduce_by_key" \ - "." \ - "(random_number_range:[1, " + \ - std::to_string(max_length) + "])") \ - .c_str(), \ - &run_benchmark, \ - max_length, stream, size, REDUCE_OP()) - -#define CREATE_BENCHMARKS(REDUCE_OP) \ - CREATE_BENCHMARK(int, float, REDUCE_OP), \ - CREATE_BENCHMARK(int, double, REDUCE_OP), \ - CREATE_BENCHMARK(int, custom_double2, REDUCE_OP), \ - CREATE_BENCHMARK(int8_t, int8_t, REDUCE_OP), \ - CREATE_BENCHMARK(long long, float, REDUCE_OP), \ - CREATE_BENCHMARK(long long, double, REDUCE_OP) - -void add_benchmarks(size_t max_length, - std::vector &benchmarks, - hipStream_t stream, size_t size) { - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = { - CREATE_BENCHMARKS(hipcub::Sum), - CREATE_BENCHMARK(long long, custom_double2, hipcub::Sum), - CREATE_BENCHMARKS(hipcub::Min), +#define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \ + benchmark::RegisterBenchmark(std::string("device_reduce_by_key" \ + "." \ + "(random_number_range:[1, " \ + + std::to_string(max_length) + "])") \ + .c_str(), \ + &run_benchmark, \ + max_length, \ + stream, \ + size, \ + REDUCE_OP()) + +#define CREATE_BENCHMARKS(REDUCE_OP) \ + CREATE_BENCHMARK(int, float, REDUCE_OP), CREATE_BENCHMARK(int, double, REDUCE_OP), \ + CREATE_BENCHMARK(int, custom_double2, REDUCE_OP), \ + CREATE_BENCHMARK(int8_t, int8_t, REDUCE_OP), \ + CREATE_BENCHMARK(long long, float, REDUCE_OP), \ + CREATE_BENCHMARK(long long, double, REDUCE_OP) + +void add_benchmarks(size_t max_length, + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + CREATE_BENCHMARKS(hipcub::Sum), + CREATE_BENCHMARK(long long, custom_double2, hipcub::Sum), + CREATE_BENCHMARKS(hipcub::Min), #ifdef HIPCUB_ROCPRIM_API - CREATE_BENCHMARK(long long, custom_double2, hipcub::Min), + CREATE_BENCHMARK(long long, custom_double2, hipcub::Min), #endif - }; + }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_reduce_by_key" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks(1000, benchmarks, stream, size); - add_benchmarks(10, benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_reduce_by_key" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks(1000, benchmarks, stream, size); + add_benchmarks(10, benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp index ffb41a53..267185c7 100644 --- a/benchmark/benchmark_device_run_length_encode.cpp +++ b/benchmark/benchmark_device_run_length_encode.cpp @@ -23,7 +23,7 @@ // CUB's implementation of DeviceRunLengthEncode has unused parameters, // disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ -#pragma GCC diagnostic ignored "-Wunused-parameter" + #pragma GCC diagnostic ignored "-Wunused-parameter" #endif #include "common_benchmark_header.hpp" @@ -35,271 +35,323 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -void run_encode_benchmark(benchmark::State &state, size_t max_length, - hipStream_t stream, size_t size) { - using key_type = T; - using count_type = unsigned int; - - // Generate data - std::vector input(size); - - unsigned int runs_count = 0; - std::vector key_counts = - benchmark_utils::get_random_data(100000, 1, max_length); - size_t offset = 0; - while (offset < size) { - const size_t key_count = key_counts[runs_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); - for (size_t i = offset; i < end; i++) { - input[i] = runs_count; +template +void run_encode_benchmark(benchmark::State& state, + size_t max_length, + hipStream_t stream, + size_t size) +{ + using key_type = T; + using count_type = unsigned int; + + // Generate data + std::vector input(size); + + unsigned int runs_count = 0; + std::vector key_counts + = benchmark_utils::get_random_data(100000, 1, max_length); + size_t offset = 0; + while(offset < size) + { + const size_t key_count = key_counts[runs_count % key_counts.size()]; + const size_t end = std::min(size, offset + key_count); + for(size_t i = offset; i < end; i++) + { + input[i] = runs_count; + } + + runs_count++; + offset += key_count; } - runs_count++; - offset += key_count; - } - - key_type *d_input; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), - hipMemcpyHostToDevice)); - - key_type *d_unique_output; - count_type *d_counts_output; - count_type *d_runs_count_output; - HIP_CHECK(hipMalloc(&d_unique_output, runs_count * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); - HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - - HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode( - nullptr, temporary_storage_bytes, d_input, d_unique_output, - d_counts_output, d_runs_count_output, size, stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < 10; i++) { - HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode( - d_temporary_storage, temporary_storage_bytes, d_input, d_unique_output, - d_counts_output, d_runs_count_output, size, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - const unsigned int batch_size = 10; - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - hipcub::DeviceRunLengthEncode::Encode( - d_temporary_storage, temporary_storage_bytes, d_input, - d_unique_output, d_counts_output, d_runs_count_output, size, stream); + key_type* d_input; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + key_type* d_unique_output; + count_type* d_counts_output; + count_type* d_runs_count_output; + HIP_CHECK(hipMalloc(&d_unique_output, runs_count * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); + HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + + HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode(nullptr, + temporary_storage_bytes, + d_input, + d_unique_output, + d_counts_output, + d_runs_count_output, + size, + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < 10; i++) + { + HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_unique_output, + d_counts_output, + d_runs_count_output, + size, + stream)); } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - sizeof(key_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_unique_output)); - HIP_CHECK(hipFree(d_counts_output)); - HIP_CHECK(hipFree(d_runs_count_output)); + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + hipcub::DeviceRunLengthEncode::Encode(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_unique_output, + d_counts_output, + d_runs_count_output, + size, + stream); + } + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_unique_output)); + HIP_CHECK(hipFree(d_counts_output)); + HIP_CHECK(hipFree(d_runs_count_output)); } -template -void run_non_trivial_runs_benchmark(benchmark::State &state, size_t max_length, - hipStream_t stream, size_t size) { - using key_type = T; - using offset_type = unsigned int; - using count_type = unsigned int; - - // Generate data - std::vector input(size); - - unsigned int runs_count = 0; - std::vector key_counts = - benchmark_utils::get_random_data(100000, 1, max_length); - size_t offset = 0; - while (offset < size) { - const size_t key_count = key_counts[runs_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); - for (size_t i = offset; i < end; i++) { - input[i] = runs_count; +template +void run_non_trivial_runs_benchmark(benchmark::State& state, + size_t max_length, + hipStream_t stream, + size_t size) +{ + using key_type = T; + using offset_type = unsigned int; + using count_type = unsigned int; + + // Generate data + std::vector input(size); + + unsigned int runs_count = 0; + std::vector key_counts + = benchmark_utils::get_random_data(100000, 1, max_length); + size_t offset = 0; + while(offset < size) + { + const size_t key_count = key_counts[runs_count % key_counts.size()]; + const size_t end = std::min(size, offset + key_count); + for(size_t i = offset; i < end; i++) + { + input[i] = runs_count; + } + + runs_count++; + offset += key_count; } - runs_count++; - offset += key_count; - } - - key_type *d_input; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), - hipMemcpyHostToDevice)); - - offset_type *d_offsets_output; - count_type *d_counts_output; - count_type *d_runs_count_output; - HIP_CHECK(hipMalloc(&d_offsets_output, runs_count * sizeof(offset_type))); - HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); - HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - - HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns( - nullptr, temporary_storage_bytes, d_input, d_offsets_output, - d_counts_output, d_runs_count_output, size, stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < 10; i++) { - HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns( - d_temporary_storage, temporary_storage_bytes, d_input, d_offsets_output, - d_counts_output, d_runs_count_output, size, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - const unsigned int batch_size = 10; - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - hipcub::DeviceRunLengthEncode::NonTrivialRuns( - d_temporary_storage, temporary_storage_bytes, d_input, - d_offsets_output, d_counts_output, d_runs_count_output, size, stream); + key_type* d_input; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + offset_type* d_offsets_output; + count_type* d_counts_output; + count_type* d_runs_count_output; + HIP_CHECK(hipMalloc(&d_offsets_output, runs_count * sizeof(offset_type))); + HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); + HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + + HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(nullptr, + temporary_storage_bytes, + d_input, + d_offsets_output, + d_counts_output, + d_runs_count_output, + size, + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < 10; i++) + { + HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_offsets_output, + d_counts_output, + d_runs_count_output, + size, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + hipcub::DeviceRunLengthEncode::NonTrivialRuns(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_offsets_output, + d_counts_output, + d_runs_count_output, + size, + stream); + } + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - sizeof(key_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_offsets_output)); - HIP_CHECK(hipFree(d_counts_output)); - HIP_CHECK(hipFree(d_runs_count_output)); + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_offsets_output)); + HIP_CHECK(hipFree(d_counts_output)); + HIP_CHECK(hipFree(d_runs_count_output)); } -#define CREATE_ENCODE_BENCHMARK(T) \ - benchmark::RegisterBenchmark(std::string("device_run_length_encode" \ - "." \ - "(random_number_range:[1, " + \ - std::to_string(max_length) + "])") \ - .c_str(), \ - &run_encode_benchmark, max_length, stream, \ - size) - -void add_encode_benchmarks( - size_t max_length, - std::vector &benchmarks, - hipStream_t stream, size_t size) { - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = { - CREATE_ENCODE_BENCHMARK(int), - CREATE_ENCODE_BENCHMARK(long long), - - CREATE_ENCODE_BENCHMARK(int8_t), - CREATE_ENCODE_BENCHMARK(uint8_t), - - CREATE_ENCODE_BENCHMARK(custom_float2), - CREATE_ENCODE_BENCHMARK(custom_double2), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_ENCODE_BENCHMARK(T) \ + benchmark::RegisterBenchmark(std::string("device_run_length_encode" \ + "." \ + "(random_number_range:[1, " \ + + std::to_string(max_length) + "])") \ + .c_str(), \ + &run_encode_benchmark, \ + max_length, \ + stream, \ + size) + +void add_encode_benchmarks(size_t max_length, + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + CREATE_ENCODE_BENCHMARK(int), + CREATE_ENCODE_BENCHMARK(long long), + + CREATE_ENCODE_BENCHMARK(int8_t), + CREATE_ENCODE_BENCHMARK(uint8_t), + + CREATE_ENCODE_BENCHMARK(custom_float2), + CREATE_ENCODE_BENCHMARK(custom_double2), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ - benchmark::RegisterBenchmark( \ - std::string("run_length_encode_non_trivial_runs" \ - "" \ - "(random_number_range:[1, " + \ - std::to_string(max_length) + "])") \ - .c_str(), \ - &run_non_trivial_runs_benchmark, max_length, stream, size) - -void add_non_trivial_runs_benchmarks( - size_t max_length, - std::vector &benchmarks, - hipStream_t stream, size_t size) { - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = { - CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int), - CREATE_NON_TRIVIAL_RUNS_BENCHMARK(long long), - - CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int8_t), - CREATE_NON_TRIVIAL_RUNS_BENCHMARK(uint8_t), - - CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_float2), - CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_double2), - }; - - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ + benchmark::RegisterBenchmark(std::string("run_length_encode_non_trivial_runs" \ + "" \ + "(random_number_range:[1, " \ + + std::to_string(max_length) + "])") \ + .c_str(), \ + &run_non_trivial_runs_benchmark, \ + max_length, \ + stream, \ + size) + +void add_non_trivial_runs_benchmarks(size_t max_length, + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int), + CREATE_NON_TRIVIAL_RUNS_BENCHMARK(long long), + + CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int8_t), + CREATE_NON_TRIVIAL_RUNS_BENCHMARK(uint8_t), + + CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_float2), + CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_double2), + }; + + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_run_length_encode" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_encode_benchmarks(1000, benchmarks, stream, size); - add_encode_benchmarks(10, benchmarks, stream, size); - add_non_trivial_runs_benchmarks(1000, benchmarks, stream, size); - add_non_trivial_runs_benchmarks(10, benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_run_length_encode" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_encode_benchmarks(1000, benchmarks, stream, size); + add_encode_benchmarks(10, benchmarks, stream, size); + add_non_trivial_runs_benchmarks(1000, benchmarks, stream, size); + add_non_trivial_runs_benchmarks(10, benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_scan.cpp b/benchmark/benchmark_device_scan.cpp index f7656c9f..dbfdda6a 100644 --- a/benchmark/benchmark_device_scan.cpp +++ b/benchmark/benchmark_device_scan.cpp @@ -23,7 +23,7 @@ // CUB's implementation of single_pass_scan_operators has maybe uninitialized // parameters, disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif #include "common_benchmark_header.hpp" @@ -35,251 +35,342 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -auto run_device_scan(void *temporary_storage, size_t &storage_size, T *input, - T *output, const T initial_value, const size_t input_size, - BinaryFunction scan_op, const hipStream_t stream) -> - typename std::enable_if::type { - return hipcub::DeviceScan::ExclusiveScan(temporary_storage, storage_size, - input, output, scan_op, - initial_value, input_size, stream); +template +auto run_device_scan(void* temporary_storage, + size_t& storage_size, + T* input, + T* output, + const T initial_value, + const size_t input_size, + BinaryFunction scan_op, + const hipStream_t stream) -> + typename std::enable_if::type +{ + return hipcub::DeviceScan::ExclusiveScan(temporary_storage, + storage_size, + input, + output, + scan_op, + initial_value, + input_size, + stream); } -template -auto run_device_scan(void *temporary_storage, size_t &storage_size, T *input, - T *output, const T initial_value, const size_t input_size, - BinaryFunction scan_op, const hipStream_t stream) -> - typename std::enable_if::type { - (void)initial_value; - return hipcub::DeviceScan::InclusiveScan(temporary_storage, storage_size, - input, output, scan_op, input_size, - stream); +template +auto run_device_scan(void* temporary_storage, + size_t& storage_size, + T* input, + T* output, + const T initial_value, + const size_t input_size, + BinaryFunction scan_op, + const hipStream_t stream) -> + typename std::enable_if::type +{ + (void)initial_value; + return hipcub::DeviceScan::InclusiveScan(temporary_storage, + storage_size, + input, + output, + scan_op, + input_size, + stream); } -template -auto run_device_scan_by_key(void *temporary_storage, size_t &storage_size, - K *keys, T *input, T *output, const T initial_value, - const size_t input_size, BinaryFunction scan_op, +template +auto run_device_scan_by_key(void* temporary_storage, + size_t& storage_size, + K* keys, + T* input, + T* output, + const T initial_value, + const size_t input_size, + BinaryFunction scan_op, const hipStream_t stream) -> - typename std::enable_if::type { - return hipcub::DeviceScan::ExclusiveScanByKey( - temporary_storage, storage_size, keys, input, output, scan_op, - initial_value, static_cast(input_size), hipcub::Equality(), stream); + typename std::enable_if::type +{ + return hipcub::DeviceScan::ExclusiveScanByKey(temporary_storage, + storage_size, + keys, + input, + output, + scan_op, + initial_value, + static_cast(input_size), + hipcub::Equality(), + stream); } -template -auto run_device_scan_by_key(void *temporary_storage, size_t &storage_size, - K *keys, T *input, T *output, - const T /*initial_value*/, const size_t input_size, - BinaryFunction scan_op, const hipStream_t stream) -> - typename std::enable_if::type { - return hipcub::DeviceScan::InclusiveScanByKey( - temporary_storage, storage_size, keys, input, output, scan_op, - static_cast(input_size), hipcub::Equality(), stream); +template +auto run_device_scan_by_key(void* temporary_storage, + size_t& storage_size, + K* keys, + T* input, + T* output, + const T /*initial_value*/, + const size_t input_size, + BinaryFunction scan_op, + const hipStream_t stream) -> + typename std::enable_if::type +{ + return hipcub::DeviceScan::InclusiveScanByKey(temporary_storage, + storage_size, + keys, + input, + output, + scan_op, + static_cast(input_size), + hipcub::Equality(), + stream); } -template -void run_benchmark(benchmark::State &state, size_t size, - const hipStream_t stream, BinaryFunction scan_op) { - std::vector input = - benchmark_utils::get_random_data(size, T(0), T(1000)); - T initial_value = T(123); - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes = 0; - void *d_temp_storage = nullptr; - // Get size of d_temp_storage - HIP_CHECK((run_device_scan(d_temp_storage, temp_storage_size_bytes, - d_input, d_output, initial_value, size, - scan_op, stream))); - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < 5; i++) { - HIP_CHECK((run_device_scan( - d_temp_storage, temp_storage_size_bytes, d_input, d_output, - initial_value, size, scan_op, stream))); - } - HIP_CHECK(hipDeviceSynchronize()); - - const unsigned int batch_size = 10; - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK((run_device_scan( - d_temp_storage, temp_storage_size_bytes, d_input, d_output, - initial_value, size, scan_op, stream))); +template +void run_benchmark(benchmark::State& state, + size_t size, + const hipStream_t stream, + BinaryFunction scan_op) +{ + std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); + T initial_value = T(123); + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes = 0; + void* d_temp_storage = nullptr; + // Get size of d_temp_storage + HIP_CHECK((run_device_scan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + initial_value, + size, + scan_op, + stream))); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < 5; i++) + { + HIP_CHECK((run_device_scan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + initial_value, + size, + scan_op, + stream))); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK((run_device_scan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + initial_value, + size, + scan_op, + stream))); + } + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_temp_storage)); + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temp_storage)); } -template -void run_benchmark_by_key(benchmark::State &state, size_t size, - const hipStream_t stream, BinaryFunction scan_op) { - using key_type = int; - constexpr size_t max_segment_length = 100; - - const std::vector keys = - benchmark_utils::get_random_segments(size, max_segment_length, - std::random_device{}()); - const std::vector input = - benchmark_utils::get_random_data(size, T(0), T(1000)); - const T initial_value = T(123); - key_type *d_keys; - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(&d_keys, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_keys, keys.data(), size * sizeof(key_type), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes = 0; - void *d_temp_storage = nullptr; - // Get size of d_temp_storage - HIP_CHECK((run_device_scan_by_key( - d_temp_storage, temp_storage_size_bytes, d_keys, d_input, d_output, - initial_value, size, scan_op, stream))); - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < 5; i++) { - HIP_CHECK((run_device_scan_by_key( - d_temp_storage, temp_storage_size_bytes, d_keys, d_input, d_output, - initial_value, size, scan_op, stream))); - } - HIP_CHECK(hipDeviceSynchronize()); - - const unsigned int batch_size = 10; - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK((run_device_scan_by_key( - d_temp_storage, temp_storage_size_bytes, d_keys, d_input, d_output, - initial_value, size, scan_op, stream))); +template +void run_benchmark_by_key(benchmark::State& state, + size_t size, + const hipStream_t stream, + BinaryFunction scan_op) +{ + using key_type = int; + constexpr size_t max_segment_length = 100; + + const std::vector keys + = benchmark_utils::get_random_segments(size, + max_segment_length, + std::random_device{}()); + const std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); + const T initial_value = T(123); + key_type* d_keys; + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(&d_keys, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_keys, keys.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes = 0; + void* d_temp_storage = nullptr; + // Get size of d_temp_storage + HIP_CHECK((run_device_scan_by_key(d_temp_storage, + temp_storage_size_bytes, + d_keys, + d_input, + d_output, + initial_value, + size, + scan_op, + stream))); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < 5; i++) + { + HIP_CHECK((run_device_scan_by_key(d_temp_storage, + temp_storage_size_bytes, + d_keys, + d_input, + d_output, + initial_value, + size, + scan_op, + stream))); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK((run_device_scan_by_key(d_temp_storage, + temp_storage_size_bytes, + d_keys, + d_input, + d_output, + initial_value, + size, + scan_op, + stream))); + } + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_keys)); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_temp_storage)); + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_keys)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ - benchmark::RegisterBenchmark( \ - std::string(std::string(EXCL ? "device_exclusive_scan" \ - : "device_inclusive_scan") + \ - ".") \ - .c_str(), \ - &run_benchmark, size, stream, SCAN_OP()), \ - benchmark::RegisterBenchmark( \ - std::string(std::string(EXCL ? "device_exclusive_scan_by_key" \ - : "device_inclusive_scan_by_key") + \ - ".") \ - .c_str(), \ - &run_benchmark_by_key, size, stream, SCAN_OP()) - -#define CREATE_BENCHMARKS(SCAN_OP) \ - CREATE_BENCHMARK(false, int, SCAN_OP), CREATE_BENCHMARK(true, int, SCAN_OP), \ - CREATE_BENCHMARK(false, float, SCAN_OP), \ - CREATE_BENCHMARK(true, float, SCAN_OP), \ - CREATE_BENCHMARK(false, double, SCAN_OP), \ - CREATE_BENCHMARK(true, double, SCAN_OP), \ - CREATE_BENCHMARK(false, long long, SCAN_OP), \ - CREATE_BENCHMARK(true, long long, SCAN_OP), \ - CREATE_BENCHMARK(false, custom_float2, SCAN_OP), \ - CREATE_BENCHMARK(true, custom_float2, SCAN_OP), \ - CREATE_BENCHMARK(false, custom_double2, SCAN_OP), \ - CREATE_BENCHMARK(true, custom_double2, SCAN_OP), \ - CREATE_BENCHMARK(false, int8_t, SCAN_OP), \ - CREATE_BENCHMARK(true, int8_t, SCAN_OP), \ - CREATE_BENCHMARK(false, uint8_t, SCAN_OP), \ - CREATE_BENCHMARK(true, uint8_t, SCAN_OP) - -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_scan" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - using custom_double2 = benchmark_utils::custom_type; - using custom_float2 = benchmark_utils::custom_type; - - // Compilation may never finish, if the compiler needs to compile too many - // kernels, it is recommended to compile benchmarks only for 1-2 types when - // BENCHMARK_CONFIG_TUNING is used (all other CREATE_*_BENCHMARK should be - // commented/removed). - - // Add benchmarks - std::vector benchmarks = { - CREATE_BENCHMARKS(hipcub::Sum), - CREATE_BENCHMARKS(hipcub::Min), - }; - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ + benchmark::RegisterBenchmark( \ + std::string(std::string(EXCL ? "device_exclusive_scan" : "device_inclusive_scan") \ + + ".") \ + .c_str(), \ + &run_benchmark, \ + size, \ + stream, \ + SCAN_OP()), \ + benchmark::RegisterBenchmark( \ + std::string(std::string(EXCL ? "device_exclusive_scan_by_key" \ + : "device_inclusive_scan_by_key") \ + + ".") \ + .c_str(), \ + &run_benchmark_by_key, \ + size, \ + stream, \ + SCAN_OP()) + +#define CREATE_BENCHMARKS(SCAN_OP) \ + CREATE_BENCHMARK(false, int, SCAN_OP), CREATE_BENCHMARK(true, int, SCAN_OP), \ + CREATE_BENCHMARK(false, float, SCAN_OP), CREATE_BENCHMARK(true, float, SCAN_OP), \ + CREATE_BENCHMARK(false, double, SCAN_OP), CREATE_BENCHMARK(true, double, SCAN_OP), \ + CREATE_BENCHMARK(false, long long, SCAN_OP), CREATE_BENCHMARK(true, long long, SCAN_OP), \ + CREATE_BENCHMARK(false, custom_float2, SCAN_OP), \ + CREATE_BENCHMARK(true, custom_float2, SCAN_OP), \ + CREATE_BENCHMARK(false, custom_double2, SCAN_OP), \ + CREATE_BENCHMARK(true, custom_double2, SCAN_OP), CREATE_BENCHMARK(false, int8_t, SCAN_OP), \ + CREATE_BENCHMARK(true, int8_t, SCAN_OP), CREATE_BENCHMARK(false, uint8_t, SCAN_OP), \ + CREATE_BENCHMARK(true, uint8_t, SCAN_OP) + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_scan" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + using custom_double2 = benchmark_utils::custom_type; + using custom_float2 = benchmark_utils::custom_type; + + // Compilation may never finish, if the compiler needs to compile too many + // kernels, it is recommended to compile benchmarks only for 1-2 types when + // BENCHMARK_CONFIG_TUNING is used (all other CREATE_*_BENCHMARK should be + // commented/removed). + + // Add benchmarks + std::vector benchmarks = { + CREATE_BENCHMARKS(hipcub::Sum), + CREATE_BENCHMARKS(hipcub::Min), + }; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); - return 0; + return 0; } diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index 548266be..ad7f3075 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -29,383 +29,459 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -const unsigned int batch_size = 4; +const unsigned int batch_size = 4; const unsigned int warmup_size = 2; -constexpr bool Ascending = false; +constexpr bool Ascending = false; constexpr bool Descending = true; -template -void run_sort_keys_benchmark(benchmark::State &state, size_t desired_segments, - hipStream_t stream, size_t size, - bool descending = false) { - using offset_type = int; - using key_type = Key; - typedef hipError_t (*sort_func)(void *, size_t &, const key_type *, - key_type *, int, int, offset_type *, - offset_type *, int, int, hipStream_t); - - sort_func func_ascending = - &hipcub::DeviceSegmentedRadixSort::SortKeys; - sort_func func_descending = - &hipcub::DeviceSegmentedRadixSort::SortKeysDescending; - - sort_func sorting = descending ? func_descending : func_ascending; - - // Generate data - std::vector offsets; - - const double avg_segment_length = - static_cast(size) / desired_segments; - - const unsigned int seed = 123; - std::default_random_engine gen(seed); - - std::uniform_real_distribution segment_length_dis( - 0, avg_segment_length * 2); - - unsigned int segments_count = 0; - size_t offset = 0; - while (offset < size) { - const size_t segment_length = std::round(segment_length_dis(gen)); - offsets.push_back(offset); - segments_count++; - offset += segment_length; - } - offsets.push_back(size); - - std::vector keys_input; - if (std::is_floating_point::value) { - keys_input = benchmark_utils::get_random_data( - size, (key_type)-1000, (key_type) + 1000); - } else { - keys_input = benchmark_utils::get_random_data( - size, std::numeric_limits::min(), - std::numeric_limits::max()); - } - - offset_type *d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice)); - - key_type *d_keys_input; - key_type *d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), - hipMemcpyHostToDevice)); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_keys_output, size, segments_count, d_offsets, - d_offsets + 1, 0, sizeof(key_type) * 8, stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, segments_count, - d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, +template +void run_sort_keys_benchmark(benchmark::State& state, + size_t desired_segments, + hipStream_t stream, + size_t size, + bool descending = false) +{ + using offset_type = int; + using key_type = Key; + typedef hipError_t (*sort_func)(void*, + size_t&, + const key_type*, + key_type*, + int, + int, + offset_type*, + offset_type*, + int, + int, + hipStream_t); + + sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortKeys; + sort_func func_descending + = &hipcub::DeviceSegmentedRadixSort::SortKeysDescending; + + sort_func sorting = descending ? func_descending : func_ascending; + + // Generate data + std::vector offsets; + + const double avg_segment_length = static_cast(size) / desired_segments; + + const unsigned int seed = 123; + std::default_random_engine gen(seed); + + std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); + + unsigned int segments_count = 0; + size_t offset = 0; + while(offset < size) + { + const size_t segment_length = std::round(segment_length_dis(gen)); + offsets.push_back(offset); + segments_count++; + offset += segment_length; + } + offsets.push_back(size); + + std::vector keys_input; + if(std::is_floating_point::value) + { + keys_input + = benchmark_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); + } else + { + keys_input + = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); + } + + offset_type* d_offsets; + HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type* d_keys_input; + key_type* d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK( + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(sorting(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + 0, + sizeof(key_type) * 8, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, segments_count, - d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, - stream)); + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(sorting(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + 0, + sizeof(key_type) * 8, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - sizeof(key_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_offsets)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(sorting(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + 0, + sizeof(key_type) * 8, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_offsets)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); } -template -void run_sort_pairs_benchmark(benchmark::State &state, size_t desired_segments, - hipStream_t stream, size_t size, - bool descending = false) { - using offset_type = int; - using key_type = Key; - using value_type = Value; - typedef hipError_t (*sort_func)(void *, size_t &, const key_type *, - key_type *, const value_type *, value_type *, - int, int, offset_type *, offset_type *, int, - int, hipStream_t); - - sort_func func_ascending = - &hipcub::DeviceSegmentedRadixSort::SortPairs; - sort_func func_descending = - &hipcub::DeviceSegmentedRadixSort::SortPairsDescending< - key_type, value_type, offset_type *>; - - sort_func sorting = descending ? func_descending : func_ascending; - - // Generate data - std::vector offsets; - - const double avg_segment_length = - static_cast(size) / desired_segments; - - const unsigned int seed = 123; - std::default_random_engine gen(seed); - - std::uniform_real_distribution segment_length_dis( - 0, avg_segment_length * 2); - - unsigned int segments_count = 0; - size_t offset = 0; - while (offset < size) { - const size_t segment_length = std::round(segment_length_dis(gen)); - offsets.push_back(offset); - segments_count++; - offset += segment_length; - } - offsets.push_back(size); - - std::vector keys_input; - if (std::is_floating_point::value) { - keys_input = benchmark_utils::get_random_data( - size, (key_type)-1000, (key_type) + 1000); - } else { - keys_input = benchmark_utils::get_random_data( - size, std::numeric_limits::min(), - std::numeric_limits::max()); - } - - std::vector values_input(size); - std::iota(values_input.begin(), values_input.end(), 0); - - offset_type *d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice)); - - key_type *d_keys_input; - key_type *d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), - hipMemcpyHostToDevice)); - - value_type *d_values_input; - value_type *d_values_output; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), - size * sizeof(value_type), hipMemcpyHostToDevice)); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_keys_output, d_values_input, d_values_output, size, - segments_count, d_offsets, d_offsets + 1, 0, - sizeof(key_type) * 8, stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, - d_values_output, size, segments_count, d_offsets, - d_offsets + 1, 0, sizeof(key_type) * 8, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, - d_values_output, size, segments_count, d_offsets, - d_offsets + 1, 0, sizeof(key_type) * 8, stream)); +template +void run_sort_pairs_benchmark(benchmark::State& state, + size_t desired_segments, + hipStream_t stream, + size_t size, + bool descending = false) +{ + using offset_type = int; + using key_type = Key; + using value_type = Value; + typedef hipError_t (*sort_func)(void*, + size_t&, + const key_type*, + key_type*, + const value_type*, + value_type*, + int, + int, + offset_type*, + offset_type*, + int, + int, + hipStream_t); + + sort_func func_ascending + = &hipcub::DeviceSegmentedRadixSort::SortPairs; + sort_func func_descending = &hipcub::DeviceSegmentedRadixSort:: + SortPairsDescending; + + sort_func sorting = descending ? func_descending : func_ascending; + + // Generate data + std::vector offsets; + + const double avg_segment_length = static_cast(size) / desired_segments; + + const unsigned int seed = 123; + std::default_random_engine gen(seed); + + std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); + + unsigned int segments_count = 0; + size_t offset = 0; + while(offset < size) + { + const size_t segment_length = std::round(segment_length_dis(gen)); + offsets.push_back(offset); + segments_count++; + offset += segment_length; + } + offsets.push_back(size); + + std::vector keys_input; + if(std::is_floating_point::value) + { + keys_input + = benchmark_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); + } else + { + keys_input + = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); + } + + std::vector values_input(size); + std::iota(values_input.begin(), values_input.end(), 0); + + offset_type* d_offsets; + HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type* d_keys_input; + key_type* d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK( + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; + HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + size * sizeof(value_type), + hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(sorting(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + 0, + sizeof(key_type) * 8, + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(sorting(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + 0, + sizeof(key_type) * 8, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - (sizeof(key_type) + sizeof(value_type))); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_offsets)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); - HIP_CHECK(hipFree(d_values_input)); - HIP_CHECK(hipFree(d_values_output)); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(sorting(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + 0, + sizeof(key_type) * 8, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_offsets)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_radix_sort_keys" \ - "." \ - "(segments:~" + \ - std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_keys_benchmark(state, SEGMENTS, stream, size, \ - Ascending); \ - }) - -#define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_radix_sort_keys" \ - "." \ - "(segments:~" + \ - std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_keys_benchmark(state, SEGMENTS, stream, size, \ - Descending); \ - }) - -#define BENCHMARK_KEY_TYPE(type) \ - CREATE_SORT_KEYS_BENCHMARK(type, 1), CREATE_SORT_KEYS_BENCHMARK(type, 10), \ - CREATE_SORT_KEYS_BENCHMARK(type, 100), \ - CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ - CREATE_SORT_KEYS_BENCHMARK(type, 10000), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 100), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1000), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10000) - -void add_sort_keys_benchmarks( - std::vector &benchmarks, - hipStream_t stream, size_t size) { - std::vector bs = { - BENCHMARK_KEY_TYPE(float), BENCHMARK_KEY_TYPE(double), - BENCHMARK_KEY_TYPE(int8_t), BENCHMARK_KEY_TYPE(uint8_t), - BENCHMARK_KEY_TYPE(int), - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_radix_sort_keys" \ + "." \ + "(segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, SEGMENTS, stream, size, Ascending); }) + +#define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_radix_sort_keys" \ + "." \ + "(segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, SEGMENTS, stream, size, Descending); }) + +#define BENCHMARK_KEY_TYPE(type) \ + CREATE_SORT_KEYS_BENCHMARK(type, 1), CREATE_SORT_KEYS_BENCHMARK(type, 10), \ + CREATE_SORT_KEYS_BENCHMARK(type, 100), CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ + CREATE_SORT_KEYS_BENCHMARK(type, 10000), CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 100), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1000), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10000) + +void add_sort_keys_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + std::vector bs = { + BENCHMARK_KEY_TYPE(float), + BENCHMARK_KEY_TYPE(double), + BENCHMARK_KEY_TYPE(int8_t), + BENCHMARK_KEY_TYPE(uint8_t), + BENCHMARK_KEY_TYPE(int), + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_radix_sort_pairs" \ - "." \ - "(segments:~" + \ - std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size, \ - Ascending); \ - }) - -#define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_radix_sort_pairs" \ - "." \ - "(segments:~" + \ - std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size, \ - Descending); \ - }) - -#define BENCHMARK_PAIR_TYPE(type, value) \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 1), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 1000), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 100), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1000), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10000) - -void add_sort_pairs_benchmarks( - std::vector &benchmarks, - hipStream_t stream, size_t size) { - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = { - BENCHMARK_PAIR_TYPE(int, float), - BENCHMARK_PAIR_TYPE(long long, double), - BENCHMARK_PAIR_TYPE(int8_t, int8_t), - BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), - BENCHMARK_PAIR_TYPE(int, custom_float2), - BENCHMARK_PAIR_TYPE(long long, custom_double2), - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_radix_sort_pairs" \ + "." \ + "(segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Ascending); }) + +#define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_radix_sort_pairs" \ + "." \ + "(segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Descending); }) + +#define BENCHMARK_PAIR_TYPE(type, value) \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 1), CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 1000), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 100), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1000), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10000) + +void add_sort_pairs_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + BENCHMARK_PAIR_TYPE(int, float), + BENCHMARK_PAIR_TYPE(long long, double), + BENCHMARK_PAIR_TYPE(int8_t, int8_t), + BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), + BENCHMARK_PAIR_TYPE(int, custom_float2), + BENCHMARK_PAIR_TYPE(long long, custom_double2), + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_segmented_radix_sort" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_sort_keys_benchmarks(benchmarks, stream, size); - add_sort_pairs_benchmarks(benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_segmented_radix_sort" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_sort_keys_benchmarks(benchmarks, stream, size); + add_sort_pairs_benchmarks(benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp index e008bf41..d1e40c67 100644 --- a/benchmark/benchmark_device_segmented_reduce.cpp +++ b/benchmark/benchmark_device_segmented_reduce.cpp @@ -29,212 +29,253 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; using OffsetType = int; -template -void run_benchmark(benchmark::State &state, size_t desired_segments, - hipStream_t stream, size_t size, - SegmentedReduceKernel segmented_reduce) { - using value_type = T; - - // Generate data - const unsigned int seed = 123; - std::default_random_engine gen(seed); - - const double avg_segment_length = - static_cast(size) / desired_segments; - std::uniform_real_distribution segment_length_dis( - 0, avg_segment_length * 2); - - std::vector offsets; - unsigned int segments_count = 0; - size_t offset = 0; - while (offset < size) { - const size_t segment_length = std::round(segment_length_dis(gen)); - offsets.push_back(offset); - segments_count++; - offset += segment_length; - } - offsets.push_back(size); - - std::vector values_input(size); - std::iota(values_input.begin(), values_input.end(), 0); - - OffsetType *d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(OffsetType))); - HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), - (segments_count + 1) * sizeof(OffsetType), - hipMemcpyHostToDevice)); - - value_type *d_values_input; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), - size * sizeof(value_type), hipMemcpyHostToDevice)); - - OutputT *d_aggregates_output; - HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(OutputT))); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - - HIP_CHECK(segmented_reduce(d_temporary_storage, temporary_storage_bytes, - d_values_input, d_aggregates_output, - segments_count, d_offsets, d_offsets + 1, stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(segmented_reduce( - d_temporary_storage, temporary_storage_bytes, d_values_input, - d_aggregates_output, segments_count, d_offsets, d_offsets + 1, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(segmented_reduce(d_temporary_storage, temporary_storage_bytes, - d_values_input, d_aggregates_output, - segments_count, d_offsets, d_offsets + 1, - stream)); +template +void run_benchmark(benchmark::State& state, + size_t desired_segments, + hipStream_t stream, + size_t size, + SegmentedReduceKernel segmented_reduce) +{ + using value_type = T; + + // Generate data + const unsigned int seed = 123; + std::default_random_engine gen(seed); + + const double avg_segment_length = static_cast(size) / desired_segments; + std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); + + std::vector offsets; + unsigned int segments_count = 0; + size_t offset = 0; + while(offset < size) + { + const size_t segment_length = std::round(segment_length_dis(gen)); + offsets.push_back(offset); + segments_count++; + offset += segment_length; } - HIP_CHECK(hipStreamSynchronize(stream)); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - sizeof(value_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_offsets)); - HIP_CHECK(hipFree(d_values_input)); - HIP_CHECK(hipFree(d_aggregates_output)); -} + offsets.push_back(size); + + std::vector values_input(size); + std::iota(values_input.begin(), values_input.end(), 0); + + OffsetType* d_offsets; + HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(OffsetType))); + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(OffsetType), + hipMemcpyHostToDevice)); + + value_type* d_values_input; + HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + size * sizeof(value_type), + hipMemcpyHostToDevice)); + + OutputT* d_aggregates_output; + HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(OutputT))); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + + HIP_CHECK(segmented_reduce(d_temporary_storage, + temporary_storage_bytes, + d_values_input, + d_aggregates_output, + segments_count, + d_offsets, + d_offsets + 1, + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(segmented_reduce(d_temporary_storage, + temporary_storage_bytes, + d_values_input, + d_aggregates_output, + segments_count, + d_offsets, + d_offsets + 1, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(segmented_reduce(d_temporary_storage, + temporary_storage_bytes, + d_values_input, + d_aggregates_output, + segments_count, + d_offsets, + d_offsets + 1, + stream)); + } + HIP_CHECK(hipStreamSynchronize(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(value_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); -template struct Benchmark; + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_offsets)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_aggregates_output)); +} -template struct Benchmark { - static void run(benchmark::State &state, size_t desired_segments, - const hipStream_t stream, size_t size) { - hipError_t (*ptr_to_sum)(void *, size_t &, T *, T *, int, OffsetType *, - OffsetType *, hipStream_t) = - &hipcub::DeviceSegmentedReduce::Sum; - run_benchmark(state, desired_segments, stream, size, ptr_to_sum); - } +template +struct Benchmark; + +template +struct Benchmark +{ + static void + run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) + { + hipError_t (*ptr_to_sum)(void*, size_t&, T*, T*, int, OffsetType*, OffsetType*, hipStream_t) + = &hipcub::DeviceSegmentedReduce::Sum; + run_benchmark(state, desired_segments, stream, size, ptr_to_sum); + } }; -template struct Benchmark { - static void run(benchmark::State &state, size_t desired_segments, - const hipStream_t stream, size_t size) { - hipError_t (*ptr_to_min)(void *, size_t &, T *, T *, int, OffsetType *, - OffsetType *, hipStream_t) = - &hipcub::DeviceSegmentedReduce::Min; - run_benchmark(state, desired_segments, stream, size, ptr_to_min); - } +template +struct Benchmark +{ + static void + run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) + { + hipError_t (*ptr_to_min)(void*, size_t&, T*, T*, int, OffsetType*, OffsetType*, hipStream_t) + = &hipcub::DeviceSegmentedReduce::Min; + run_benchmark(state, desired_segments, stream, size, ptr_to_min); + } }; -template struct Benchmark { - using Difference = OffsetType; - using Iterator = typename hipcub::ArgIndexInputIterator; - using KeyValue = typename Iterator::value_type; - - static void run(benchmark::State &state, size_t desired_segments, - const hipStream_t stream, size_t size) { - hipError_t (*ptr_to_argmin)(void *, size_t &, T *, KeyValue *, int, - OffsetType *, OffsetType *, hipStream_t) = - &hipcub::DeviceSegmentedReduce::ArgMin; - run_benchmark(state, desired_segments, stream, size, - ptr_to_argmin); - } +template +struct Benchmark +{ + using Difference = OffsetType; + using Iterator = typename hipcub::ArgIndexInputIterator; + using KeyValue = typename Iterator::value_type; + + static void + run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) + { + hipError_t (*ptr_to_argmin)(void*, + size_t&, + T*, + KeyValue*, + int, + OffsetType*, + OffsetType*, + hipStream_t) + = &hipcub::DeviceSegmentedReduce::ArgMin; + run_benchmark(state, desired_segments, stream, size, ptr_to_argmin); + } }; -#define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_reduce" \ - "." \ - "(number_of_segments:~" + \ - std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - &Benchmark::run, SEGMENTS, stream, size) - -#define BENCHMARK_TYPE(type, REDUCE_OP) \ - CREATE_BENCHMARK(type, 1, REDUCE_OP), \ - CREATE_BENCHMARK(type, 100, REDUCE_OP), \ - CREATE_BENCHMARK(type, 10000, REDUCE_OP) - -#define CREATE_BENCHMARKS(REDUCE_OP) \ - BENCHMARK_TYPE(float, REDUCE_OP), BENCHMARK_TYPE(double, REDUCE_OP), \ - BENCHMARK_TYPE(int8_t, REDUCE_OP), BENCHMARK_TYPE(int, REDUCE_OP) - -void add_benchmarks(std::vector &benchmarks, - hipStream_t stream, size_t size) { - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = { - CREATE_BENCHMARKS(hipcub::Sum), - BENCHMARK_TYPE(custom_double2, hipcub::Sum), - CREATE_BENCHMARKS(hipcub::Min), +#define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \ + benchmark::RegisterBenchmark(std::string("device_segmented_reduce" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + &Benchmark::run, \ + SEGMENTS, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, REDUCE_OP) \ + CREATE_BENCHMARK(type, 1, REDUCE_OP), CREATE_BENCHMARK(type, 100, REDUCE_OP), \ + CREATE_BENCHMARK(type, 10000, REDUCE_OP) + +#define CREATE_BENCHMARKS(REDUCE_OP) \ + BENCHMARK_TYPE(float, REDUCE_OP), BENCHMARK_TYPE(double, REDUCE_OP), \ + BENCHMARK_TYPE(int8_t, REDUCE_OP), BENCHMARK_TYPE(int, REDUCE_OP) + +void add_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + CREATE_BENCHMARKS(hipcub::Sum), + BENCHMARK_TYPE(custom_double2, hipcub::Sum), + CREATE_BENCHMARKS(hipcub::Min), #ifdef HIPCUB_ROCPRIM_API - BENCHMARK_TYPE(custom_double2, hipcub::Min), + BENCHMARK_TYPE(custom_double2, hipcub::Min), #endif - CREATE_BENCHMARKS(hipcub::ArgMin), + CREATE_BENCHMARKS(hipcub::ArgMin), #ifdef HIPCUB_ROCPRIM_API - BENCHMARK_TYPE(custom_double2, hipcub::ArgMin), + BENCHMARK_TYPE(custom_double2, hipcub::ArgMin), #endif - }; + }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_segmented_reduce" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks(benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_segmented_reduce" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks(benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp index c0954103..06e7f73b 100644 --- a/benchmark/benchmark_device_segmented_sort.cpp +++ b/benchmark/benchmark_device_segmented_sort.cpp @@ -29,411 +29,473 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -const unsigned int batch_size = 4; +const unsigned int batch_size = 4; const unsigned int warmup_size = 2; -template -void run_sort_keys_benchmark(benchmark::State &state, size_t desired_segments, - hipStream_t stream, size_t size, - bool Descending = false, bool Stable = false) { - using offset_type = int; - using key_type = Key; - typedef hipError_t (*sort_func)(void *, size_t &, const key_type *, - key_type *, int, int, offset_type *, - offset_type *, hipStream_t); - - sort_func func_ascending = - &hipcub::DeviceSegmentedSort::SortKeys; - sort_func func_descending = - &hipcub::DeviceSegmentedSort::SortKeysDescending; - sort_func func_ascending_stable = - &hipcub::DeviceSegmentedSort::StableSortKeys; - sort_func func_descending_stable = - &hipcub::DeviceSegmentedSort::StableSortKeysDescending; - - sort_func sorting = Descending - ? (Stable ? func_descending_stable : func_descending) - : (Stable ? func_ascending_stable : func_ascending); - - std::vector offsets; - - const double avg_segment_length = - static_cast(size) / desired_segments; - - std::random_device rd; - std::default_random_engine gen(rd()); - - std::uniform_real_distribution segment_length_dis( - 0, avg_segment_length * 2); - - unsigned int segments_count = 0; - size_t offset = 0; - while (offset < size) { - const size_t segment_length = std::round(segment_length_dis(gen)); - offsets.push_back(offset); - ++segments_count; - offset += segment_length; - } - offsets.push_back(size); - - std::vector keys_input; - if (std::is_floating_point::value) { - keys_input = benchmark_utils::get_random_data( - size, static_cast(-1000), static_cast(1000)); - } else { - keys_input = benchmark_utils::get_random_data( - size, std::numeric_limits::min(), - std::numeric_limits::max()); - } - - offset_type *d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice)); - - key_type *d_keys_input; - key_type *d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), - hipMemcpyHostToDevice)); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_keys_output, size, segments_count, d_offsets, - d_offsets + 1, stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; ++i) { - HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, segments_count, - d_offsets, d_offsets + 1, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; ++i) { - HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, segments_count, - d_offsets, d_offsets + 1, stream)); +template +void run_sort_keys_benchmark(benchmark::State& state, + size_t desired_segments, + hipStream_t stream, + size_t size, + bool Descending = false, + bool Stable = false) +{ + using offset_type = int; + using key_type = Key; + typedef hipError_t (*sort_func)(void*, + size_t&, + const key_type*, + key_type*, + int, + int, + offset_type*, + offset_type*, + hipStream_t); + + sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortKeys; + sort_func func_descending + = &hipcub::DeviceSegmentedSort::SortKeysDescending; + sort_func func_ascending_stable + = &hipcub::DeviceSegmentedSort::StableSortKeys; + sort_func func_descending_stable + = &hipcub::DeviceSegmentedSort::StableSortKeysDescending; + + sort_func sorting = Descending ? (Stable ? func_descending_stable : func_descending) + : (Stable ? func_ascending_stable : func_ascending); + + std::vector offsets; + + const double avg_segment_length = static_cast(size) / desired_segments; + + std::random_device rd; + std::default_random_engine gen(rd()); + + std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); + + unsigned int segments_count = 0; + size_t offset = 0; + while(offset < size) + { + const size_t segment_length = std::round(segment_length_dis(gen)); + offsets.push_back(offset); + ++segments_count; + offset += segment_length; + } + offsets.push_back(size); + + std::vector keys_input; + if(std::is_floating_point::value) + { + keys_input = benchmark_utils::get_random_data(size, + static_cast(-1000), + static_cast(1000)); + } else + { + keys_input + = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); + } + + offset_type* d_offsets; + HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type* d_keys_input; + key_type* d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK( + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(sorting(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; ++i) + { + HIP_CHECK(sorting(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - sizeof(key_type)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_offsets)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; ++i) + { + HIP_CHECK(sorting(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_offsets)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); } -template -void run_sort_pairs_benchmark(benchmark::State &state, size_t desired_segments, - hipStream_t stream, size_t size, - bool Descending = false, bool Stable = false) { - using offset_type = int; - using key_type = Key; - using value_type = Value; - typedef hipError_t (*sort_func)( - void *, size_t &, const key_type *, key_type *, const value_type *, - value_type *, int, int, offset_type *, offset_type *, hipStream_t); - - sort_func func_ascending = - &hipcub::DeviceSegmentedSort::SortPairs; - sort_func func_descending = - &hipcub::DeviceSegmentedSort::SortPairsDescending; - sort_func func_ascending_stable = - &hipcub::DeviceSegmentedSort::StableSortPairs; - sort_func func_descending_stable = - &hipcub::DeviceSegmentedSort::StableSortPairsDescending< - key_type, value_type, offset_type *>; - - sort_func sorting = Descending - ? (Stable ? func_descending_stable : func_descending) - : (Stable ? func_ascending_stable : func_ascending); - - std::vector offsets; - - const double avg_segment_length = - static_cast(size) / desired_segments; - - std::random_device rd; - std::default_random_engine gen(rd()); - - std::uniform_real_distribution segment_length_dis( - 0, avg_segment_length * 2); - - unsigned int segments_count = 0; - size_t offset = 0; - while (offset < size) { - const size_t segment_length = std::round(segment_length_dis(gen)); - offsets.push_back(offset); - ++segments_count; - offset += segment_length; - } - offsets.push_back(size); - - std::vector keys_input; - if (std::is_floating_point::value) { - keys_input = benchmark_utils::get_random_data( - size, static_cast(-1000), static_cast(1000)); - } else { - keys_input = benchmark_utils::get_random_data( - size, std::numeric_limits::min(), - std::numeric_limits::max()); - } - - std::vector values_input(size); - std::iota(values_input.begin(), values_input.end(), 0); - - offset_type *d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice)); - - key_type *d_keys_input; - key_type *d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), - hipMemcpyHostToDevice)); - - value_type *d_values_input; - value_type *d_values_output; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), - size * sizeof(value_type), hipMemcpyHostToDevice)); - - void *d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, - d_keys_output, d_values_input, d_values_output, size, - segments_count, d_offsets, d_offsets + 1, stream)); - - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, - d_values_output, size, segments_count, d_offsets, - d_offsets + 1, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, - d_values_output, size, segments_count, d_offsets, - d_offsets + 1, stream)); +template +void run_sort_pairs_benchmark(benchmark::State& state, + size_t desired_segments, + hipStream_t stream, + size_t size, + bool Descending = false, + bool Stable = false) +{ + using offset_type = int; + using key_type = Key; + using value_type = Value; + typedef hipError_t (*sort_func)(void*, + size_t&, + const key_type*, + key_type*, + const value_type*, + value_type*, + int, + int, + offset_type*, + offset_type*, + hipStream_t); + + sort_func func_ascending + = &hipcub::DeviceSegmentedSort::SortPairs; + sort_func func_descending + = &hipcub::DeviceSegmentedSort::SortPairsDescending; + sort_func func_ascending_stable + = &hipcub::DeviceSegmentedSort::StableSortPairs; + sort_func func_descending_stable + = &hipcub::DeviceSegmentedSort:: + StableSortPairsDescending; + + sort_func sorting = Descending ? (Stable ? func_descending_stable : func_descending) + : (Stable ? func_ascending_stable : func_ascending); + + std::vector offsets; + + const double avg_segment_length = static_cast(size) / desired_segments; + + std::random_device rd; + std::default_random_engine gen(rd()); + + std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); + + unsigned int segments_count = 0; + size_t offset = 0; + while(offset < size) + { + const size_t segment_length = std::round(segment_length_dis(gen)); + offsets.push_back(offset); + ++segments_count; + offset += segment_length; + } + offsets.push_back(size); + + std::vector keys_input; + if(std::is_floating_point::value) + { + keys_input = benchmark_utils::get_random_data(size, + static_cast(-1000), + static_cast(1000)); + } else + { + keys_input + = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); + } + + std::vector values_input(size); + std::iota(values_input.begin(), values_input.end(), 0); + + offset_type* d_offsets; + HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type* d_keys_input; + key_type* d_keys_output; + HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK( + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; + HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + size * sizeof(value_type), + hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + HIP_CHECK(sorting(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + stream)); + + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(sorting(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - (sizeof(key_type) + sizeof(value_type))); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - HIP_CHECK(hipFree(d_temporary_storage)); - HIP_CHECK(hipFree(d_offsets)); - HIP_CHECK(hipFree(d_keys_input)); - HIP_CHECK(hipFree(d_keys_output)); - HIP_CHECK(hipFree(d_values_input)); - HIP_CHECK(hipFree(d_values_output)); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(sorting(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_offsets)); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_keys_output)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_sort_keys" \ - "." \ - "(number_of_segments:~" + \ - std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_keys_benchmark(state, SEGMENTS, stream, size); \ - }), \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_sort_keys" \ - "." \ - "(number_of_segments:~" + \ - std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_keys_benchmark(state, SEGMENTS, stream, size, true); \ - }), \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_sort_keys" \ - "." \ - "(number_of_segments:~" + \ - std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_keys_benchmark(state, SEGMENTS, stream, size, false, \ - true); \ - }), \ - benchmark::RegisterBenchmark( \ - std::string("device_segmented_sort_keys" \ - "." \ - "(number_of_segments:~" + \ - std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_keys_benchmark(state, SEGMENTS, stream, size, true, \ - true); \ - }) - -#define BENCHMARK_KEY_TYPE(type) \ - CREATE_SORT_KEYS_BENCHMARK(type, 10), CREATE_SORT_KEYS_BENCHMARK(type, 100), \ - CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ - CREATE_SORT_KEYS_BENCHMARK(type, 10000) - -void add_sort_keys_benchmarks( - std::vector &benchmarks, - hipStream_t stream, size_t size) { - std::vector bs = { - BENCHMARK_KEY_TYPE(float), BENCHMARK_KEY_TYPE(double), - BENCHMARK_KEY_TYPE(int8_t), BENCHMARK_KEY_TYPE(uint8_t), - BENCHMARK_KEY_TYPE(int), - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ + benchmark::RegisterBenchmark(std::string("device_segmented_sort_keys" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) { \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size); \ + }), \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_keys" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true); }), \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_keys" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, SEGMENTS, stream, size, false, true); }), \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_keys" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true, true); }) + +#define BENCHMARK_KEY_TYPE(type) \ + CREATE_SORT_KEYS_BENCHMARK(type, 10), CREATE_SORT_KEYS_BENCHMARK(type, 100), \ + CREATE_SORT_KEYS_BENCHMARK(type, 1000), CREATE_SORT_KEYS_BENCHMARK(type, 10000) + +void add_sort_keys_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + std::vector bs = { + BENCHMARK_KEY_TYPE(float), + BENCHMARK_KEY_TYPE(double), + BENCHMARK_KEY_TYPE(int8_t), + BENCHMARK_KEY_TYPE(uint8_t), + BENCHMARK_KEY_TYPE(int), + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ - benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_pairs") + \ - "." + \ - "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size); \ - }), \ - benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_pairs") + \ - "." + \ - "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, \ - size, true); \ - }), \ - benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_pairs") + \ - "." + \ - "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, \ - size, false, true); \ - }), \ - benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_pairs") + \ - "." + \ - "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, \ - size, true, true); \ - }) -#define BENCHMARK_PAIR_TYPE(type, value) \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000) - -void add_sort_pairs_benchmarks( - std::vector &benchmarks, - hipStream_t stream, size_t size) { - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; - - std::vector bs = { - BENCHMARK_PAIR_TYPE(int, float), - BENCHMARK_PAIR_TYPE(long long, double), - BENCHMARK_PAIR_TYPE(int8_t, int8_t), - BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), - BENCHMARK_PAIR_TYPE(int, custom_float2), - BENCHMARK_PAIR_TYPE(long long, custom_double2), - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_pairs") \ + + "." \ + + "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, SEGMENTS, stream, size); }), \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_pairs") \ + + "." \ + + "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true); }), \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_pairs") \ + + "." \ + + "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) { \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, size, false, true); \ + }), \ + benchmark::RegisterBenchmark( \ + (std::string("device_segmented_sort_pairs") \ + + "." \ + + "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true, true); }) +#define BENCHMARK_PAIR_TYPE(type, value) \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000) + +void add_sort_pairs_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; + + std::vector bs = { + BENCHMARK_PAIR_TYPE(int, float), + BENCHMARK_PAIR_TYPE(long long, double), + BENCHMARK_PAIR_TYPE(int8_t, int8_t), + BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), + BENCHMARK_PAIR_TYPE(int, custom_float2), + BENCHMARK_PAIR_TYPE(long long, custom_double2), + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_segmented_sort" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_sort_keys_benchmarks(benchmarks, stream, size); - add_sort_pairs_benchmarks(benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_segmented_sort" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_sort_keys_benchmarks(benchmarks, stream, size); + add_sort_pairs_benchmarks(benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index 47a4beb0..a2641649 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -29,437 +29,525 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -void run_flagged_benchmark(benchmark::State &state, size_t size, - const hipStream_t stream, float true_probability) { - std::vector input; - std::vector flags = - benchmark_utils::get_random_data01(size, true_probability); - if (std::is_floating_point::value) { - input = benchmark_utils::get_random_data(size, T(-1000), T(1000)); - } else { - input = benchmark_utils::get_random_data( - size, std::numeric_limits::min(), std::numeric_limits::max()); - } - - T *d_input; - FlagType *d_flags; - T *d_output; - unsigned int *d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); - HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(FlagType), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - // Allocate temporary storage memory - size_t temp_storage_size_bytes = 0; - - // Get size of d_temp_storage - HIP_CHECK(hipcub::DeviceSelect::Flagged( - nullptr, temp_storage_size_bytes, d_input, d_flags, d_output, - d_selected_count_output, input.size(), stream)); - HIP_CHECK(hipDeviceSynchronize()); - - // allocate temporary storage - void *d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < 10; i++) { - HIP_CHECK(hipcub::DeviceSelect::Flagged( - d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, - d_selected_count_output, input.size(), stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - const unsigned int batch_size = 10; - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(hipcub::DeviceSelect::Flagged( - d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, - d_selected_count_output, input.size(), stream)); +template +void run_flagged_benchmark(benchmark::State& state, + size_t size, + const hipStream_t stream, + float true_probability) +{ + std::vector input; + std::vector flags + = benchmark_utils::get_random_data01(size, true_probability); + if(std::is_floating_point::value) + { + input = benchmark_utils::get_random_data(size, T(-1000), T(1000)); + } else + { + input = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); } + + T* d_input; + FlagType* d_flags; + T* d_output; + unsigned int* d_selected_count_output; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK( + hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(FlagType), hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + // Allocate temporary storage memory + size_t temp_storage_size_bytes = 0; + + // Get size of d_temp_storage + HIP_CHECK(hipcub::DeviceSelect::Flagged(nullptr, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + stream)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - hipFree(d_input); - hipFree(d_flags); - hipFree(d_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); - HIP_CHECK(hipDeviceSynchronize()); + // allocate temporary storage + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < 10; i++) + { + HIP_CHECK(hipcub::DeviceSelect::Flagged(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(hipcub::DeviceSelect::Flagged(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + hipFree(d_input); + hipFree(d_flags); + hipFree(d_output); + hipFree(d_selected_count_output); + hipFree(d_temp_storage); + HIP_CHECK(hipDeviceSynchronize()); } -template -void run_selectop_benchmark(benchmark::State &state, size_t size, - const hipStream_t stream, float true_probability) { - std::vector input = - benchmark_utils::get_random_data(size, T(0), T(1000)); - - auto select_op = [true_probability] __device__(const T &value) -> bool { - if (value < T(1000 * true_probability)) - return true; - return false; - }; - - T *d_input; - T *d_output; - unsigned int *d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); - HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes; - - // Get size of d_temp_storage - HIP_CHECK(hipcub::DeviceSelect::If(nullptr, temp_storage_size_bytes, d_input, - d_output, d_selected_count_output, - input.size(), select_op, stream)); - HIP_CHECK(hipDeviceSynchronize()); - - // allocate temporary storage - void *d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < 10; i++) { - HIP_CHECK(hipcub::DeviceSelect::If( - d_temp_storage, temp_storage_size_bytes, d_input, d_output, - d_selected_count_output, input.size(), select_op, stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - const unsigned int batch_size = 10; - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(hipcub::DeviceSelect::If( - d_temp_storage, temp_storage_size_bytes, d_input, d_output, - d_selected_count_output, input.size(), select_op, stream)); +template +void run_selectop_benchmark(benchmark::State& state, + size_t size, + const hipStream_t stream, + float true_probability) +{ + std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); + + auto select_op = [true_probability] __device__(const T& value) -> bool + { + if(value < T(1000 * true_probability)) + return true; + return false; + }; + + T* d_input; + T* d_output; + unsigned int* d_selected_count_output; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes; + + // Get size of d_temp_storage + HIP_CHECK(hipcub::DeviceSelect::If(nullptr, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream)); + HIP_CHECK(hipDeviceSynchronize()); + + // allocate temporary storage + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < 10; i++) + { + HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - hipFree(d_input); - hipFree(d_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); - HIP_CHECK(hipDeviceSynchronize()); + const unsigned int batch_size = 10; + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + hipFree(d_input); + hipFree(d_output); + hipFree(d_selected_count_output); + hipFree(d_temp_storage); + HIP_CHECK(hipDeviceSynchronize()); } -template -void run_unique_benchmark(benchmark::State &state, size_t size, +template +void run_unique_benchmark(benchmark::State& state, + size_t size, const hipStream_t stream, - float discontinuity_probability) { - hipcub::Sum op; - - std::vector input(size); - { - auto input01 = - benchmark_utils::get_random_data01(size, discontinuity_probability); - auto acc = input01[0]; - input[0] = acc; - for (size_t i = 1; i < input01.size(); i++) { - input[i] = op(acc, input01[i]); + float discontinuity_probability) +{ + hipcub::Sum op; + + std::vector input(size); + { + auto input01 = benchmark_utils::get_random_data01(size, discontinuity_probability); + auto acc = input01[0]; + input[0] = acc; + for(size_t i = 1; i < input01.size(); i++) + { + input[i] = op(acc, input01[i]); + } } - } - - T *d_input; - T *d_output; - unsigned int *d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); - HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes; - - // Get size of d_temp_storage - HIP_CHECK(hipcub::DeviceSelect::Unique( - nullptr, temp_storage_size_bytes, d_input, d_output, - d_selected_count_output, input.size(), stream)); - HIP_CHECK(hipDeviceSynchronize()); - - // allocate temporary storage - void *d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < 10; i++) { - HIP_CHECK(hipcub::DeviceSelect::Unique( - d_temp_storage, temp_storage_size_bytes, d_input, d_output, - d_selected_count_output, input.size(), stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - const unsigned int batch_size = 10; - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(hipcub::DeviceSelect::Unique( - d_temp_storage, temp_storage_size_bytes, d_input, d_output, - d_selected_count_output, input.size(), stream)); + + T* d_input; + T* d_output; + unsigned int* d_selected_count_output; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes; + + // Get size of d_temp_storage + HIP_CHECK(hipcub::DeviceSelect::Unique(nullptr, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + stream)); + HIP_CHECK(hipDeviceSynchronize()); + + // allocate temporary storage + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < 10; i++) + { + HIP_CHECK(hipcub::DeviceSelect::Unique(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - hipFree(d_input); - hipFree(d_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); + const unsigned int batch_size = 10; + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(hipcub::DeviceSelect::Unique(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + hipFree(d_input); + hipFree(d_output); + hipFree(d_selected_count_output); + hipFree(d_temp_storage); } -template -void run_unique_by_key_benchmark(benchmark::State &state, size_t size, +template +void run_unique_by_key_benchmark(benchmark::State& state, + size_t size, const hipStream_t stream, - float discontinuity_probability) { - hipcub::Sum op; + float discontinuity_probability) +{ + hipcub::Sum op; - std::vector input_keys(size); - { - auto input01 = benchmark_utils::get_random_data01( - size, discontinuity_probability); - auto acc = input01[0]; + std::vector input_keys(size); + { + auto input01 = benchmark_utils::get_random_data01(size, discontinuity_probability); + auto acc = input01[0]; - input_keys[0] = acc; + input_keys[0] = acc; - for (size_t i = 1; i < input01.size(); i++) { - input_keys[i] = op(acc, input01[i]); + for(size_t i = 1; i < input01.size(); i++) + { + input_keys[i] = op(acc, input01[i]); + } } - } - - const auto input_values = benchmark_utils::get_random_data( - size, ValueT(-1000), ValueT(1000)); - - KeyT *d_keys_input; - ValueT *d_values_input; - KeyT *d_keys_output; - ValueT *d_values_output; - unsigned int *d_selected_count_output; - - HIP_CHECK( - hipMalloc(&d_keys_input, input_keys.size() * sizeof(input_keys[0]))); - HIP_CHECK(hipMalloc(&d_values_input, - input_values.size() * sizeof(input_values[0]))); - HIP_CHECK( - hipMalloc(&d_keys_output, input_keys.size() * sizeof(input_keys[0]))); - HIP_CHECK(hipMalloc(&d_values_output, - input_values.size() * sizeof(input_values[0]))); - HIP_CHECK( - hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); - - HIP_CHECK(hipMemcpy(d_keys_input, input_keys.data(), - input_keys.size() * sizeof(input_keys[0]), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_values_input, input_values.data(), - input_values.size() * sizeof(input_values[0]), - hipMemcpyHostToDevice)); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes; - - // Get size of d_temp_storage - HIP_CHECK(hipcub::DeviceSelect::UniqueByKey( - nullptr, temp_storage_size_bytes, d_keys_input, d_values_input, - d_keys_output, d_values_output, d_selected_count_output, - input_keys.size(), stream)); - HIP_CHECK(hipDeviceSynchronize()); - - // allocate temporary storage - void *d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < 10; i++) { - HIP_CHECK(hipcub::DeviceSelect::UniqueByKey( - d_temp_storage, temp_storage_size_bytes, d_keys_input, d_values_input, - d_keys_output, d_values_output, d_selected_count_output, - input_keys.size(), stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - const unsigned int batch_size = 10; - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(hipcub::DeviceSelect::UniqueByKey( - d_temp_storage, temp_storage_size_bytes, d_keys_input, d_values_input, - d_keys_output, d_values_output, d_selected_count_output, - input_keys.size(), stream)); + + const auto input_values + = benchmark_utils::get_random_data(size, ValueT(-1000), ValueT(1000)); + + KeyT* d_keys_input; + ValueT* d_values_input; + KeyT* d_keys_output; + ValueT* d_values_output; + unsigned int* d_selected_count_output; + + HIP_CHECK(hipMalloc(&d_keys_input, input_keys.size() * sizeof(input_keys[0]))); + HIP_CHECK(hipMalloc(&d_values_input, input_values.size() * sizeof(input_values[0]))); + HIP_CHECK(hipMalloc(&d_keys_output, input_keys.size() * sizeof(input_keys[0]))); + HIP_CHECK(hipMalloc(&d_values_output, input_values.size() * sizeof(input_values[0]))); + HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); + + HIP_CHECK(hipMemcpy(d_keys_input, + input_keys.data(), + input_keys.size() * sizeof(input_keys[0]), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_values_input, + input_values.data(), + input_values.size() * sizeof(input_values[0]), + hipMemcpyHostToDevice)); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes; + + // Get size of d_temp_storage + HIP_CHECK(hipcub::DeviceSelect::UniqueByKey(nullptr, + temp_storage_size_bytes, + d_keys_input, + d_values_input, + d_keys_output, + d_values_output, + d_selected_count_output, + input_keys.size(), + stream)); + HIP_CHECK(hipDeviceSynchronize()); + + // allocate temporary storage + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < 10; i++) + { + HIP_CHECK(hipcub::DeviceSelect::UniqueByKey(d_temp_storage, + temp_storage_size_bytes, + d_keys_input, + d_values_input, + d_keys_output, + d_values_output, + d_selected_count_output, + input_keys.size(), + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * size * - (sizeof(KeyT) + sizeof(ValueT))); - state.SetItemsProcessed(state.iterations() * batch_size * size); - - hipFree(d_keys_input); - hipFree(d_values_input); - hipFree(d_keys_output); - hipFree(d_values_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); + const unsigned int batch_size = 10; + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(hipcub::DeviceSelect::UniqueByKey(d_temp_storage, + temp_storage_size_bytes, + d_keys_input, + d_values_input, + d_keys_output, + d_values_output, + d_selected_count_output, + input_keys.size(), + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(KeyT) + sizeof(ValueT))); + state.SetItemsProcessed(state.iterations() * batch_size * size); + + hipFree(d_keys_input); + hipFree(d_values_input); + hipFree(d_keys_output); + hipFree(d_values_output); + hipFree(d_selected_count_output); + hipFree(d_temp_storage); } -#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ - benchmark::RegisterBenchmark( \ - ("device_select_flagged.(probability:" #p ")"), \ - &run_flagged_benchmark, size, stream, p) - -#define CREATE_SELECT_IF_BENCHMARK(T, p) \ - benchmark::RegisterBenchmark( \ - ("device_select_if.(probability:" #p ")"), \ - &run_selectop_benchmark, size, stream, p) - -#define CREATE_UNIQUE_BENCHMARK(T, p) \ - benchmark::RegisterBenchmark( \ - ("device_select_unique.(probability:" #p ")"), \ - &run_unique_benchmark, size, stream, p) - -#define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ - benchmark::RegisterBenchmark( \ - ("device_select_unique_by_key.(probability:" #p ")"), \ - &run_unique_by_key_benchmark, size, stream, p) - -#define BENCHMARK_FLAGGED_TYPE(type, value) \ - CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.05f), \ - CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.25f), \ - CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.5f), \ - CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.75f) - -#define BENCHMARK_IF_TYPE(type) \ - CREATE_SELECT_IF_BENCHMARK(type, 0.05f), \ - CREATE_SELECT_IF_BENCHMARK(type, 0.25f), \ - CREATE_SELECT_IF_BENCHMARK(type, 0.5f), \ - CREATE_SELECT_IF_BENCHMARK(type, 0.75f) - -#define BENCHMARK_UNIQUE_TYPE(type) \ - CREATE_UNIQUE_BENCHMARK(type, 0.05f), CREATE_UNIQUE_BENCHMARK(type, 0.25f), \ - CREATE_UNIQUE_BENCHMARK(type, 0.5f), \ - CREATE_UNIQUE_BENCHMARK(type, 0.75f) - -#define BENCHMARK_UNIQUE_BY_KEY_TYPE(key_type, value_type) \ - CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.05f), \ - CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.25f), \ - CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.5f), \ - CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.75f) - -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_device_select" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - using custom_double2 = benchmark_utils::custom_type; - using custom_int_double = benchmark_utils::custom_type; - - // Add benchmarks - std::vector benchmarks = { - BENCHMARK_FLAGGED_TYPE(int, unsigned char), - BENCHMARK_FLAGGED_TYPE(float, unsigned char), - BENCHMARK_FLAGGED_TYPE(double, unsigned char), - BENCHMARK_FLAGGED_TYPE(uint8_t, uint8_t), - BENCHMARK_FLAGGED_TYPE(int8_t, int8_t), - BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), - - BENCHMARK_IF_TYPE(int), - BENCHMARK_IF_TYPE(float), - BENCHMARK_IF_TYPE(double), - BENCHMARK_IF_TYPE(uint8_t), - BENCHMARK_IF_TYPE(int8_t), - BENCHMARK_IF_TYPE(custom_int_double), - - BENCHMARK_UNIQUE_TYPE(int), - BENCHMARK_UNIQUE_TYPE(float), - BENCHMARK_UNIQUE_TYPE(double), - BENCHMARK_UNIQUE_TYPE(uint8_t), - BENCHMARK_UNIQUE_TYPE(int8_t), - BENCHMARK_UNIQUE_TYPE(custom_int_double), - - BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int), - BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double), - BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2), - BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t), - BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, double), - BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double)}; - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ + benchmark::RegisterBenchmark( \ + ("device_select_flagged.(probability:" #p ")"), \ + &run_flagged_benchmark, \ + size, \ + stream, \ + p) + +#define CREATE_SELECT_IF_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark(("device_select_if.(probability:" #p \ + ")"), \ + &run_selectop_benchmark, \ + size, \ + stream, \ + p) + +#define CREATE_UNIQUE_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark(("device_select_unique.(probability:" #p \ + ")"), \ + &run_unique_benchmark, \ + size, \ + stream, \ + p) + +#define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ + benchmark::RegisterBenchmark( \ + ("device_select_unique_by_key.(probability:" #p ")"), \ + &run_unique_by_key_benchmark, \ + size, \ + stream, \ + p) + +#define BENCHMARK_FLAGGED_TYPE(type, value) \ + CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.05f), \ + CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.25f), \ + CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.5f), \ + CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.75f) + +#define BENCHMARK_IF_TYPE(type) \ + CREATE_SELECT_IF_BENCHMARK(type, 0.05f), CREATE_SELECT_IF_BENCHMARK(type, 0.25f), \ + CREATE_SELECT_IF_BENCHMARK(type, 0.5f), CREATE_SELECT_IF_BENCHMARK(type, 0.75f) + +#define BENCHMARK_UNIQUE_TYPE(type) \ + CREATE_UNIQUE_BENCHMARK(type, 0.05f), CREATE_UNIQUE_BENCHMARK(type, 0.25f), \ + CREATE_UNIQUE_BENCHMARK(type, 0.5f), CREATE_UNIQUE_BENCHMARK(type, 0.75f) + +#define BENCHMARK_UNIQUE_BY_KEY_TYPE(key_type, value_type) \ + CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.05f), \ + CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.25f), \ + CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.5f), \ + CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.75f) + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_device_select" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + using custom_double2 = benchmark_utils::custom_type; + using custom_int_double = benchmark_utils::custom_type; + + // Add benchmarks + std::vector benchmarks + = {BENCHMARK_FLAGGED_TYPE(int, unsigned char), + BENCHMARK_FLAGGED_TYPE(float, unsigned char), + BENCHMARK_FLAGGED_TYPE(double, unsigned char), + BENCHMARK_FLAGGED_TYPE(uint8_t, uint8_t), + BENCHMARK_FLAGGED_TYPE(int8_t, int8_t), + BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), + + BENCHMARK_IF_TYPE(int), + BENCHMARK_IF_TYPE(float), + BENCHMARK_IF_TYPE(double), + BENCHMARK_IF_TYPE(uint8_t), + BENCHMARK_IF_TYPE(int8_t), + BENCHMARK_IF_TYPE(custom_int_double), + + BENCHMARK_UNIQUE_TYPE(int), + BENCHMARK_UNIQUE_TYPE(float), + BENCHMARK_UNIQUE_TYPE(double), + BENCHMARK_UNIQUE_TYPE(uint8_t), + BENCHMARK_UNIQUE_TYPE(int8_t), + BENCHMARK_UNIQUE_TYPE(custom_int_double), + + BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int), + BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double), + BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2), + BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t), + BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, double), + BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double)}; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); - return 0; + return 0; } diff --git a/benchmark/benchmark_device_spmv.cpp b/benchmark/benchmark_device_spmv.cpp index aaa5ffa6..f1ff2cc3 100644 --- a/benchmark/benchmark_device_spmv.cpp +++ b/benchmark/benchmark_device_spmv.cpp @@ -29,195 +29,236 @@ const size_t DEFAULT_N = 1024 * 32; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template -void run_benchmark(benchmark::State &state, size_t size, - const hipStream_t stream, float probability) { - const T rand_min = T(1); - const T rand_max = T(10); - - // generate a lexicograhically sorted list of (row, column) index tuples - // number of nonzeroes cannot be guaranteed as duplicates may exist - const int num_nonzeroes_attempt = static_cast(std::min( - static_cast(INT_MAX), - static_cast(probability * static_cast(size * size)))); - std::vector> indices(num_nonzeroes_attempt); - { - std::vector flat_indices = benchmark_utils::get_random_data( - 2 * num_nonzeroes_attempt, 0, size - 1, 2 * num_nonzeroes_attempt); - for (int i = 0; i < num_nonzeroes_attempt; i++) { - indices[i] = std::make_pair(flat_indices[2 * i], flat_indices[2 * i + 1]); - } - std::sort(indices.begin(), indices.end()); - } - - // generate the compressed sparse rows matrix - std::pair prev_cell = std::make_pair(-1, -1); - int num_nonzeroes = 0; - std::vector row_offsets(size + 1); - // this vector might be too large, but doing the allocation now eliminates a - // scan - std::vector column_indices(num_nonzeroes_attempt); - row_offsets[0] = 0; - int last_row_written = 0; - for (int i = 0; i < num_nonzeroes_attempt; i++) { - if (indices[i] != prev_cell) { - // update the row offets if we go to the next row (or skip some) - if (indices[i].first != last_row_written) { - for (int j = last_row_written + 1; j <= indices[i].first; j++) { - row_offsets[j] = num_nonzeroes; +template +void run_benchmark(benchmark::State& state, + size_t size, + const hipStream_t stream, + float probability) +{ + const T rand_min = T(1); + const T rand_max = T(10); + + // generate a lexicograhically sorted list of (row, column) index tuples + // number of nonzeroes cannot be guaranteed as duplicates may exist + const int num_nonzeroes_attempt = static_cast( + std::min(static_cast(INT_MAX), + static_cast(probability * static_cast(size * size)))); + std::vector> indices(num_nonzeroes_attempt); + { + std::vector flat_indices + = benchmark_utils::get_random_data(2 * num_nonzeroes_attempt, + 0, + size - 1, + 2 * num_nonzeroes_attempt); + for(int i = 0; i < num_nonzeroes_attempt; i++) + { + indices[i] = std::make_pair(flat_indices[2 * i], flat_indices[2 * i + 1]); } - last_row_written = indices[i].first; - } + std::sort(indices.begin(), indices.end()); + } + + // generate the compressed sparse rows matrix + std::pair prev_cell = std::make_pair(-1, -1); + int num_nonzeroes = 0; + std::vector row_offsets(size + 1); + // this vector might be too large, but doing the allocation now eliminates a + // scan + std::vector column_indices(num_nonzeroes_attempt); + row_offsets[0] = 0; + int last_row_written = 0; + for(int i = 0; i < num_nonzeroes_attempt; i++) + { + if(indices[i] != prev_cell) + { + // update the row offets if we go to the next row (or skip some) + if(indices[i].first != last_row_written) + { + for(int j = last_row_written + 1; j <= indices[i].first; j++) + { + row_offsets[j] = num_nonzeroes; + } + last_row_written = indices[i].first; + } - column_indices[num_nonzeroes++] = indices[i].second; + column_indices[num_nonzeroes++] = indices[i].second; - prev_cell = indices[i]; + prev_cell = indices[i]; + } + } + // fill in the entries for any missing rows + for(int j = last_row_written + 1; j < static_cast(size) + 1; j++) + { + row_offsets[j] = num_nonzeroes; } - } - // fill in the entries for any missing rows - for (int j = last_row_written + 1; j < static_cast(size) + 1; j++) { - row_offsets[j] = num_nonzeroes; - } - - // generate the random data once the actual number of nonzeroes are known - std::vector values = - benchmark_utils::get_random_data(num_nonzeroes, rand_min, rand_max); - - std::vector vector_x = - benchmark_utils::get_random_data(size, rand_min, rand_max); - - T *d_values; - int *d_row_offsets; - int *d_column_indices; - T *d_vector_x; - T *d_vector_y; - HIP_CHECK(hipMalloc(&d_values, values.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_row_offsets, row_offsets.size() * sizeof(int))); - HIP_CHECK(hipMalloc(&d_column_indices, num_nonzeroes * sizeof(int))); - HIP_CHECK(hipMalloc(&d_vector_x, vector_x.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_vector_y, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_values, values.data(), values.size() * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_row_offsets, row_offsets.data(), - row_offsets.size() * sizeof(int), hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_column_indices, column_indices.data(), - num_nonzeroes * sizeof(int), hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_vector_x, vector_x.data(), vector_x.size() * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - // Allocate temporary storage memory - size_t temp_storage_size_bytes; - - // Get size of d_temp_storage - HIP_CHECK(hipcub::DeviceSpmv::CsrMV(nullptr, temp_storage_size_bytes, - d_values, d_row_offsets, d_column_indices, - d_vector_x, d_vector_y, size, size, - num_nonzeroes, stream)); - HIP_CHECK(hipDeviceSynchronize()); - - // allocate temporary storage - void *d_temp_storage = nullptr; - HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - HIP_CHECK(hipDeviceSynchronize()); - - // Warm-up - for (size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(hipcub::DeviceSpmv::CsrMV( - d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, - d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, - stream)); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < batch_size; i++) { - HIP_CHECK(hipcub::DeviceSpmv::CsrMV( - d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, - d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, - stream)); + + // generate the random data once the actual number of nonzeroes are known + std::vector values = benchmark_utils::get_random_data(num_nonzeroes, rand_min, rand_max); + + std::vector vector_x = benchmark_utils::get_random_data(size, rand_min, rand_max); + + T* d_values; + int* d_row_offsets; + int* d_column_indices; + T* d_vector_x; + T* d_vector_y; + HIP_CHECK(hipMalloc(&d_values, values.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_row_offsets, row_offsets.size() * sizeof(int))); + HIP_CHECK(hipMalloc(&d_column_indices, num_nonzeroes * sizeof(int))); + HIP_CHECK(hipMalloc(&d_vector_x, vector_x.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_vector_y, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_values, values.data(), values.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_row_offsets, + row_offsets.data(), + row_offsets.size() * sizeof(int), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_column_indices, + column_indices.data(), + num_nonzeroes * sizeof(int), + hipMemcpyHostToDevice)); + HIP_CHECK( + hipMemcpy(d_vector_x, vector_x.data(), vector_x.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + // Allocate temporary storage memory + size_t temp_storage_size_bytes; + + // Get size of d_temp_storage + HIP_CHECK(hipcub::DeviceSpmv::CsrMV(nullptr, + temp_storage_size_bytes, + d_values, + d_row_offsets, + d_column_indices, + d_vector_x, + d_vector_y, + size, + size, + num_nonzeroes, + stream)); + HIP_CHECK(hipDeviceSynchronize()); + + // allocate temporary storage + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, + temp_storage_size_bytes, + d_values, + d_row_offsets, + d_column_indices, + d_vector_x, + d_vector_y, + size, + size, + num_nonzeroes, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * batch_size * - (num_nonzeroes + size) * sizeof(T)); - state.SetItemsProcessed(state.iterations() * batch_size * - (num_nonzeroes + size)); - - hipFree(d_temp_storage); - hipFree(d_vector_y); - hipFree(d_vector_x); - hipFree(d_column_indices); - hipFree(d_row_offsets); - hipFree(d_values); - HIP_CHECK(hipDeviceSynchronize()); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, + temp_storage_size_bytes, + d_values, + d_row_offsets, + d_column_indices, + d_vector_x, + d_vector_y, + size, + size, + num_nonzeroes, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * (num_nonzeroes + size) * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * (num_nonzeroes + size)); + + hipFree(d_temp_storage); + hipFree(d_vector_y); + hipFree(d_vector_x); + hipFree(d_column_indices); + hipFree(d_row_offsets); + hipFree(d_values); + HIP_CHECK(hipDeviceSynchronize()); } -#define CREATE_BENCHMARK(T, p) \ - benchmark::RegisterBenchmark( \ - (std::string("device_spmv_CsrMV.")) \ - .c_str(), \ - &run_benchmark, size, stream, p) - -#define BENCHMARK_TYPE(type) \ - CREATE_BENCHMARK(type, 1.0e-6f), CREATE_BENCHMARK(type, 1.0e-5f), \ - CREATE_BENCHMARK(type, 1.0e-4f), CREATE_BENCHMARK(type, 1.0e-3f), \ - CREATE_BENCHMARK(type, 1.0e-2f) - -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - - std::cout << "benchmark_device_spmv" << std::endl; - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks = { - BENCHMARK_TYPE(int), - BENCHMARK_TYPE(unsigned int), - BENCHMARK_TYPE(float), - BENCHMARK_TYPE(double), - }; - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +#define CREATE_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark( \ + (std::string("device_spmv_CsrMV.")) \ + .c_str(), \ + &run_benchmark, \ + size, \ + stream, \ + p) + +#define BENCHMARK_TYPE(type) \ + CREATE_BENCHMARK(type, 1.0e-6f), CREATE_BENCHMARK(type, 1.0e-5f), \ + CREATE_BENCHMARK(type, 1.0e-4f), CREATE_BENCHMARK(type, 1.0e-3f), \ + CREATE_BENCHMARK(type, 1.0e-2f) + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_spmv" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks = { + BENCHMARK_TYPE(int), + BENCHMARK_TYPE(unsigned int), + BENCHMARK_TYPE(float), + BENCHMARK_TYPE(double), + }; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); - return 0; + return 0; } diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index a68fd7db..fa3da901 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -24,409 +24,452 @@ #define HIPCUB_BENCHMARK_UTILS_HPP_ #ifndef BENCHMARK_UTILS_INCLUDE_GUARD -#error benchmark_utils.hpp must ONLY be included by common_benchmark_header.hpp. Please include common_benchmark_header.hpp instead. + #error benchmark_utils.hpp must ONLY be included by common_benchmark_header.hpp. Please include common_benchmark_header.hpp instead. #endif // hipCUB API #ifdef __HIP_PLATFORM_AMD__ -#include "hipcub/backend/rocprim/util_ptx.hpp" + #include "hipcub/backend/rocprim/util_ptx.hpp" #elif defined(__HIP_PLATFORM_NVIDIA__) -#include "hipcub/config.hpp" -#include + #include "hipcub/config.hpp" + #include #endif #include "hipcub/tuple.hpp" #ifndef HIPCUB_CUB_API -#define HIPCUB_WARP_THREADS_MACRO warpSize + #define HIPCUB_WARP_THREADS_MACRO warpSize #else -#define HIPCUB_WARP_THREADS_MACRO CUB_PTX_WARP_THREADS + #define HIPCUB_WARP_THREADS_MACRO CUB_PTX_WARP_THREADS #endif -namespace benchmark_utils { +namespace benchmark_utils +{ const size_t default_max_random_size = 1024 * 1024; // get_random_data() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. -template -inline auto get_random_data(size_t size, T min, T max, - size_t max_random_size = default_max_random_size) -> - typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); - using distribution_type = - typename std::conditional<(sizeof(T) == 1), short, T>::type; - std::uniform_int_distribution distribution(min, max); - std::vector data(size); - std::generate(data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); }); - for (size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), - data.begin() + i); - } - return data; +template +inline auto + get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) -> + typename std::enable_if::value, std::vector>::type +{ + std::random_device rd; + std::default_random_engine gen(rd()); + using distribution_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; + std::uniform_int_distribution distribution(min, max); + std::vector data(size); + std::generate(data.begin(), + data.begin() + std::min(size, max_random_size), + [&]() { return distribution(gen); }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + } + return data; } -template -inline auto get_random_data(size_t size, T min, T max, - size_t max_random_size = default_max_random_size) -> - typename std::enable_if::value, - std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); - std::uniform_real_distribution distribution(min, max); - std::vector data(size); - std::generate(data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); }); - for (size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), - data.begin() + i); - } - return data; +template +inline auto + get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) -> + typename std::enable_if::value, std::vector>::type +{ + std::random_device rd; + std::default_random_engine gen(rd()); + std::uniform_real_distribution distribution(min, max); + std::vector data(size); + std::generate(data.begin(), + data.begin() + std::min(size, max_random_size), + [&]() { return distribution(gen); }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + } + return data; } -template +template inline std::vector -get_random_data01(size_t size, float p, - size_t max_random_size = default_max_random_size) { - std::random_device rd; - std::default_random_engine gen(rd()); - std::bernoulli_distribution distribution(p); - std::vector data(size); - std::generate(data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); }); - for (size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(data.begin(), std::min(size - i, max_random_size), - data.begin() + i); - } - return data; + get_random_data01(size_t size, float p, size_t max_random_size = default_max_random_size) +{ + std::random_device rd; + std::default_random_engine gen(rd()); + std::bernoulli_distribution distribution(p); + std::vector data(size); + std::generate(data.begin(), + data.begin() + std::min(size, max_random_size), + [&]() { return distribution(gen); }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + } + return data; } -template inline T get_random_value(T min, T max) { - return get_random_data(1, min, max)[0]; +template +inline T get_random_value(T min, T max) +{ + return get_random_data(1, min, max)[0]; } // Can't use std::prefix_sum for inclusive/exclusive scan, because // it does not handle short[] -> int(int a, int b) { a + b; } -> int[] // they way we expect. That's because sum in std::prefix_sum's implementation // is of type typename std::iterator_traits::value_type (short) -template -OutputIt host_inclusive_scan(InputIt first, InputIt last, OutputIt d_first, - BinaryOperation op) { - using input_type = typename std::iterator_traits::value_type; - using output_type = typename std::iterator_traits::value_type; - using result_type = - typename std::conditional::value, input_type, - output_type>::type; - - if (first == last) - return d_first; - - result_type sum = *first; - *d_first = sum; - - while (++first != last) { - sum = op(sum, static_cast(*first)); - *++d_first = sum; - } - return ++d_first; +template +OutputIt host_inclusive_scan(InputIt first, InputIt last, OutputIt d_first, BinaryOperation op) +{ + using input_type = typename std::iterator_traits::value_type; + using output_type = typename std::iterator_traits::value_type; + using result_type = + typename std::conditional::value, input_type, output_type>::type; + + if(first == last) + return d_first; + + result_type sum = *first; + *d_first = sum; + + while(++first != last) + { + sum = op(sum, static_cast(*first)); + *++d_first = sum; + } + return ++d_first; } -template -OutputIt host_exclusive_scan(InputIt first, InputIt last, T initial_value, - OutputIt d_first, BinaryOperation op) { - using input_type = typename std::iterator_traits::value_type; - using output_type = typename std::iterator_traits::value_type; - using result_type = - typename std::conditional::value, input_type, - output_type>::type; - - if (first == last) - return d_first; - - result_type sum = initial_value; - *d_first = initial_value; - - while ((first + 1) != last) { - sum = op(sum, static_cast(*first)); - *++d_first = sum; - first++; - } - return ++d_first; +template +OutputIt host_exclusive_scan( + InputIt first, InputIt last, T initial_value, OutputIt d_first, BinaryOperation op) +{ + using input_type = typename std::iterator_traits::value_type; + using output_type = typename std::iterator_traits::value_type; + using result_type = + typename std::conditional::value, input_type, output_type>::type; + + if(first == last) + return d_first; + + result_type sum = initial_value; + *d_first = initial_value; + + while((first + 1) != last) + { + sum = op(sum, static_cast(*first)); + *++d_first = sum; + first++; + } + return ++d_first; } -template -OutputIt host_exclusive_scan_by_key(InputIt first, InputIt last, KeyIt k_first, - T initial_value, OutputIt d_first, +template +OutputIt host_exclusive_scan_by_key(InputIt first, + InputIt last, + KeyIt k_first, + T initial_value, + OutputIt d_first, BinaryOperation op, - KeyCompare key_compare_op) { - using input_type = typename std::iterator_traits::value_type; - using output_type = typename std::iterator_traits::value_type; - using result_type = - typename std::conditional::value, input_type, - output_type>::type; - - if (first == last) - return d_first; - - result_type sum = initial_value; - *d_first = initial_value; - - while ((first + 1) != last) { - if (key_compare_op(*k_first, *++k_first)) { - sum = op(sum, static_cast(*first)); - } else { - sum = initial_value; + KeyCompare key_compare_op) +{ + using input_type = typename std::iterator_traits::value_type; + using output_type = typename std::iterator_traits::value_type; + using result_type = + typename std::conditional::value, input_type, output_type>::type; + + if(first == last) + return d_first; + + result_type sum = initial_value; + *d_first = initial_value; + + while((first + 1) != last) + { + if(key_compare_op(*k_first, *++k_first)) + { + sum = op(sum, static_cast(*first)); + } else + { + sum = initial_value; + } + *++d_first = sum; + first++; } - *++d_first = sum; - first++; - } - return ++d_first; + return ++d_first; } -template struct custom_type { - using first_type = T; - using second_type = U; +template +struct custom_type +{ + using first_type = T; + using second_type = U; - T x; - U y; + T x; + U y; - HIPCUB_HOST_DEVICE inline constexpr custom_type() : x(T()), y(U()) {} + HIPCUB_HOST_DEVICE inline constexpr custom_type() : x(T()), y(U()) {} - HIPCUB_HOST_DEVICE inline constexpr custom_type(T xx, U yy) : x(xx), y(yy) {} + HIPCUB_HOST_DEVICE inline constexpr custom_type(T xx, U yy) : x(xx), y(yy) {} - HIPCUB_HOST_DEVICE inline constexpr custom_type(T xy) : x(xy), y(xy) {} + HIPCUB_HOST_DEVICE inline constexpr custom_type(T xy) : x(xy), y(xy) {} - template - HIPCUB_HOST_DEVICE inline custom_type(const custom_type &other) - : x(other.x), y(other.y) {} + template + HIPCUB_HOST_DEVICE inline custom_type(const custom_type& other) : x(other.x), y(other.y) + {} #ifndef HIPCUB_CUB_API - HIPCUB_HOST_DEVICE inline ~custom_type() = default; + HIPCUB_HOST_DEVICE inline ~custom_type() = default; #endif - HIPCUB_HOST_DEVICE inline custom_type &operator=(const custom_type &other) { - x = other.x; - y = other.y; - return *this; - } - - HIPCUB_HOST_DEVICE inline custom_type - operator+(const custom_type &rhs) const { - return custom_type(x + rhs.x, y + rhs.y); - } - - HIPCUB_HOST_DEVICE inline custom_type - operator-(const custom_type &other) const { - return custom_type(x - other.x, y - other.y); - } - - HIPCUB_HOST_DEVICE inline bool operator<(const custom_type &rhs) const { - // intentionally suboptimal choice for short-circuting, - // required to generate more performant device code - return ((x == rhs.x && y < rhs.y) || x < rhs.x); - } - - HIPCUB_HOST_DEVICE inline bool operator>(const custom_type &other) const { - return (x > other.x || (x == other.x && y > other.y)); - } - - HIPCUB_HOST_DEVICE inline bool operator==(const custom_type &rhs) const { - return x == rhs.x && y == rhs.y; - } - - HIPCUB_HOST_DEVICE inline bool operator!=(const custom_type &other) const { - return !(*this == other); - } - - HIPCUB_HOST_DEVICE custom_type &operator+=(const custom_type &rhs) { - this->x += rhs.x; - this->y += rhs.y; - return *this; - } + HIPCUB_HOST_DEVICE inline custom_type& operator=(const custom_type& other) + { + x = other.x; + y = other.y; + return *this; + } + + HIPCUB_HOST_DEVICE inline custom_type operator+(const custom_type& rhs) const + { + return custom_type(x + rhs.x, y + rhs.y); + } + + HIPCUB_HOST_DEVICE inline custom_type operator-(const custom_type& other) const + { + return custom_type(x - other.x, y - other.y); + } + + HIPCUB_HOST_DEVICE inline bool operator<(const custom_type& rhs) const + { + // intentionally suboptimal choice for short-circuting, + // required to generate more performant device code + return ((x == rhs.x && y < rhs.y) || x < rhs.x); + } + + HIPCUB_HOST_DEVICE inline bool operator>(const custom_type& other) const + { + return (x > other.x || (x == other.x && y > other.y)); + } + + HIPCUB_HOST_DEVICE inline bool operator==(const custom_type& rhs) const + { + return x == rhs.x && y == rhs.y; + } + + HIPCUB_HOST_DEVICE inline bool operator!=(const custom_type& other) const + { + return !(*this == other); + } + + HIPCUB_HOST_DEVICE custom_type& operator+=(const custom_type& rhs) + { + this->x += rhs.x; + this->y += rhs.y; + return *this; + } }; -template struct is_custom_type : std::false_type {}; +template +struct is_custom_type : std::false_type +{}; -template -struct is_custom_type> : std::true_type {}; +template +struct is_custom_type> : std::true_type +{}; -template struct custom_type_decomposer { - static_assert(is_custom_type::value, - "custom_type_decomposer can only be used with instantiations " - "of custom_type"); +template +struct custom_type_decomposer +{ + static_assert(is_custom_type::value, + "custom_type_decomposer can only be used with instantiations " + "of custom_type"); - using T = typename CustomType::first_type; - using U = typename CustomType::second_type; + using T = typename CustomType::first_type; + using U = typename CustomType::second_type; - HIPCUB_HOST_DEVICE ::hipcub::tuple - operator()(CustomType &key) const { - return ::hipcub::tuple{key.x, key.y}; - } + HIPCUB_HOST_DEVICE ::hipcub::tuple operator()(CustomType& key) const + { + return ::hipcub::tuple{key.x, key.y}; + } }; -template -inline auto get_random_data(size_t size, T min, T max, - size_t max_random_size = 1024 * 1024) -> - typename std::enable_if::value, std::vector>::type { - using first_type = typename T::first_type; - using second_type = typename T::second_type; - std::vector data(size); - auto fdata = get_random_data(size, min.x, max.x, max_random_size); - auto sdata = - get_random_data(size, min.y, max.y, max_random_size); - for (size_t i = 0; i < size; i++) { - data[i] = T(fdata[i], sdata[i]); - } - return data; +template +inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> + typename std::enable_if::value, std::vector>::type +{ + using first_type = typename T::first_type; + using second_type = typename T::second_type; + std::vector data(size); + auto fdata = get_random_data(size, min.x, max.x, max_random_size); + auto sdata = get_random_data(size, min.y, max.y, max_random_size); + for(size_t i = 0; i < size; i++) + { + data[i] = T(fdata[i], sdata[i]); + } + return data; } -template -inline auto get_random_data(size_t size, T min, T max, - size_t max_random_size = 1024 * 1024) -> - typename std::enable_if::value && - !std::is_same::value, - std::vector>::type { - - using field_type = decltype(max.x); - std::vector data(size); - auto field_data = - get_random_data(size, min.x, max.x, max_random_size); - for (size_t i = 0; i < size; i++) { - data[i] = T(field_data[i]); - } - return data; +template +inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> + typename std::enable_if::value + && !std::is_same::value, + std::vector>::type +{ + + using field_type = decltype(max.x); + std::vector data(size); + auto field_data = get_random_data(size, min.x, max.x, max_random_size); + for(size_t i = 0; i < size; i++) + { + data[i] = T(field_data[i]); + } + return data; } -template -std::vector get_random_segments(const size_t size, - const size_t max_segment_length, - const int seed_value) { - static_assert(std::is_arithmetic::value, "Key type must be arithmetic"); - - std::default_random_engine prng(seed_value); - std::uniform_int_distribution segment_length_distribution( - max_segment_length); - using key_distribution_type = - std::conditional_t::value, - std::uniform_int_distribution, - std::uniform_real_distribution>; - key_distribution_type key_distribution(std::numeric_limits::max()); - std::vector keys(size); - - size_t keys_start_index = 0; - while (keys_start_index < size) { - const size_t new_segment_length = segment_length_distribution(prng); - const size_t new_segment_end = - std::min(size, keys_start_index + new_segment_length); - const T key = key_distribution(prng); - std::fill(std::next(keys.begin(), keys_start_index), - std::next(keys.begin(), new_segment_end), key); - keys_start_index += new_segment_length; - } - return keys; +template +std::vector + get_random_segments(const size_t size, const size_t max_segment_length, const int seed_value) +{ + static_assert(std::is_arithmetic::value, "Key type must be arithmetic"); + + std::default_random_engine prng(seed_value); + std::uniform_int_distribution segment_length_distribution(max_segment_length); + using key_distribution_type = std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution>; + key_distribution_type key_distribution(std::numeric_limits::max()); + std::vector keys(size); + + size_t keys_start_index = 0; + while(keys_start_index < size) + { + const size_t new_segment_length = segment_length_distribution(prng); + const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); + const T key = key_distribution(prng); + std::fill(std::next(keys.begin(), keys_start_index), + std::next(keys.begin(), new_segment_end), + key); + keys_start_index += new_segment_length; + } + return keys; } -bool is_warp_size_supported(const unsigned required_warp_size) { - return HIPCUB_HOST_WARP_THREADS >= required_warp_size; +bool is_warp_size_supported(const unsigned required_warp_size) +{ + return HIPCUB_HOST_WARP_THREADS >= required_warp_size; } -template -__device__ constexpr bool device_test_enabled_for_warp_size_v = - HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize; +template +__device__ constexpr bool device_test_enabled_for_warp_size_v + = HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize; -template +template using it_value_t = typename std::iterator_traits::value_type; using engine_type = std::default_random_engine; // generate_random_data_n() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. -template -inline auto generate_random_data_n(OutputIter it, size_t size, U min, V max, - Generator &gen, - size_t max_random_size = 1024 * 1024) -> - typename std::enable_if_t>::value, - OutputIter> { - using T = it_value_t; - - using dis_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; - std::uniform_int_distribution distribution((T)min, (T)max); - std::generate_n(it, std::min(size, max_random_size), - [&]() { return distribution(gen); }); - for (size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(it, std::min(size - i, max_random_size), it + i); - } - return it + size; +template +inline auto generate_random_data_n( + OutputIter it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) + -> typename std::enable_if_t>::value, OutputIter> +{ + using T = it_value_t; + + using dis_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; + std::uniform_int_distribution distribution((T)min, (T)max); + std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(it, std::min(size - i, max_random_size), it + i); + } + return it + size; } -template -inline auto generate_random_data_n(OutputIterator it, size_t size, U min, V max, - Generator &gen, - size_t max_random_size = 1024 * 1024) - -> std::enable_if_t< - std::is_floating_point>::value, - OutputIterator> { - using T = typename std::iterator_traits::value_type; - - std::uniform_real_distribution distribution((T)min, (T)max); - std::generate_n(it, std::min(size, max_random_size), - [&]() { return distribution(gen); }); - for (size_t i = max_random_size; i < size; i += max_random_size) { - std::copy_n(it, std::min(size - i, max_random_size), it + i); - } - return it + size; +template +inline auto generate_random_data_n(OutputIterator it, + size_t size, + U min, + V max, + Generator& gen, + size_t max_random_size = 1024 * 1024) + -> std::enable_if_t>::value, OutputIterator> +{ + using T = typename std::iterator_traits::value_type; + + std::uniform_real_distribution distribution((T)min, (T)max); + std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(it, std::min(size - i, max_random_size), it + i); + } + return it + size; } -template -struct alignas(Alignment) custom_aligned_type { - unsigned char data[Size]; +template +struct alignas(Alignment) custom_aligned_type +{ + unsigned char data[Size]; }; -template < - typename T, typename U, - std::enable_if_t::value && std::is_unsigned::value, - int> = 0> -inline constexpr auto ceiling_div(const T a, const U b) { - return a / b + (a % b > 0 ? 1 : 0); +template::value && std::is_unsigned::value, int> = 0> +inline constexpr auto ceiling_div(const T a, const U b) +{ + return a / b + (a % b > 0 ? 1 : 0); } } // namespace benchmark_utils // Need for hipcub::DeviceReduce::Min/Max etc. -namespace std { -template <> class numeric_limits> { - using T = typename benchmark_utils::custom_type; +namespace std +{ +template<> +class numeric_limits> +{ + using T = typename benchmark_utils::custom_type; public: - static constexpr inline T min() { - return std::numeric_limits::min(); - } + static constexpr inline T min() + { + return std::numeric_limits::min(); + } - static constexpr inline T max() { - return std::numeric_limits::max(); - } + static constexpr inline T max() + { + return std::numeric_limits::max(); + } - static constexpr inline T lowest() { - return std::numeric_limits::lowest(); - } + static constexpr inline T lowest() + { + return std::numeric_limits::lowest(); + } }; -template <> class numeric_limits> { - using T = typename benchmark_utils::custom_type; +template<> +class numeric_limits> +{ + using T = typename benchmark_utils::custom_type; public: - static constexpr inline T min() { - return std::numeric_limits::min(); - } + static constexpr inline T min() + { + return std::numeric_limits::min(); + } - static constexpr inline T max() { - return std::numeric_limits::max(); - } + static constexpr inline T max() + { + return std::numeric_limits::max(); + } - static constexpr inline T lowest() { - return std::numeric_limits::lowest(); - } + static constexpr inline T lowest() + { + return std::numeric_limits::lowest(); + } }; } // namespace std diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp index 3c649c3c..e3db1f81 100644 --- a/benchmark/benchmark_warp_exchange.cpp +++ b/benchmark/benchmark_warp_exchange.cpp @@ -31,303 +31,327 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__device__ auto warp_exchange_benchmark(T *d_output) -> std::enable_if_t< - benchmark_utils::device_test_enabled_for_warp_size_v> { - T thread_data[ItemsPerThread]; +template +__device__ auto warp_exchange_benchmark(T* d_output) + -> std::enable_if_t> +{ + T thread_data[ItemsPerThread]; #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) { - thread_data[i] = static_cast(i); - } + for(unsigned i = 0; i < ItemsPerThread; ++i) + { + thread_data[i] = static_cast(i); + } - using WarpExchangeT = - ::hipcub::WarpExchange; - constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; - __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; - const unsigned warp_id = threadIdx.x / LogicalWarpSize; + using WarpExchangeT = ::hipcub::WarpExchange; + constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; + __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; + const unsigned warp_id = threadIdx.x / LogicalWarpSize; - WarpExchangeT warp_exchange(temp_storage[warp_id]); - Op{}(warp_exchange, thread_data); + WarpExchangeT warp_exchange(temp_storage[warp_id]); + Op{}(warp_exchange, thread_data); #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) { - const unsigned global_idx = - (BlockSize * blockIdx.x + threadIdx.x) * ItemsPerThread + i; - d_output[global_idx] = thread_data[i]; - } + for(unsigned i = 0; i < ItemsPerThread; ++i) + { + const unsigned global_idx = (BlockSize * blockIdx.x + threadIdx.x) * ItemsPerThread + i; + d_output[global_idx] = thread_data[i]; + } } -template -__device__ auto warp_exchange_benchmark(T * /*d_output*/) -> std::enable_if_t< - !benchmark_utils::device_test_enabled_for_warp_size_v> {} - -template -__global__ __launch_bounds__(BlockSize) void warp_exchange_kernel(T *d_output) { - warp_exchange_benchmark(d_output); +template +__device__ auto warp_exchange_benchmark(T* /*d_output*/) + -> std::enable_if_t> +{} + +template +__global__ __launch_bounds__(BlockSize) void warp_exchange_kernel(T* d_output) +{ + warp_exchange_benchmark(d_output); } -template -__device__ auto warp_exchange_scatter_to_striped_benchmark(T *d_output) - -> std::enable_if_t< - benchmark_utils::device_test_enabled_for_warp_size_v> { - const unsigned warp_id = threadIdx.x / LogicalWarpSize; - T thread_data[ItemsPerThread]; - OffsetT thread_ranks[ItemsPerThread]; +template +__device__ auto warp_exchange_scatter_to_striped_benchmark(T* d_output) + -> std::enable_if_t> +{ + const unsigned warp_id = threadIdx.x / LogicalWarpSize; + T thread_data[ItemsPerThread]; + OffsetT thread_ranks[ItemsPerThread]; #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) { - thread_data[i] = static_cast(i); - thread_ranks[i] = static_cast(LogicalWarpSize - - warp_id * ItemsPerThread - i - 1); - } + for(unsigned i = 0; i < ItemsPerThread; ++i) + { + thread_data[i] = static_cast(i); + thread_ranks[i] = static_cast(LogicalWarpSize - warp_id * ItemsPerThread - i - 1); + } - using WarpExchangeT = - ::hipcub::WarpExchange; - constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; - __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; + using WarpExchangeT = ::hipcub::WarpExchange; + constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; + __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; - WarpExchangeT(temp_storage[warp_id]) - .ScatterToStriped(thread_data, thread_ranks); + WarpExchangeT(temp_storage[warp_id]).ScatterToStriped(thread_data, thread_ranks); #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) { - const unsigned striped_global_idx = - BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x; - d_output[striped_global_idx] = thread_data[i]; - } + for(unsigned i = 0; i < ItemsPerThread; ++i) + { + const unsigned striped_global_idx + = BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x; + d_output[striped_global_idx] = thread_data[i]; + } } -template -__device__ auto warp_exchange_scatter_to_striped_benchmark(T * /*d_output*/) - -> std::enable_if_t> {} - -template -__global__ __launch_bounds__( - BlockSize) void warp_exchange_scatter_to_striped_kernel(T *d_output) { - warp_exchange_scatter_to_striped_benchmark(d_output); +template +__device__ auto warp_exchange_scatter_to_striped_benchmark(T* /*d_output*/) + -> std::enable_if_t> +{} + +template +__global__ __launch_bounds__(BlockSize) void warp_exchange_scatter_to_striped_kernel(T* d_output) +{ + warp_exchange_scatter_to_striped_benchmark( + d_output); } -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { - constexpr unsigned trials = 100; - constexpr unsigned items_per_block = BlockSize * ItemsPerThread; - const unsigned size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - - T *d_output; - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < trials; ++i) { - warp_exchange_kernel - <<>>( - d_output); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + constexpr unsigned trials = 100; + constexpr unsigned items_per_block = BlockSize * ItemsPerThread; + const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); + + T* d_output; + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < trials; ++i) + { + warp_exchange_kernel + <<>>(d_output); + } + + HIP_CHECK(hipPeekAtLastError()) + HIP_CHECK(hipDeviceSynchronize()); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); } + state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * trials * size); - HIP_CHECK(hipPeekAtLastError()) - HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * trials * size); - - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_output)); } -template -void run_benchmark_scatter_to_striped(benchmark::State &state, - hipStream_t stream, size_t N) { - constexpr unsigned trials = 100; - constexpr unsigned items_per_block = BlockSize * ItemsPerThread; - const unsigned size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - - T *d_output; - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < trials; ++i) { - warp_exchange_scatter_to_striped_kernel - <<>>( - d_output); +template +void run_benchmark_scatter_to_striped(benchmark::State& state, hipStream_t stream, size_t N) +{ + constexpr unsigned trials = 100; + constexpr unsigned items_per_block = BlockSize * ItemsPerThread; + const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); + + T* d_output; + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < trials; ++i) + { + warp_exchange_scatter_to_striped_kernel + <<>>(d_output); + } + + HIP_CHECK(hipPeekAtLastError()) + HIP_CHECK(hipDeviceSynchronize()); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); } + state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * trials * size); - HIP_CHECK(hipPeekAtLastError()) - HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * trials * size); - - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_output)); } -struct StripedToBlockedOp { - template - __device__ void operator()(WarpExchangeT &warp_exchange, - T (&thread_data)[ItemsPerThread]) const { - warp_exchange.StripedToBlocked(thread_data, thread_data); - } +struct StripedToBlockedOp +{ + template + __device__ void operator()(WarpExchangeT& warp_exchange, T (&thread_data)[ItemsPerThread]) const + { + warp_exchange.StripedToBlocked(thread_data, thread_data); + } }; -struct BlockedToStripedOp { - template - __device__ void operator()(WarpExchangeT &warp_exchange, - T (&thread_data)[ItemsPerThread]) const { - warp_exchange.BlockedToStriped(thread_data, thread_data); - } +struct BlockedToStripedOp +{ + template + __device__ void operator()(WarpExchangeT& warp_exchange, T (&thread_data)[ItemsPerThread]) const + { + warp_exchange.BlockedToStriped(thread_data, thread_data); + } }; -#define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark( \ - std::string("warp_exchange_striped_to_blocked.") \ - .c_str(), \ - &run_benchmark, \ - stream, size) - -#define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark( \ - std::string("warp_exchange_blocked_to_striped.") \ - .c_str(), \ - &run_benchmark, \ - stream, size) - -#define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \ - benchmark::RegisterBenchmark( \ - std::string("warp_exchange_scatter_to_striped.") \ - .c_str(), \ - &run_benchmark_scatter_to_striped, stream, \ - size) - -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_warp_exchange" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks{ - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 16), - CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 32), - CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 32), - - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, - WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, - WARP_EXCHANGE_SHUFFLE), +#define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark(std::string("warp_exchange_striped_to_blocked.") \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +#define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark(std::string("warp_exchange_blocked_to_striped.") \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +#define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \ + benchmark::RegisterBenchmark(std::string("warp_exchange_scatter_to_striped.") \ + .c_str(), \ + &run_benchmark_scatter_to_striped, \ + stream, \ + size) + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_warp_exchange" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks{ + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 16), + CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 32), + CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 32), + + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, WARP_EXCHANGE_SHUFFLE), // CUB requires WS == IPT for WARP_EXCHANGE_SHUFFLE #ifdef HIPCUB_ROCPRIM_API - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, - WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, - WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, - WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, - WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, - WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, - WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, WARP_EXCHANGE_SHUFFLE), #endif - }; + }; #ifdef HIPCUB_ROCPRIM_API - if (::benchmark_utils::is_warp_size_supported(64)) { - std::vector additional_benchmarks{ - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, - WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, - WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, - WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, - WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 64), - - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, - WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, - WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, - WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, - WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 64)}; - benchmarks.insert(benchmarks.end(), additional_benchmarks.begin(), - additional_benchmarks.end()); - } + if(::benchmark_utils::is_warp_size_supported(64)) + { + std::vector additional_benchmarks{ + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 64), + + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 64)}; + benchmarks.insert(benchmarks.end(), + additional_benchmarks.begin(), + additional_benchmarks.end()); + } #endif - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_warp_load.cpp b/benchmark/benchmark_warp_load.cpp index 3479f83f..ab7057da 100644 --- a/benchmark/benchmark_warp_load.cpp +++ b/benchmark/benchmark_warp_load.cpp @@ -31,217 +31,235 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__device__ auto warp_load_benchmark(T *d_input, T *d_output) - -> std::enable_if_t< - benchmark_utils::device_test_enabled_for_warp_size_v> { - using WarpLoadT = - ::hipcub::WarpLoad; - constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; - constexpr int tile_size = ItemsPerThread * LogicalWarpSize; - - const unsigned warp_id = threadIdx.x / LogicalWarpSize; - const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id; - __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; - T thread_data[ItemsPerThread]; - - WarpLoadT(temp_storage[warp_id]) - .Load(d_input + global_warp_id * tile_size, thread_data); +template +__device__ auto warp_load_benchmark(T* d_input, T* d_output) + -> std::enable_if_t> +{ + using WarpLoadT = ::hipcub::WarpLoad; + constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; + constexpr int tile_size = ItemsPerThread * LogicalWarpSize; + + const unsigned warp_id = threadIdx.x / LogicalWarpSize; + const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id; + __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; + T thread_data[ItemsPerThread]; + + WarpLoadT(temp_storage[warp_id]).Load(d_input + global_warp_id * tile_size, thread_data); #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) { - const unsigned striped_global_idx = - BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x; - d_output[striped_global_idx] = thread_data[i]; - } + for(unsigned i = 0; i < ItemsPerThread; ++i) + { + const unsigned striped_global_idx + = BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x; + d_output[striped_global_idx] = thread_data[i]; + } } -template -__device__ auto -warp_load_benchmark(T * /*d_input*/, T * /*d_output*/) -> std::enable_if_t< - !benchmark_utils::device_test_enabled_for_warp_size_v> {} - -template -__global__ __launch_bounds__(BlockSize) void warp_load_kernel(T *d_input, - T *d_output) { - warp_load_benchmark( - d_input, d_output); +template +__device__ auto warp_load_benchmark(T* /*d_input*/, T* /*d_output*/) + -> std::enable_if_t> +{} + +template +__global__ __launch_bounds__(BlockSize) void warp_load_kernel(T* d_input, T* d_output) +{ + warp_load_benchmark(d_input, d_output); } -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { - constexpr unsigned items_per_block = BlockSize * ItemsPerThread; - const unsigned size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - - std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < Trials; i++) { - warp_load_kernel - <<>>( - d_input, d_output); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + constexpr unsigned items_per_block = BlockSize * ItemsPerThread; + const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); + + std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < Trials; i++) + { + warp_load_kernel + <<>>(d_input, d_output); + } + HIP_CHECK(hipPeekAtLastError()) + HIP_CHECK(hipDeviceSynchronize()); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); } - HIP_CHECK(hipPeekAtLastError()) - HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark( \ - "warp_load.", \ - &run_benchmark, stream, size) - -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks{ - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_VECTORIZE), - // WARP_LOAD_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_VECTORIZE) - // WARP_LOAD_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_TRANSPOSE) - }; - - if (::benchmark_utils::is_warp_size_supported(64)) { - std::vector additional_benchmarks{ - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_VECTORIZE), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_VECTORIZE), - // WARP_LOAD_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_VECTORIZE), +#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark("warp_load.", \ + &run_benchmark, \ + stream, \ + size) + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks{ + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_VECTORIZE), // WARP_LOAD_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_DIRECT), - CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_STRIPED), - CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_VECTORIZE) + // CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_VECTORIZE) // WARP_LOAD_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_TRANSPOSE) + // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_TRANSPOSE) }; - benchmarks.insert(benchmarks.end(), additional_benchmarks.begin(), - additional_benchmarks.end()); - } - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); + + if(::benchmark_utils::is_warp_size_supported(64)) + { + std::vector additional_benchmarks{ + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_VECTORIZE), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_VECTORIZE), + // WARP_LOAD_TRANSPOSE removed because of shared memory limit + // CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_VECTORIZE), + // WARP_LOAD_TRANSPOSE removed because of shared memory limit + // CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_DIRECT), + CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_STRIPED), + CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_VECTORIZE) + // WARP_LOAD_TRANSPOSE removed because of shared memory limit + // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_TRANSPOSE) + }; + benchmarks.insert(benchmarks.end(), + additional_benchmarks.begin(), + additional_benchmarks.end()); + } + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_warp_merge_sort.cpp b/benchmark/benchmark_warp_merge_sort.cpp index e29d14be..4351e679 100644 --- a/benchmark/benchmark_warp_merge_sort.cpp +++ b/benchmark/benchmark_warp_merge_sort.cpp @@ -35,442 +35,538 @@ constexpr size_t DEFAULT_N = 1024 * 1024 * 128; #endif -enum class benchmark_kinds { - sort_keys, - sort_pairs, +enum class benchmark_kinds +{ + sort_keys, + sort_pairs, }; -template -__device__ auto sort_keys_benchmark(const T *input, T *output, - Compare compare_op) - -> std::enable_if_t< - benchmark_utils::device_test_enabled_for_warp_size_v> { - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; +template +__device__ auto sort_keys_benchmark(const T* input, T* output, Compare compare_op) + -> std::enable_if_t> +{ + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int flat_tid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * items_per_block; - T keys[ItemsPerThread]; - hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); + const unsigned int flat_tid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * items_per_block; + T keys[ItemsPerThread]; + hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); - constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; - const unsigned int warp_id = threadIdx.x / LogicalWarpSize; + constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; + const unsigned int warp_id = threadIdx.x / LogicalWarpSize; - using warp_merge_sort = - hipcub::WarpMergeSort; - __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; + using warp_merge_sort = hipcub::WarpMergeSort; + __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; - warp_merge_sort wsort{storage[warp_id]}; - wsort.Sort(keys, compare_op); + warp_merge_sort wsort{storage[warp_id]}; + wsort.Sort(keys, compare_op); - hipcub::StoreDirectBlocked(flat_tid, output + block_offset, keys); + hipcub::StoreDirectBlocked(flat_tid, output + block_offset, keys); } -template -__device__ auto sort_keys_benchmark(const T * /*input*/, T * /*output*/, - Compare /*compare_op*/) - -> std::enable_if_t> {} - -template -__global__ __launch_bounds__(BlockSize) void sort_keys(const T *input, - T *output, - Compare compare_op) { - sort_keys_benchmark(input, output, - compare_op); +template +__device__ auto sort_keys_benchmark(const T* /*input*/, T* /*output*/, Compare /*compare_op*/) + -> std::enable_if_t> +{} + +template +__global__ + __launch_bounds__(BlockSize) void sort_keys(const T* input, T* output, Compare compare_op) +{ + sort_keys_benchmark(input, output, compare_op); } -template -__device__ auto sort_pairs_benchmark(const T *input, T *output, - Compare compare_op) - -> std::enable_if_t< - benchmark_utils::device_test_enabled_for_warp_size_v> { - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - - const unsigned int flat_tid = threadIdx.x; - const unsigned int block_offset = blockIdx.x * items_per_block; - T keys[ItemsPerThread]; - T values[ItemsPerThread]; - hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); - - for (unsigned int i = 0; i < ItemsPerThread; ++i) { - values[i] = keys[i] + T(1); - } +template +__device__ auto sort_pairs_benchmark(const T* input, T* output, Compare compare_op) + -> std::enable_if_t> +{ + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + + const unsigned int flat_tid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * items_per_block; + T keys[ItemsPerThread]; + T values[ItemsPerThread]; + hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); + + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + values[i] = keys[i] + T(1); + } - constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; - const unsigned int warp_id = threadIdx.x / LogicalWarpSize; + constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; + const unsigned int warp_id = threadIdx.x / LogicalWarpSize; - using warp_merge_sort = - hipcub::WarpMergeSort; - __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; + using warp_merge_sort = hipcub::WarpMergeSort; + __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; - warp_merge_sort wsort{storage[warp_id]}; - wsort.Sort(keys, values, compare_op); + warp_merge_sort wsort{storage[warp_id]}; + wsort.Sort(keys, values, compare_op); - for (unsigned int i = 0; i < ItemsPerThread; ++i) { - keys[i] += values[i]; - } + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + keys[i] += values[i]; + } - hipcub::StoreDirectBlocked(flat_tid, output + block_offset, keys); + hipcub::StoreDirectBlocked(flat_tid, output + block_offset, keys); } -template -__device__ auto sort_pairs_benchmark(const T * /*input*/, T * /*output*/, - Compare /*compare_op*/) - -> std::enable_if_t> {} - -template -__global__ __launch_bounds__(BlockSize) void sort_pairs(const T *input, - T *output, - Compare compare_op) { - sort_pairs_benchmark( - input, output, compare_op); +template +__device__ auto sort_pairs_benchmark(const T* /*input*/, T* /*output*/, Compare /*compare_op*/) + -> std::enable_if_t> +{} + +template +__global__ + __launch_bounds__(BlockSize) void sort_pairs(const T* input, T* output, Compare compare_op) +{ + sort_pairs_benchmark(input, output, compare_op); } -template struct max_value { - static constexpr T value = std::numeric_limits::max(); +template +struct max_value +{ + static constexpr T value = std::numeric_limits::max(); }; -template -__device__ auto sort_keys_segmented_benchmark(const T *input, T *output, - const unsigned int *segment_sizes, - Compare compare) - -> std::enable_if_t< - benchmark_utils::device_test_enabled_for_warp_size_v> { - constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; - constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; +template +__device__ auto sort_keys_segmented_benchmark(const T* input, + T* output, + const unsigned int* segment_sizes, + Compare compare) + -> std::enable_if_t> +{ + constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; + constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; - using warp_merge_sort = - hipcub::WarpMergeSort; - __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; + using warp_merge_sort = hipcub::WarpMergeSort; + __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; - const unsigned int warp_id = threadIdx.x / LogicalWarpSize; - warp_merge_sort wsort{storage[warp_id]}; + const unsigned int warp_id = threadIdx.x / LogicalWarpSize; + warp_merge_sort wsort{storage[warp_id]}; - const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id; + const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id; - const unsigned int segment_size = segment_sizes[segment_id]; - const unsigned int warp_offset = segment_id * max_segment_size; - T keys[ItemsPerThread]; + const unsigned int segment_size = segment_sizes[segment_id]; + const unsigned int warp_offset = segment_id * max_segment_size; + T keys[ItemsPerThread]; - const unsigned int flat_tid = wsort.get_linear_tid(); - hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); + const unsigned int flat_tid = wsort.get_linear_tid(); + hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); - const T oob_default = max_value::value; - wsort.Sort(keys, compare, segment_size, oob_default); + const T oob_default = max_value::value; + wsort.Sort(keys, compare, segment_size, oob_default); - hipcub::StoreDirectBlocked(flat_tid, output + warp_offset, keys, - segment_size); + hipcub::StoreDirectBlocked(flat_tid, output + warp_offset, keys, segment_size); } -template -__device__ auto -sort_keys_segmented_benchmark(const T * /*input*/, T * /*output*/, - const unsigned int * /*segment_sizes*/, - Compare /*compare*/) - -> std::enable_if_t> {} - -template -__global__ __launch_bounds__(BlockSize) void sort_keys_segmented( - const T *input, T *output, const unsigned int *segment_sizes, - Compare compare) { - sort_keys_segmented_benchmark( - input, output, segment_sizes, compare); +template +__device__ auto sort_keys_segmented_benchmark(const T* /*input*/, + T* /*output*/, + const unsigned int* /*segment_sizes*/, + Compare /*compare*/) + -> std::enable_if_t> +{} + +template +__global__ __launch_bounds__(BlockSize) void sort_keys_segmented(const T* input, + T* output, + const unsigned int* segment_sizes, + Compare compare) +{ + sort_keys_segmented_benchmark(input, + output, + segment_sizes, + compare); } -template -__device__ auto -sort_pairs_segmented_benchmark(const T *input, T *output, - const unsigned int *segment_sizes, - Compare compare) - -> std::enable_if_t< - benchmark_utils::device_test_enabled_for_warp_size_v> { - constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; - constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; - - using warp_merge_sort = - hipcub::WarpMergeSort; - __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; - - const unsigned int warp_id = threadIdx.x / LogicalWarpSize; - warp_merge_sort wsort{storage[warp_id]}; - - const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id; - - const unsigned int segment_size = segment_sizes[segment_id]; - const unsigned int warp_offset = segment_id * max_segment_size; - T keys[ItemsPerThread]; - T values[ItemsPerThread]; - - const unsigned int flat_tid = wsort.get_linear_tid(); - hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); - - for (unsigned int i = 0; i < ItemsPerThread; ++i) { - if (flat_tid * ItemsPerThread + i < segment_size) { - values[i] = keys[i] + T(1); +template +__device__ auto sort_pairs_segmented_benchmark(const T* input, + T* output, + const unsigned int* segment_sizes, + Compare compare) + -> std::enable_if_t> +{ + constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; + constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; + + using warp_merge_sort = hipcub::WarpMergeSort; + __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; + + const unsigned int warp_id = threadIdx.x / LogicalWarpSize; + warp_merge_sort wsort{storage[warp_id]}; + + const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id; + + const unsigned int segment_size = segment_sizes[segment_id]; + const unsigned int warp_offset = segment_id * max_segment_size; + T keys[ItemsPerThread]; + T values[ItemsPerThread]; + + const unsigned int flat_tid = wsort.get_linear_tid(); + hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); + + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + if(flat_tid * ItemsPerThread + i < segment_size) + { + values[i] = keys[i] + T(1); + } } - } - const T oob_default = max_value::value; - wsort.Sort(keys, values, compare, segment_size, oob_default); + const T oob_default = max_value::value; + wsort.Sort(keys, values, compare, segment_size, oob_default); - for (unsigned int i = 0; i < ItemsPerThread; ++i) { - if (flat_tid * ItemsPerThread + i < segment_size) { - keys[i] += values[i]; + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + if(flat_tid * ItemsPerThread + i < segment_size) + { + keys[i] += values[i]; + } } - } - hipcub::StoreDirectBlocked(flat_tid, output + warp_offset, keys, - segment_size); + hipcub::StoreDirectBlocked(flat_tid, output + warp_offset, keys, segment_size); } -template -__device__ auto -sort_pairs_segmented_benchmark(const T * /*input*/, T * /*output*/, - const unsigned int * /*segment_sizes*/, - Compare /*compare*/) - -> std::enable_if_t> {} - -template -__global__ __launch_bounds__(BlockSize) void sort_pairs_segmented( - const T *input, T *output, const unsigned int *segment_sizes, - Compare compare) { - sort_pairs_segmented_benchmark( - input, output, segment_sizes, compare); +template +__device__ auto sort_pairs_segmented_benchmark(const T* /*input*/, + T* /*output*/, + const unsigned int* /*segment_sizes*/, + Compare /*compare*/) + -> std::enable_if_t> +{} + +template +__global__ __launch_bounds__(BlockSize) void sort_pairs_segmented(const T* input, + T* output, + const unsigned int* segment_sizes, + Compare compare) +{ + sort_pairs_segmented_benchmark(input, + output, + segment_sizes, + compare); } -template -void run_benchmark(benchmark::State &state, +template +void run_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind, - const hipStream_t stream, const size_t N) { - constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = - items_per_block * ((N + items_per_block - 1) / items_per_block); - - const auto input = - std::is_floating_point::value - ? benchmark_utils::get_random_data(size, static_cast(-1000), - static_cast(1000)) - : benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - - T *d_input = nullptr; - T *d_output = nullptr; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - if (benchmark_kind == benchmark_kinds::sort_keys) { - for (unsigned int i = 0; i < Trials; ++i) { - sort_keys - <<>>( - d_input, d_output, CompareOp{}); - } - } else if (benchmark_kind == benchmark_kinds::sort_pairs) { - for (unsigned int i = 0; i < Trials; ++i) { - sort_pairs - <<>>( - d_input, d_output, CompareOp{}); - } + const hipStream_t stream, + const size_t N) +{ + constexpr auto items_per_block = BlockSize * ItemsPerThread; + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); + + const auto input = std::is_floating_point::value + ? benchmark_utils::get_random_data(size, + static_cast(-1000), + static_cast(1000)) + : benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); + + T* d_input = nullptr; + T* d_output = nullptr; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + if(benchmark_kind == benchmark_kinds::sort_keys) + { + for(unsigned int i = 0; i < Trials; ++i) + { + sort_keys + <<>>(d_input, + d_output, + CompareOp{}); + } + } else if(benchmark_kind == benchmark_kinds::sort_pairs) + { + for(unsigned int i = 0; i < Trials; ++i) + { + sort_pairs + <<>>(d_input, + d_output, + CompareOp{}); + } + } + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); } - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -template -void run_segmented_benchmark(benchmark::State &state, +template +void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind, - const hipStream_t stream, const size_t N) { - constexpr auto max_segment_size = LogicalWarpSize * ItemsPerThread; - constexpr auto segments_per_block = BlockSize / LogicalWarpSize; - constexpr auto items_per_block = BlockSize * ItemsPerThread; - - const auto num_blocks = (N + items_per_block - 1) / items_per_block; - const auto num_segments = num_blocks * segments_per_block; - const auto size = num_blocks * items_per_block; - - const auto input = - std::is_floating_point::value - ? benchmark_utils::get_random_data(size, static_cast(-1000), - static_cast(1000)) - : benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - - const auto segment_sizes = benchmark_utils::get_random_data( - num_segments, 0, max_segment_size); - - T *d_input = nullptr; - T *d_output = nullptr; - unsigned int *d_segment_sizes = nullptr; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); - HIP_CHECK( - hipMalloc(&d_segment_sizes, num_segments * sizeof(segment_sizes[0]))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_segment_sizes, segment_sizes.data(), - num_segments * sizeof(segment_sizes[0]), - hipMemcpyHostToDevice)); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - - if (benchmark_kind == benchmark_kinds::sort_keys) { - for (unsigned int i = 0; i < Trials; ++i) { - sort_keys_segmented - <<>>( - d_input, d_output, d_segment_sizes, CompareOp{}); - } - } else if (benchmark_kind == benchmark_kinds::sort_pairs) { - for (unsigned int i = 0; i < Trials; ++i) { - sort_pairs_segmented - <<>>( - d_input, d_output, d_segment_sizes, CompareOp{}); - } + const hipStream_t stream, + const size_t N) +{ + constexpr auto max_segment_size = LogicalWarpSize * ItemsPerThread; + constexpr auto segments_per_block = BlockSize / LogicalWarpSize; + constexpr auto items_per_block = BlockSize * ItemsPerThread; + + const auto num_blocks = (N + items_per_block - 1) / items_per_block; + const auto num_segments = num_blocks * segments_per_block; + const auto size = num_blocks * items_per_block; + + const auto input = std::is_floating_point::value + ? benchmark_utils::get_random_data(size, + static_cast(-1000), + static_cast(1000)) + : benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); + + const auto segment_sizes + = benchmark_utils::get_random_data(num_segments, 0, max_segment_size); + + T* d_input = nullptr; + T* d_output = nullptr; + unsigned int* d_segment_sizes = nullptr; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); + HIP_CHECK(hipMalloc(&d_segment_sizes, num_segments * sizeof(segment_sizes[0]))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_segment_sizes, + segment_sizes.data(), + num_segments * sizeof(segment_sizes[0]), + hipMemcpyHostToDevice)); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + + if(benchmark_kind == benchmark_kinds::sort_keys) + { + for(unsigned int i = 0; i < Trials; ++i) + { + sort_keys_segmented + <<>>(d_input, + d_output, + d_segment_sizes, + CompareOp{}); + } + } else if(benchmark_kind == benchmark_kinds::sort_pairs) + { + for(unsigned int i = 0; i < Trials; ++i) + { + sort_pairs_segmented + <<>>(d_input, + d_output, + d_segment_sizes, + CompareOp{}); + } + } + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); } - HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_segment_sizes)); + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_segment_sizes)); } -#define CREATE_BENCHMARK(T, BS, WS, IPT) \ - if (WS <= device_warp_size) { \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("warp_merge_sort.sub_algorithm_name:" + \ - name) \ - .c_str(), \ - segmented ? &run_benchmark \ - : &run_segmented_benchmark, \ - benchmark_kind, stream, size)); \ - } +#define CREATE_BENCHMARK(T, BS, WS, IPT) \ + if(WS <= device_warp_size) \ + { \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("warp_merge_sort.sub_algorithm_name:" \ + + name) \ + .c_str(), \ + segmented ? &run_benchmark : &run_segmented_benchmark, \ + benchmark_kind, \ + stream, \ + size)); \ + } #define BENCHMARK_TYPE_WS(type, block, warp) \ - CREATE_BENCHMARK(type, block, warp, 1); \ - CREATE_BENCHMARK(type, block, warp, 4); \ - CREATE_BENCHMARK(type, block, warp, 8) - -#define BENCHMARK_TYPE(type, block) \ - BENCHMARK_TYPE_WS(type, block, 4); \ - BENCHMARK_TYPE_WS(type, block, 16); \ - BENCHMARK_TYPE_WS(type, block, 32); \ - BENCHMARK_TYPE_WS(type, block, 64) - -void add_benchmarks(const benchmark_kinds benchmark_kind, - const std::string &name, - std::vector &benchmarks, - const hipStream_t stream, const size_t size, - const bool segmented, const unsigned int device_warp_size) { - BENCHMARK_TYPE(int, 256); - BENCHMARK_TYPE(int8_t, 256); - BENCHMARK_TYPE(uint8_t, 256); - BENCHMARK_TYPE(long long, 256); + CREATE_BENCHMARK(type, block, warp, 1); \ + CREATE_BENCHMARK(type, block, warp, 4); \ + CREATE_BENCHMARK(type, block, warp, 8) + +#define BENCHMARK_TYPE(type, block) \ + BENCHMARK_TYPE_WS(type, block, 4); \ + BENCHMARK_TYPE_WS(type, block, 16); \ + BENCHMARK_TYPE_WS(type, block, 32); \ + BENCHMARK_TYPE_WS(type, block, 64) + +void add_benchmarks(const benchmark_kinds benchmark_kind, + const std::string& name, + std::vector& benchmarks, + const hipStream_t stream, + const size_t size, + const bool segmented, + const unsigned int device_warp_size) +{ + BENCHMARK_TYPE(int, 256); + BENCHMARK_TYPE(int8_t, 256); + BENCHMARK_TYPE(uint8_t, 256); + BENCHMARK_TYPE(long long, 256); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_warp_merge_sort" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - const auto device_warp_size = [] { - const int result = HIPCUB_HOST_WARP_THREADS; - if (result > 0) { - std::cout << "[HIP] Device warp size: " << result << std::endl; - } else { - std::cerr << "Failed to get device warp size! Aborting.\n"; - std::exit(1); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_warp_merge_sort" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + const auto device_warp_size = [] + { + const int result = HIPCUB_HOST_WARP_THREADS; + if(result > 0) + { + std::cout << "[HIP] Device warp size: " << result << std::endl; + } else + { + std::cerr << "Failed to get device warp size! Aborting.\n"; + std::exit(1); + } + return static_cast(result); + }(); + + // Add benchmarks + std::vector benchmarks; + add_benchmarks(benchmark_kinds::sort_keys, + "sort(keys)", + benchmarks, + stream, + size, + false, + device_warp_size); + add_benchmarks(benchmark_kinds::sort_pairs, + "sort(keys, values)", + benchmarks, + stream, + size, + false, + device_warp_size); + add_benchmarks(benchmark_kinds::sort_keys, + "segmented_sort(keys)", + benchmarks, + stream, + size, + true, + device_warp_size); + add_benchmarks(benchmark_kinds::sort_pairs, + "segmented_sort(keys, values)", + benchmarks, + stream, + size, + true, + device_warp_size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); } - return static_cast(result); - }(); - - // Add benchmarks - std::vector benchmarks; - add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, - size, false, device_warp_size); - add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)", benchmarks, - stream, size, false, device_warp_size); - add_benchmarks(benchmark_kinds::sort_keys, "segmented_sort(keys)", benchmarks, - stream, size, true, device_warp_size); - add_benchmarks(benchmark_kinds::sort_pairs, "segmented_sort(keys, values)", - benchmarks, stream, size, true, device_warp_size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_warp_reduce.cpp b/benchmark/benchmark_warp_reduce.cpp index 2f798d44..f72c268d 100644 --- a/benchmark/benchmark_warp_reduce.cpp +++ b/benchmark/benchmark_warp_reduce.cpp @@ -29,195 +29,229 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__global__ __launch_bounds__(64) void warp_reduce_kernel(const T *d_input, - T *d_output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; +template +__global__ __launch_bounds__(64) void warp_reduce_kernel(const T* d_input, T* d_output) +{ + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - auto value = d_input[i]; + auto value = d_input[i]; - using wreduce_t = hipcub::WarpReduce; - __shared__ typename wreduce_t::TempStorage storage; - auto reduce_op = hipcub::Sum(); + using wreduce_t = hipcub::WarpReduce; + __shared__ typename wreduce_t::TempStorage storage; + auto reduce_op = hipcub::Sum(); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - value = wreduce_t(storage).Reduce(value, reduce_op); - } + for(unsigned int trial = 0; trial < Trials; trial++) + { + value = wreduce_t(storage).Reduce(value, reduce_op); + } - d_output[i] = value; + d_output[i] = value; } -template -__global__ __launch_bounds__(64) void segmented_warp_reduce_kernel( - const T *d_input, Flag *d_flags, T *d_output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; +template +__global__ __launch_bounds__(64) void segmented_warp_reduce_kernel(const T* d_input, + Flag* d_flags, + T* d_output) +{ + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - auto value = d_input[i]; - auto flag = d_flags[i]; + auto value = d_input[i]; + auto flag = d_flags[i]; - using wreduce_t = hipcub::WarpReduce; - __shared__ typename wreduce_t::TempStorage storage; + using wreduce_t = hipcub::WarpReduce; + __shared__ typename wreduce_t::TempStorage storage; #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - value = wreduce_t(storage).HeadSegmentedSum(value, flag); - } + for(unsigned int trial = 0; trial < Trials; trial++) + { + value = wreduce_t(storage).HeadSegmentedSum(value, flag); + } - d_output[i] = value; + d_output[i] = value; } -template -inline auto execute_warp_reduce_kernel(T *input, T *output, Flag * /* flags */, - size_t size, hipStream_t stream) -> - typename std::enable_if::type { - hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_reduce_kernel), - dim3(size / BlockSize), dim3(BlockSize), 0, stream, input, - output); - HIP_CHECK(hipPeekAtLastError()); +template +inline auto execute_warp_reduce_kernel( + T* input, T* output, Flag* /* flags */, size_t size, hipStream_t stream) -> + typename std::enable_if::type +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_reduce_kernel), + dim3(size / BlockSize), + dim3(BlockSize), + 0, + stream, + input, + output); + HIP_CHECK(hipPeekAtLastError()); } -template -inline auto execute_warp_reduce_kernel(T *input, T *output, Flag *flags, - size_t size, hipStream_t stream) -> - typename std::enable_if::type { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(segmented_warp_reduce_kernel), - dim3(size / BlockSize), dim3(BlockSize), 0, stream, input, flags, output); - HIP_CHECK(hipPeekAtLastError()); +template +inline auto + execute_warp_reduce_kernel(T* input, T* output, Flag* flags, size_t size, hipStream_t stream) -> + typename std::enable_if::type +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(segmented_warp_reduce_kernel), + dim3(size / BlockSize), + dim3(BlockSize), + 0, + stream, + input, + flags, + output); + HIP_CHECK(hipPeekAtLastError()); } -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { - using flag_type = unsigned char; - - const auto size = BlockSize * ((N + BlockSize - 1) / BlockSize); - - std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); - std::vector flags = - benchmark_utils::get_random_data(size, 0, 1); - T *d_input; - flag_type *d_flags; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flags, size * sizeof(flag_type))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_flags, flags.data(), size * sizeof(flag_type), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - execute_warp_reduce_kernel( - d_input, d_output, d_flags, size, stream); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + using flag_type = unsigned char; + + const auto size = BlockSize * ((N + BlockSize - 1) / BlockSize); + + std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); + std::vector flags = benchmark_utils::get_random_data(size, 0, 1); + T* d_input; + flag_type* d_flags; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_flags, size * sizeof(flag_type))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_flags, flags.data(), size * sizeof(flag_type), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_flags)); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + execute_warp_reduce_kernel(d_input, + d_output, + d_flags, + size, + stream); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_flags)); } -#define CREATE_BENCHMARK(T, WS, BS) \ - benchmark::RegisterBenchmark( \ - std::string("warp_reduce.sub_algorithm_name:" + \ - name) \ - .c_str(), \ - &run_benchmark, stream, size) +#define CREATE_BENCHMARK(T, WS, BS) \ + benchmark::RegisterBenchmark(std::string("warp_reduce.sub_algorithm_name:" \ + + name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) // If warp size limit is 16 -#define BENCHMARK_TYPE_WS16(type) \ - CREATE_BENCHMARK(type, 15, 32), CREATE_BENCHMARK(type, 16, 32) +#define BENCHMARK_TYPE_WS16(type) CREATE_BENCHMARK(type, 15, 32), CREATE_BENCHMARK(type, 16, 32) // If warp size limit is 32 -#define BENCHMARK_TYPE_WS32(type) \ - BENCHMARK_TYPE_WS16(type), CREATE_BENCHMARK(type, 31, 32), \ - CREATE_BENCHMARK(type, 32, 32), CREATE_BENCHMARK(type, 32, 64) +#define BENCHMARK_TYPE_WS32(type) \ + BENCHMARK_TYPE_WS16(type), CREATE_BENCHMARK(type, 31, 32), CREATE_BENCHMARK(type, 32, 32), \ + CREATE_BENCHMARK(type, 32, 64) // If warp size limit is 64 -#define BENCHMARK_TYPE_WS64(type) \ - BENCHMARK_TYPE_WS32(type), CREATE_BENCHMARK(type, 37, 64), \ - CREATE_BENCHMARK(type, 61, 64), CREATE_BENCHMARK(type, 64, 64) - -template -void add_benchmarks(const std::string &name, - std::vector &benchmarks, - hipStream_t stream, size_t size) { - std::vector bs = { +#define BENCHMARK_TYPE_WS64(type) \ + BENCHMARK_TYPE_WS32(type), CREATE_BENCHMARK(type, 37, 64), CREATE_BENCHMARK(type, 61, 64), \ + CREATE_BENCHMARK(type, 64, 64) + +template +void add_benchmarks(const std::string& name, + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + std::vector bs = { #if HIPCUB_WARP_THREADS_MACRO == 16 - BENCHMARK_TYPE_WS16(int), - BENCHMARK_TYPE_WS16(float), - BENCHMARK_TYPE_WS16(double), - BENCHMARK_TYPE_WS16(int8_t), - BENCHMARK_TYPE_WS16(uint8_t) + BENCHMARK_TYPE_WS16(int), + BENCHMARK_TYPE_WS16(float), + BENCHMARK_TYPE_WS16(double), + BENCHMARK_TYPE_WS16(int8_t), + BENCHMARK_TYPE_WS16(uint8_t) #elif HIPCUB_WARP_THREADS_MACRO == 32 - BENCHMARK_TYPE_WS32(int), - BENCHMARK_TYPE_WS32(float), - BENCHMARK_TYPE_WS32(double), - BENCHMARK_TYPE_WS32(int8_t), - BENCHMARK_TYPE_WS32(uint8_t) + BENCHMARK_TYPE_WS32(int), + BENCHMARK_TYPE_WS32(float), + BENCHMARK_TYPE_WS32(double), + BENCHMARK_TYPE_WS32(int8_t), + BENCHMARK_TYPE_WS32(uint8_t) #else - BENCHMARK_TYPE_WS64(int), - BENCHMARK_TYPE_WS64(float), - BENCHMARK_TYPE_WS64(double), - BENCHMARK_TYPE_WS64(int8_t), - BENCHMARK_TYPE_WS64(uint8_t) + BENCHMARK_TYPE_WS64(int), + BENCHMARK_TYPE_WS64(float), + BENCHMARK_TYPE_WS64(double), + BENCHMARK_TYPE_WS64(int8_t), + BENCHMARK_TYPE_WS64(uint8_t) #endif - }; - benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); + }; + benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_warp_reduce" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks("reduce", benchmarks, stream, size); - add_benchmarks("segmented_reduce", benchmarks, stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_warp_reduce" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks("reduce", benchmarks, stream, size); + add_benchmarks("segmented_reduce", benchmarks, stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp index 926f644d..48705742 100644 --- a/benchmark/benchmark_warp_scan.cpp +++ b/benchmark/benchmark_warp_scan.cpp @@ -29,116 +29,138 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -enum class scan_type { inclusive_scan, exclusive_scan, broadcast }; +enum class scan_type +{ + inclusive_scan, + exclusive_scan, + broadcast +}; -template -__global__ __launch_bounds__(BlockSize) void kernel(const T *input, T *output, - const T init) { - Runner::template run(input, output, init); +template +__global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output, const T init) +{ + Runner::template run(input, output, init); } -struct inclusive_scan { - template - __device__ static void run(const T *input, T *output, const T init) { - (void)init; +struct inclusive_scan +{ + template + __device__ static void run(const T* input, T* output, const T init) + { + (void)init; - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - auto value = input[i]; + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + auto value = input[i]; - using wscan_t = hipcub::WarpScan; - __shared__ typename wscan_t::TempStorage storage; - auto scan_op = hipcub::Sum(); + using wscan_t = hipcub::WarpScan; + __shared__ typename wscan_t::TempStorage storage; + auto scan_op = hipcub::Sum(); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - wscan_t(storage).InclusiveScan(value, value, scan_op); - } + for(unsigned int trial = 0; trial < Trials; trial++) + { + wscan_t(storage).InclusiveScan(value, value, scan_op); + } - output[i] = value; - } + output[i] = value; + } }; -struct exclusive_scan { - template - __device__ static void run(const T *input, T *output, const T init) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - auto value = input[i]; - - using wscan_t = hipcub::WarpScan; - __shared__ typename wscan_t::TempStorage storage; - auto scan_op = hipcub::Sum(); +struct exclusive_scan +{ + template + __device__ static void run(const T* input, T* output, const T init) + { + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + auto value = input[i]; + + using wscan_t = hipcub::WarpScan; + __shared__ typename wscan_t::TempStorage storage; + auto scan_op = hipcub::Sum(); #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - wscan_t(storage).ExclusiveScan(value, value, init, scan_op); - } + for(unsigned int trial = 0; trial < Trials; trial++) + { + wscan_t(storage).ExclusiveScan(value, value, init, scan_op); + } - output[i] = value; - } + output[i] = value; + } }; -struct broadcast { - template - __device__ static void run(const T *input, T *output, const T init) { - (void)init; +struct broadcast +{ + template + __device__ static void run(const T* input, T* output, const T init) + { + (void)init; - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - auto value = input[i]; + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + auto value = input[i]; - using wscan_t = hipcub::WarpScan; - __shared__ typename wscan_t::TempStorage storage; + using wscan_t = hipcub::WarpScan; + __shared__ typename wscan_t::TempStorage storage; #pragma nounroll - for (unsigned int trial = 0; trial < Trials; trial++) { - value = wscan_t(storage).Broadcast(value, 0); - } + for(unsigned int trial = 0; trial < Trials; trial++) + { + value = wscan_t(storage).Broadcast(value, 0); + } - output[i] = value; - } + output[i] = value; + } }; -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t size) { - // Make sure size is a multiple of BlockSize - size = BlockSize * ((size + BlockSize - 1) / BlockSize); - // Allocate and fill memory - std::vector input(size, 1.0f); - T *d_input; - T *d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(size / BlockSize), dim3(BlockSize), 0, stream, d_input, d_output, - input[0]); - HIP_CHECK(hipPeekAtLastError()); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) +{ + // Make sure size is a multiple of BlockSize + size = BlockSize * ((size + BlockSize - 1) / BlockSize); + // Allocate and fill memory + std::vector input(size, 1.0f); + T* d_input; + T* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); - state.SetItemsProcessed(state.iterations() * size * Trials); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(size / BlockSize), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + input[0]); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); + state.SetItemsProcessed(state.iterations() * size * Trials); - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK_IMPL(T, BS, WS, OP) \ - benchmark::RegisterBenchmark( \ - std::string("warp_scan.method_name:" + \ - method_name) \ - .c_str(), \ - &run_benchmark, stream, size) +#define CREATE_BENCHMARK_IMPL(T, BS, WS, OP) \ + benchmark::RegisterBenchmark(std::string("warp_scan.method_name:" \ + + method_name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) #define CREATE_BENCHMARK(T, BS, WS) CREATE_BENCHMARK_IMPL(T, BS, WS, Benchmark) @@ -165,82 +187,87 @@ void run_benchmark(benchmark::State &state, hipStream_t stream, size_t size) { CREATE_BENCHMARK(type, 256, 64) // clang-format on -template -void add_benchmarks(std::vector &benchmarks, - const std::string &method_name, hipStream_t stream, - size_t size) { - using custom_double2 = benchmark_utils::custom_type; - using custom_int_double = benchmark_utils::custom_type; +template +void add_benchmarks(std::vector& benchmarks, + const std::string& method_name, + hipStream_t stream, + size_t size) +{ + using custom_double2 = benchmark_utils::custom_type; + using custom_int_double = benchmark_utils::custom_type; - std::vector new_benchmarks = { + std::vector new_benchmarks = { #if HIPCUB_WARP_THREADS_MACRO == 16 - BENCHMARK_TYPE_WS16(int), - BENCHMARK_TYPE_WS16(float), - BENCHMARK_TYPE_WS16(double), - BENCHMARK_TYPE_WS16(int8_t), - BENCHMARK_TYPE_WS16(custom_double2), - BENCHMARK_TYPE_WS16(custom_int_double) + BENCHMARK_TYPE_WS16(int), + BENCHMARK_TYPE_WS16(float), + BENCHMARK_TYPE_WS16(double), + BENCHMARK_TYPE_WS16(int8_t), + BENCHMARK_TYPE_WS16(custom_double2), + BENCHMARK_TYPE_WS16(custom_int_double) #elif HIPCUB_WARP_THREADS_MACRO == 32 - BENCHMARK_TYPE_WS32(int), - BENCHMARK_TYPE_WS32(float), - BENCHMARK_TYPE_WS32(double), - BENCHMARK_TYPE_WS32(int8_t), - BENCHMARK_TYPE_WS32(custom_double2), - BENCHMARK_TYPE_WS32(custom_int_double) + BENCHMARK_TYPE_WS32(int), + BENCHMARK_TYPE_WS32(float), + BENCHMARK_TYPE_WS32(double), + BENCHMARK_TYPE_WS32(int8_t), + BENCHMARK_TYPE_WS32(custom_double2), + BENCHMARK_TYPE_WS32(custom_int_double) #else - BENCHMARK_TYPE_WS64(int), - BENCHMARK_TYPE_WS64(float), - BENCHMARK_TYPE_WS64(double), - BENCHMARK_TYPE_WS64(int8_t), - BENCHMARK_TYPE_WS64(custom_double2), - BENCHMARK_TYPE_WS64(custom_int_double) + BENCHMARK_TYPE_WS64(int), + BENCHMARK_TYPE_WS64(float), + BENCHMARK_TYPE_WS64(double), + BENCHMARK_TYPE_WS64(int8_t), + BENCHMARK_TYPE_WS64(custom_double2), + BENCHMARK_TYPE_WS64(custom_int_double) #endif - }; - benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), - new_benchmarks.end()); + }; + benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_warp_scan" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks; - add_benchmarks(benchmarks, "inclusive_scan", stream, size); - add_benchmarks(benchmarks, "exclusive_scan", stream, size); - add_benchmarks(benchmarks, "broadcast", stream, size); - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_warp_scan" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks; + add_benchmarks(benchmarks, "inclusive_scan", stream, size); + add_benchmarks(benchmarks, "exclusive_scan", stream, size); + add_benchmarks(benchmarks, "broadcast", stream, size); + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } diff --git a/benchmark/benchmark_warp_store.cpp b/benchmark/benchmark_warp_store.cpp index a73b4eb2..3fe54157 100644 --- a/benchmark/benchmark_warp_store.cpp +++ b/benchmark/benchmark_warp_store.cpp @@ -31,211 +31,232 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template -__device__ auto warp_store_benchmark(T *d_output) -> std::enable_if_t< - benchmark_utils::device_test_enabled_for_warp_size_v> { - T thread_data[ItemsPerThread]; +template +__device__ auto warp_store_benchmark(T* d_output) + -> std::enable_if_t> +{ + T thread_data[ItemsPerThread]; #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) { - thread_data[i] = static_cast(i); - } - - using WarpStoreT = - ::hipcub::WarpStore; - constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; - constexpr int tile_size = ItemsPerThread * LogicalWarpSize; - __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; - const unsigned warp_id = threadIdx.x / LogicalWarpSize; - const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id; - - WarpStoreT(temp_storage[warp_id]) - .Store(d_output + global_warp_id * tile_size, thread_data); + for(unsigned i = 0; i < ItemsPerThread; ++i) + { + thread_data[i] = static_cast(i); + } + + using WarpStoreT = ::hipcub::WarpStore; + constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; + constexpr int tile_size = ItemsPerThread * LogicalWarpSize; + __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; + const unsigned warp_id = threadIdx.x / LogicalWarpSize; + const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id; + + WarpStoreT(temp_storage[warp_id]).Store(d_output + global_warp_id * tile_size, thread_data); } -template -__device__ auto warp_store_benchmark(T * /*d_output*/) -> std::enable_if_t< - !benchmark_utils::device_test_enabled_for_warp_size_v> {} +template +__device__ auto warp_store_benchmark(T* /*d_output*/) + -> std::enable_if_t> +{} -template -__global__ __launch_bounds__(BlockSize) void warp_store_kernel(T *d_output) { - warp_store_benchmark( - d_output); +template +__global__ __launch_bounds__(BlockSize) void warp_store_kernel(T* d_output) +{ + warp_store_benchmark(d_output); } -template -void run_benchmark(benchmark::State &state, hipStream_t stream, size_t N) { - constexpr unsigned items_per_block = BlockSize * ItemsPerThread; - const unsigned size = - items_per_block * ((N + items_per_block - 1) / items_per_block); +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + constexpr unsigned items_per_block = BlockSize * ItemsPerThread; + const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); - T *d_output; - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + T* d_output; + HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < Trials; ++i) { - warp_store_kernel - <<>>( - d_output); + for(size_t i = 0; i < Trials; ++i) + { + warp_store_kernel + <<>>(d_output); + } + HIP_CHECK(hipPeekAtLastError()) + HIP_CHECK(hipDeviceSynchronize()); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); } - HIP_CHECK(hipPeekAtLastError()) - HIP_CHECK(hipDeviceSynchronize()); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); - state.SetItemsProcessed(state.iterations() * Trials * size); - - HIP_CHECK(hipFree(d_output)); + state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * Trials * size); + + HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark( \ - std::string("warp_store.") \ - .c_str(), \ - &run_benchmark, stream, size) - -int main(int argc, char *argv[]) { - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); - parser.set_optional("trials", "trials", -1, "number of iterations"); - parser.run_and_exit_if_error(); - - // Parse argv - benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); - - std::cout << "benchmark_warp_store" << std::endl; - - // HIP - hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; - - // Add benchmarks - std::vector benchmarks{ - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_VECTORIZE), - // WARP_STORE_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_VECTORIZE) - // WARP_STORE_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_TRANSPOSE) - }; - - if (::benchmark_utils::is_warp_size_supported(64)) { - std::vector additional_benchmarks{ - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_VECTORIZE), - CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), - // WARP_STORE_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 16, 64, - // ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), +#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark(std::string("warp_store.") \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_warp_store" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks{ + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_VECTORIZE), // WARP_STORE_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 32, 64, - // ::hipcub::WARP_STORE_TRANSPOSE), - CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), - CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), - CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE) + // CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_VECTORIZE) // WARP_STORE_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_TRANSPOSE) + // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_TRANSPOSE) }; - benchmarks.insert(benchmarks.end(), additional_benchmarks.begin(), - additional_benchmarks.end()); - } - - // Use manual timing - for (auto &b : benchmarks) { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } - - // Force number of iterations - if (trials > 0) { - for (auto &b : benchmarks) { - b->Iterations(trials); + + if(::benchmark_utils::is_warp_size_supported(64)) + { + std::vector additional_benchmarks{ + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_VECTORIZE), + CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), + // WARP_STORE_TRANSPOSE removed because of shared memory limit + // CREATE_BENCHMARK(double, 256, 16, 64, + // ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), + // WARP_STORE_TRANSPOSE removed because of shared memory limit + // CREATE_BENCHMARK(double, 256, 32, 64, + // ::hipcub::WARP_STORE_TRANSPOSE), + CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), + CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), + CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE) + // WARP_STORE_TRANSPOSE removed because of shared memory limit + // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_TRANSPOSE) + }; + benchmarks.insert(benchmarks.end(), + additional_benchmarks.begin(), + additional_benchmarks.end()); + } + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } } - } - // Run benchmarks - benchmark::RunSpecifiedBenchmarks(); - return 0; + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; } From ffd4b5de10768e62c99004395a24d62a0b672bc6 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 18 Jun 2024 09:51:46 -0600 Subject: [PATCH 35/46] wraped string concatnations --- .../benchmark_block_adjacent_difference.cpp | 15 ++-- benchmark/benchmark_block_scan.cpp | 15 ++-- benchmark/benchmark_device_segmented_sort.cpp | 36 +++++----- benchmark/benchmark_device_select.cpp | 68 ++++++++++--------- benchmark/benchmark_device_spmv.cpp | 13 ++-- benchmark/benchmark_warp_load.cpp | 13 ++-- 6 files changed, 84 insertions(+), 76 deletions(-) diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp index 9ffdfa1e..6b72c86b 100644 --- a/benchmark/benchmark_block_adjacent_difference.cpp +++ b/benchmark/benchmark_block_adjacent_difference.cpp @@ -326,13 +326,14 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ - benchmark::RegisterBenchmark( \ - std::string("block_adjacent_difference.sub_algorithm_name:") \ - + name + std::string("").c_str(), \ - &run_benchmark, \ - stream, \ +#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ + benchmark::RegisterBenchmark( \ + std::string("block_adjacent_difference.sub_algorithm_name:" \ + + name + "") \ + .c_str(), \ + &run_benchmark, \ + stream, \ size) #define BENCHMARK_TYPE(type, block, with_tile) \ diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp index d453d321..340d3b4e 100644 --- a/benchmark/benchmark_block_scan.cpp +++ b/benchmark/benchmark_block_scan.cpp @@ -147,14 +147,13 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark((std::string("block_scan.method_name:") \ - + method_name) \ - .c_str(), \ - &run_benchmark, \ - stream, \ +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_scan.method_name:" + method_name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ size) // clang-format off diff --git a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp index 06e7f73b..d98c7f42 100644 --- a/benchmark/benchmark_device_segmented_sort.cpp +++ b/benchmark/benchmark_device_segmented_sort.cpp @@ -398,35 +398,39 @@ void add_sort_keys_benchmarks(std::vector& benc #define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_pairs") \ - + "." \ - + "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ + std::string("device_segmented_sort_pairs" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_pairs_benchmark(state, SEGMENTS, stream, size); }), \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_pairs") \ - + "." \ - + "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ + std::string("device_segmented_sort_pairs" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true); }), \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_pairs") \ - + "." \ - + "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ + std::string("device_segmented_sort_pairs" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) { \ run_sort_pairs_benchmark(state, SEGMENTS, stream, size, false, true); \ }), \ benchmark::RegisterBenchmark( \ - (std::string("device_segmented_sort_pairs") \ - + "." \ - + "(number_of_segments:~" + std::to_string(SEGMENTS) + " segments)") \ + std::string("device_segmented_sort_pairs" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true, true); }) diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index a2641649..c0921d54 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -419,40 +419,44 @@ void run_unique_by_key_benchmark(benchmark::State& state, hipFree(d_temp_storage); } -#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ - benchmark::RegisterBenchmark( \ - ("device_select_flagged.(probability:" #p ")"), \ - &run_flagged_benchmark, \ - size, \ - stream, \ +#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ + benchmark::RegisterBenchmark( \ + std::string("device_select_flagged.(probability:" #p ")") \ + .c_str(), \ + &run_flagged_benchmark, \ + size, \ + stream, \ p) -#define CREATE_SELECT_IF_BENCHMARK(T, p) \ - benchmark::RegisterBenchmark(("device_select_if.(probability:" #p \ - ")"), \ - &run_selectop_benchmark, \ - size, \ - stream, \ - p) - -#define CREATE_UNIQUE_BENCHMARK(T, p) \ - benchmark::RegisterBenchmark(("device_select_unique.(probability:" #p \ - ")"), \ - &run_unique_benchmark, \ - size, \ - stream, \ - p) - -#define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ - benchmark::RegisterBenchmark( \ - ("device_select_unique_by_key.(probability:" #p ")"), \ - &run_unique_by_key_benchmark, \ - size, \ - stream, \ +#define CREATE_SELECT_IF_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark( \ + std::string("device_select_if.(probability:" #p ")") \ + .c_str(), \ + &run_selectop_benchmark, \ + size, \ + stream, \ + p) + +#define CREATE_UNIQUE_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark( \ + std::string("device_select_unique.(probability:" #p ")") \ + .c_str(), \ + &run_unique_benchmark, \ + size, \ + stream, \ + p) + +#define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ + benchmark::RegisterBenchmark( \ + std::string("device_select_unique_by_key.(probability:" #p ")") \ + .c_str(), \ + &run_unique_by_key_benchmark, \ + size, \ + stream, \ p) #define BENCHMARK_FLAGGED_TYPE(type, value) \ diff --git a/benchmark/benchmark_device_spmv.cpp b/benchmark/benchmark_device_spmv.cpp index f1ff2cc3..37d119ee 100644 --- a/benchmark/benchmark_device_spmv.cpp +++ b/benchmark/benchmark_device_spmv.cpp @@ -197,13 +197,12 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); } -#define CREATE_BENCHMARK(T, p) \ - benchmark::RegisterBenchmark( \ - (std::string("device_spmv_CsrMV.")) \ - .c_str(), \ - &run_benchmark, \ - size, \ - stream, \ +#define CREATE_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark( \ + std::string("device_spmv_CsrMV.").c_str(), \ + &run_benchmark, \ + size, \ + stream, \ p) #define BENCHMARK_TYPE(type) \ diff --git a/benchmark/benchmark_warp_load.cpp b/benchmark/benchmark_warp_load.cpp index ab7057da..3b9d54d9 100644 --- a/benchmark/benchmark_warp_load.cpp +++ b/benchmark/benchmark_warp_load.cpp @@ -119,12 +119,13 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark("warp_load.", \ - &run_benchmark, \ - stream, \ +#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark(std::string("warp_load.") \ + .c_str(), \ + &run_benchmark, \ + stream, \ size) int main(int argc, char* argv[]) From a1695d1a8f1f5be9f49fba9de64e4c20bdb3abd8 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Fri, 21 Jun 2024 09:13:21 -0600 Subject: [PATCH 36/46] added missing , --- benchmark/benchmark_warp_merge_sort.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark_warp_merge_sort.cpp b/benchmark/benchmark_warp_merge_sort.cpp index 4351e679..e31f68eb 100644 --- a/benchmark/benchmark_warp_merge_sort.cpp +++ b/benchmark/benchmark_warp_merge_sort.cpp @@ -448,7 +448,7 @@ void run_segmented_benchmark(benchmark::State& state, if(WS <= device_warp_size) \ { \ benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("warp_merge_sort.sub_algorithm_name:" \ + name) \ .c_str(), \ From 419eb679b94e775acc0c8353c6aa05775eb57252 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Mon, 24 Jun 2024 16:16:15 -0600 Subject: [PATCH 37/46] fixed typo in benchmark_block_reduce --- benchmark/benchmark_block_reduce.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp index 2d0aba92..aacbaae6 100644 --- a/benchmark/benchmark_block_reduce.cpp +++ b/benchmark/benchmark_block_reduce.cpp @@ -119,7 +119,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark(std::string("block_reduce.method_name:" + method_name) \ .c_str(), \ From ffa1c90b3a7b438b6cd3d1c81886086f565ab49c Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Mon, 24 Jun 2024 16:18:08 -0600 Subject: [PATCH 38/46] changed Datatype to data_type in CREATE_BENCHMARK_MEMCPY --- benchmark/benchmark_device_memory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index 220ad96a..5e62fd6c 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -410,7 +410,7 @@ void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_ #define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("device_memory_memcpy.").c_str(), \ + std::string("device_memory_memcpy.").c_str(), \ [=](benchmark::State& state) { run_benchmark_memcpy(state, SIZE, stream); })); // clang-format off From 893280f3af8d91566331c0fc981e8fbaf95edbad Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Mon, 24 Jun 2024 16:52:33 -0600 Subject: [PATCH 39/46] reformated to have ::hipcub:: --- benchmark/benchmark_warp_exchange.cpp | 54 +++++++++++++-------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp index e3db1f81..912e88b3 100644 --- a/benchmark/benchmark_warp_exchange.cpp +++ b/benchmark/benchmark_warp_exchange.cpp @@ -242,7 +242,7 @@ struct BlockedToStripedOp ",block_size:" #BS ",items_per_thread:" #IT \ ",warp_size:" #WS ",algorithm:" #ALG ">.") \ .c_str(), \ - &run_benchmark, \ + &run_benchmark, \ stream, \ size) @@ -251,7 +251,7 @@ struct BlockedToStripedOp ",block_size:" #BS ",items_per_thread:" #IT \ ",warp_size:" #WS ",algorithm:" #ALG ">.") \ .c_str(), \ - &run_benchmark, \ + &run_benchmark, \ stream, \ size) @@ -264,7 +264,7 @@ struct BlockedToStripedOp stream, \ size) -int main(int argc, char* argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -288,29 +288,29 @@ int main(int argc, char* argv[]) // Add benchmarks std::vector benchmarks{ - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 16), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 32), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 32), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE), // CUB requires WS == IPT for WARP_EXCHANGE_SHUFFLE #ifdef HIPCUB_ROCPRIM_API - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE), #endif }; @@ -318,16 +318,16 @@ int main(int argc, char* argv[]) if(::benchmark_utils::is_warp_size_supported(64)) { std::vector additional_benchmarks{ - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 64), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 64)}; benchmarks.insert(benchmarks.end(), additional_benchmarks.begin(), From 81495687de906bd115cfe71427db1f55c902ac26 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 25 Jun 2024 11:53:09 -0600 Subject: [PATCH 40/46] changed algorithm fields to sub_algorithm_name in warp benchmarks --- benchmark/benchmark_warp_exchange.cpp | 4 ++-- benchmark/benchmark_warp_load.cpp | 2 +- benchmark/benchmark_warp_scan.cpp | 2 +- benchmark/benchmark_warp_store.cpp | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp index e3db1f81..65350f31 100644 --- a/benchmark/benchmark_warp_exchange.cpp +++ b/benchmark/benchmark_warp_exchange.cpp @@ -240,7 +240,7 @@ struct BlockedToStripedOp #define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark(std::string("warp_exchange_striped_to_blocked.") \ + ",warp_size:" #WS ",sub_algorithm_name:" #ALG ">.") \ .c_str(), \ &run_benchmark, \ stream, \ @@ -249,7 +249,7 @@ struct BlockedToStripedOp #define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark(std::string("warp_exchange_blocked_to_striped.") \ + ",warp_size:" #WS ",sub_algorithm_name:" #ALG ">.") \ .c_str(), \ &run_benchmark, \ stream, \ diff --git a/benchmark/benchmark_warp_load.cpp b/benchmark/benchmark_warp_load.cpp index 3b9d54d9..eca71c73 100644 --- a/benchmark/benchmark_warp_load.cpp +++ b/benchmark/benchmark_warp_load.cpp @@ -122,7 +122,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark(std::string("warp_load.") \ + ",sub_algorithm_name:" #ALG ">.") \ .c_str(), \ &run_benchmark, \ stream, \ diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp index 48705742..44e42a48 100644 --- a/benchmark/benchmark_warp_scan.cpp +++ b/benchmark/benchmark_warp_scan.cpp @@ -155,7 +155,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) #define CREATE_BENCHMARK_IMPL(T, BS, WS, OP) \ benchmark::RegisterBenchmark(std::string("warp_scan.method_name:" \ + ",warp_size:" #WS ">.sub_algorithm_name:" \ + method_name) \ .c_str(), \ &run_benchmark, \ diff --git a/benchmark/benchmark_warp_store.cpp b/benchmark/benchmark_warp_store.cpp index 3fe54157..356400ea 100644 --- a/benchmark/benchmark_warp_store.cpp +++ b/benchmark/benchmark_warp_store.cpp @@ -114,7 +114,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark(std::string("warp_store.") \ + ",sub_algorithm_name:" #ALG ">.") \ .c_str(), \ &run_benchmark, \ stream, \ From fb30fd1d301735e94d9f6d8880ce33f85b6f00ef Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 25 Jun 2024 12:08:31 -0600 Subject: [PATCH 41/46] added print of benchmark_warp_load at start --- benchmark/benchmark_warp_load.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmark/benchmark_warp_load.cpp b/benchmark/benchmark_warp_load.cpp index eca71c73..ed6d715f 100644 --- a/benchmark/benchmark_warp_load.cpp +++ b/benchmark/benchmark_warp_load.cpp @@ -146,6 +146,8 @@ int main(int argc, char* argv[]) int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_warp_load" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks From d49965d3a71c1c8fefc0cfc5b097452c73bb36bc Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 25 Jun 2024 12:14:48 -0600 Subject: [PATCH 42/46] formated warp_benchmarks, device_memory and block_reduce --- benchmark/benchmark_block_reduce.cpp | 2 +- benchmark/benchmark_device_memory.cpp | 4 ++-- benchmark/benchmark_warp_exchange.cpp | 30 +++++++++++++-------------- benchmark/benchmark_warp_load.cpp | 4 ++-- benchmark/benchmark_warp_scan.cpp | 2 +- benchmark/benchmark_warp_store.cpp | 2 +- 6 files changed, 22 insertions(+), 22 deletions(-) diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp index aacbaae6..bdb089e7 100644 --- a/benchmark/benchmark_block_reduce.cpp +++ b/benchmark/benchmark_block_reduce.cpp @@ -119,7 +119,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark(std::string("block_reduce.method_name:" + method_name) \ .c_str(), \ diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index 5e62fd6c..d012b0e0 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -408,8 +408,8 @@ void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_ [=](benchmark::State& state) \ { run_benchmark(state, SIZE, stream); })); -#define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ +#define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ std::string("device_memory_memcpy.").c_str(), \ [=](benchmark::State& state) { run_benchmark_memcpy(state, SIZE, stream); })); diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp index 8d026d78..598df954 100644 --- a/benchmark/benchmark_warp_exchange.cpp +++ b/benchmark/benchmark_warp_exchange.cpp @@ -237,22 +237,22 @@ struct BlockedToStripedOp } }; -#define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark(std::string("warp_exchange_striped_to_blocked.") \ - .c_str(), \ - &run_benchmark, \ - stream, \ +#define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark(std::string("warp_exchange_striped_to_blocked.") \ + .c_str(), \ + &run_benchmark, \ + stream, \ size) -#define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark(std::string("warp_exchange_blocked_to_striped.") \ - .c_str(), \ - &run_benchmark, \ - stream, \ +#define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark(std::string("warp_exchange_blocked_to_striped.") \ + .c_str(), \ + &run_benchmark, \ + stream, \ size) #define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \ @@ -264,7 +264,7 @@ struct BlockedToStripedOp stream, \ size) -int main(int argc, char* argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); diff --git a/benchmark/benchmark_warp_load.cpp b/benchmark/benchmark_warp_load.cpp index ed6d715f..4298db66 100644 --- a/benchmark/benchmark_warp_load.cpp +++ b/benchmark/benchmark_warp_load.cpp @@ -122,7 +122,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark(std::string("warp_load.") \ + ",sub_algorithm_name:" #ALG ">.") \ .c_str(), \ &run_benchmark, \ stream, \ @@ -146,7 +146,7 @@ int main(int argc, char* argv[]) int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - + std::cout << "benchmark_warp_load" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp index 44e42a48..c38defdf 100644 --- a/benchmark/benchmark_warp_scan.cpp +++ b/benchmark/benchmark_warp_scan.cpp @@ -155,7 +155,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) #define CREATE_BENCHMARK_IMPL(T, BS, WS, OP) \ benchmark::RegisterBenchmark(std::string("warp_scan.sub_algorithm_name:" \ + ",warp_size:" #WS ">.sub_algorithm_name:" \ + method_name) \ .c_str(), \ &run_benchmark, \ diff --git a/benchmark/benchmark_warp_store.cpp b/benchmark/benchmark_warp_store.cpp index 356400ea..8e88661c 100644 --- a/benchmark/benchmark_warp_store.cpp +++ b/benchmark/benchmark_warp_store.cpp @@ -114,7 +114,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) #define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark(std::string("warp_store.") \ + ",sub_algorithm_name:" #ALG ">.") \ .c_str(), \ &run_benchmark, \ stream, \ From e2fe3811793c25138103bcb76e26d2c0827f039c Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 26 Jun 2024 16:04:59 -0600 Subject: [PATCH 43/46] changed Size to size in device_memory --- benchmark/benchmark_device_memory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index d012b0e0..5a879210 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -410,7 +410,7 @@ void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_ #define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ benchmarks.push_back(benchmark::RegisterBenchmark( \ - std::string("device_memory_memcpy.").c_str(), \ + std::string("device_memory_memcpy.").c_str(), \ [=](benchmark::State& state) { run_benchmark_memcpy(state, SIZE, stream); })); // clang-format off From f5534f3e4dd4bc40824dd7586fa0bc5d4d484ae8 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 10 Jul 2024 14:53:47 -0600 Subject: [PATCH 44/46] tuned up warp merge sort and device_batch_copy --- benchmark/benchmark_device_batch_copy.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/benchmark/benchmark_device_batch_copy.cpp b/benchmark/benchmark_device_batch_copy.cpp index ff2ccae1..feca312e 100644 --- a/benchmark/benchmark_device_batch_copy.cpp +++ b/benchmark/benchmark_device_batch_copy.cpp @@ -370,6 +370,15 @@ int32_t main(int32_t argc, char* argv[]) // HIP hipStream_t stream = hipStreamDefault; // default + hipDeviceProp_t devProp; + int device_id = 0; + + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_batch_copy" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + // Benchmark info benchmark::AddCustomContext("size", std::to_string(size)); @@ -384,6 +393,8 @@ int32_t main(int32_t argc, char* argv[]) BENCHMARK_TYPE(4, 4), BENCHMARK_TYPE(8, 8)}; + + // Use manual timing for(auto& b : benchmarks) { From d7acc97b050a1ae122426ba13e24abf41d443782 Mon Sep 17 00:00:00 2001 From: Di Nguyen Date: Tue, 16 Jul 2024 13:36:49 -0600 Subject: [PATCH 45/46] Update benchmark/benchmark_device_adjacent_difference.cpp Co-authored-by: Nara --- benchmark/benchmark_device_adjacent_difference.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/benchmark_device_adjacent_difference.cpp b/benchmark/benchmark_device_adjacent_difference.cpp index 07551637..e0788f0b 100644 --- a/benchmark/benchmark_device_adjacent_difference.cpp +++ b/benchmark/benchmark_device_adjacent_difference.cpp @@ -193,9 +193,9 @@ using namespace std::string_literals; #define CREATE_BENCHMARK(T, left, copy) \ benchmark::RegisterBenchmark(std::string("device_adjacent_difference" \ "." \ - "sub_algorithm_name:Subtract" \ - + std::string(left ? "Left" : "Right") \ - + std::string(copy ? "Copy" : "")) \ + "sub_algorithm_name:subtract_" \ + + std::string(left ? "left" : "right") \ + + std::string(copy ? "_copy" : "")) \ .c_str(), \ &run_benchmark, \ size, \ From bde42db4e429bc107f44966ac26c4735551d61f4 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Tue, 16 Jul 2024 13:39:54 -0600 Subject: [PATCH 46/46] made tests names in snake case --- benchmark/benchmark_block_adjacent_difference.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp index 6b72c86b..1cc70798 100644 --- a/benchmark/benchmark_block_adjacent_difference.cpp +++ b/benchmark/benchmark_block_adjacent_difference.cpp @@ -390,10 +390,10 @@ int main(int argc, char* argv[]) // Add benchmarks std::vector benchmarks; - add_benchmarks("SubtractLeft", benchmarks, stream, size); - add_benchmarks("SubtractRight", benchmarks, stream, size); - add_benchmarks("SubtractLeftPartialTile", benchmarks, stream, size); - add_benchmarks("SubtractRightPartialTile", + add_benchmarks("subtract_left", benchmarks, stream, size); + add_benchmarks("subtract_right", benchmarks, stream, size); + add_benchmarks("subtract_left_partial_tile", benchmarks, stream, size); + add_benchmarks("subtract_right_partial_tile", benchmarks, stream, size);