Skip to content

Commit

Permalink
add simple mem-bw check
Browse files Browse the repository at this point in the history
  • Loading branch information
drisspg committed Jul 1, 2024
1 parent b9d2bf1 commit 625557d
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 5 deletions.
13 changes: 8 additions & 5 deletions examples/misc/c_groups.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@

#include <cmath>
#include <cooperative_groups.h>
#include "src/include/utils.h"
#include <fmt/core.h>
Expand Down Expand Up @@ -46,8 +44,13 @@ __global__ void sum_kernel_block(int *sum, int *input, int n) {
int main() {

int n = 1 << 24;
int blockSize = 256;
int nBlocks = simple_cuda::ceil_div(n, blockSize*16);
int blockSize = 128;
int numSMs;
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
constexpr int coarse_factor = 32;
// manual Grid_size
int nBlocks_manual = 64 * numSMs;
int nBlocks = simple_cuda::ceil_div(n, blockSize * coarse_factor * 2);
int sharedBytes = blockSize * sizeof(int);

int *sum, *data;
Expand All @@ -56,7 +59,7 @@ int main() {
std::fill_n(data, n, 1); // initialize data
cudaMemset(sum, 0, sizeof(int));

sum_kernel_block<<<nBlocks, blockSize, sharedBytes>>>(sum, data, n);
sum_kernel_block<<<nBlocks_manual, blockSize, sharedBytes>>>(sum, data, n);
cudaDeviceSynchronize();
fmt::print("The array is sized {}\n", n);
fmt::print("Sum is equal to {}\n", *sum);
Expand Down
45 changes: 45 additions & 0 deletions examples/misc/mem_bw.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#include <algorithm>
#include <cstdio>
#include <cstdlib>

#include <cooperative_groups.h>
#include <fmt/core.h>

using namespace cooperative_groups;

/**
 * Copies `n` 32-bit integers from `input` to `output` using 16-byte (int4)
 * vectorized loads/stores in a grid-stride loop.
 *
 * Launch layout: 1D grid of 1D blocks; any grid/block size is valid because
 * each thread strides over the data by the total thread count.
 * Shared memory: none.
 *
 * @param output destination buffer, viewed as int4; must hold at least `n` ints
 * @param input  source buffer, viewed as int4; must hold at least `n` ints
 * @param n      number of 32-bit integers (NOT int4 elements) to copy
 *
 * Preconditions: both pointers must be 16-byte aligned, as required for int4
 * accesses.
 */
__global__ void direct_copy_optimized(int4 *output, int4 *input, size_t n) {
  // Widen to size_t BEFORE multiplying: the built-in index variables are
  // 32-bit unsigned, so the product would otherwise wrap for very large grids.
  const size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  const size_t n_vec = n / 4; // number of complete int4 elements

  // Bulk of the copy: one 16-byte transaction per thread per iteration.
  for (size_t i = tid; i < n_vec; i += stride) {
    output[i] = input[i];
  }

  // Tail: when n is not a multiple of 4, up to three trailing ints are left
  // over after the vectorized loop; copy them one scalar at a time.
  const int *input_tail = reinterpret_cast<const int *>(input);
  int *output_tail = reinterpret_cast<int *>(output);
  for (size_t i = n_vec * 4 + tid; i < n; i += stride) {
    output_tail[i] = input_tail[i];
  }
}

// Host-side element-wise comparison of the first `n` ints of `output` and
// `input`. On the first mismatch it prints the index together with both
// values and returns false; returns true when every compared element matches
// (including when n <= 0, in which case nothing is compared).
bool check_equal(int *output, int *input, int n) {
  int idx = 0;
  while (idx < n) {
    const int expected = input[idx];
    const int actual = output[idx];
    if (actual != expected) {
      fmt::print("Not equal for {}, input: {} output: {}\n", idx, expected, actual);
      return false;
    }
    ++idx;
  }
  return true;
}

// Simple memory-bandwidth check: fills a managed buffer with ones, copies it
// to a second managed buffer with direct_copy_optimized, then verifies the
// copy on the host. Returns 0 when the buffers match, 1 otherwise; any CUDA
// error is reported and terminates the program with EXIT_FAILURE.
int main() {
  // Report a failed CUDA runtime call and stop. An ignored error would
  // otherwise resurface later as a confusing failure in an unrelated call.
  const auto check = [](cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
      fmt::print(stderr, "CUDA error during {}: {}\n", what, cudaGetErrorString(status));
      std::exit(EXIT_FAILURE);
    }
  };

  const int n = 1 << 24; // number of ints to copy
  const int blockSize = 1024;

  int numSMs = 0;
  check(cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0),
        "cudaDeviceGetAttribute");

  // Manually chosen grid size: 32 blocks per multiprocessor of device 0. The
  // kernel strides over the data, so the grid does not need to cover n.
  const int nBlocks_manual = 32 * numSMs;

  int *output = nullptr;
  int *data = nullptr;
  check(cudaMallocManaged(&output, n * sizeof(int)), "cudaMallocManaged(output)");
  check(cudaMallocManaged(&data, n * sizeof(int)), "cudaMallocManaged(data)");
  std::fill_n(data, n, 1); // initialize data

  direct_copy_optimized<<<nBlocks_manual, blockSize>>>(
      reinterpret_cast<int4 *>(output), reinterpret_cast<int4 *>(data), n);
  // A launch does not return a status: launch-configuration errors surface via
  // cudaGetLastError, execution errors at the next synchronizing call.
  check(cudaGetLastError(), "kernel launch");
  check(cudaDeviceSynchronize(), "kernel execution");

  const bool eq = check_equal(output, data, n);
  fmt::print("Equal: {}\n", eq);

  check(cudaFree(output), "cudaFree(output)");
  check(cudaFree(data), "cudaFree(data)");

  // Propagate a failed comparison through the exit status.
  return eq ? 0 : 1;
}

0 comments on commit 625557d

Please sign in to comment.