Skip to content

Commit

Permalink
above 90
Browse files Browse the repository at this point in the history
  • Loading branch information
drisspg committed Jul 2, 2024
1 parent 625557d commit ee1c000
Showing 1 changed file with 10 additions and 9 deletions.
19 changes: 10 additions & 9 deletions examples/misc/mem_bw.cu
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#include <cooperative_groups.h>
#include <fmt/core.h>
#include "utils.h"

using namespace cooperative_groups;

__global__ void direct_copy_optimized(int4 *output, int4 *input, size_t n) {
__global__ void direct_copy_optimized(float4 *output, float4 *input, size_t n) {
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;

Expand All @@ -12,7 +13,7 @@ __global__ void direct_copy_optimized(int4 *output, int4 *input, size_t n) {
}
}

bool check_equal(int *output, int *input, int n) {
bool check_equal(float *output, float *input, int n) {
for (int i = 0; i < n; i++) {
if (output[i] != input[i]) {
fmt::print("Not equal for {}, input: {} output: {}\n", i, input[i], output[i]);
Expand All @@ -24,18 +25,18 @@ bool check_equal(int *output, int *input, int n) {

int main() {

int n = 1 << 24;
int blockSize = 1024;
int n = 1 << 28;
int blockSize = 256;
int numSMs;
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
// manual Grid_size
int nBlocks_manual = 32 * numSMs;
int *output, *data;
cudaMallocManaged(&output, n * sizeof(int));
cudaMallocManaged(&data, n * sizeof(int));
float nBlocks_manual = min(1024 * numSMs, simple_cuda::ceil_div(n, blockSize));
float *output, *data;
cudaMallocManaged(&output, n * sizeof(float));
cudaMallocManaged(&data, n * sizeof(float));
std::fill_n(data, n, 1); // initialize data

direct_copy_optimized<<<nBlocks_manual, blockSize>>>(reinterpret_cast<int4*>(output), reinterpret_cast<int4*>(data), n);
direct_copy_optimized<<<nBlocks_manual, blockSize>>>(reinterpret_cast<float4*>(output), reinterpret_cast<float4*>(data), n);
cudaDeviceSynchronize();

auto eq = check_equal(output, data, n);
Expand Down

0 comments on commit ee1c000

Please sign in to comment.