few more <1024 impls

drisspg committed May 20, 2024
1 parent fc4ded5 commit 4b4e65e
Showing 4 changed files with 140 additions and 20 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/cmake-single-platform.yml
@@ -12,20 +12,23 @@ env:
jobs:
build:
runs-on: ubuntu-latest
container:
container:
image: nvidia/cuda:12.3.2-devel-ubuntu22.04

steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: 'recursive'

- name: Set up Git
uses: actions/setup-git@v1
with:
git-version: '2.30.0' # Specify the version of Git you need

- name: Install CMake and git
run: |
apt-get update
apt-get install -y git
apt-get install -y cmake
- name: Initialize and Update Git Submodules
16 changes: 9 additions & 7 deletions CMakeLists.txt
@@ -27,8 +27,15 @@ target_include_directories(SimpleCudaLib PUBLIC src/include)
# Add fmtlib
add_subdirectory(third_party/fmt)

# CUDA Flags
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -lineinfo")
# Check if building in Debug mode
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# Add debug-specific flags
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -G -g")
else()
# Add line info flag only if not building in Debug mode
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -lineinfo")
endif()

option (SHOW_PTXAS_INFO "Show ptxas info" OFF)
if(SHOW_PTXAS_INFO)
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -Xptxas -v")
@@ -46,11 +53,6 @@ foreach(EXAMPLE_SOURCE ${EXAMPLE_SOURCES})
set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_ARCHITECTURES 90a)

# Check if building in Debug mode and add -G flag
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -G -g")
endif()

# Convert the flags string into a list of flags
separate_arguments(EXTRA_CUDA_FLAGS_LIST UNIX_COMMAND "${EXTRA_CUDA_FLAGS}")

133 changes: 124 additions & 9 deletions examples/chapter10/reduce1d.cu
@@ -5,8 +5,9 @@
#include <fmt/core.h>
#include <fmt/ranges.h>

#include <numeric>
#include <cmath>
#include <numeric>
#include <optional>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>
@@ -29,6 +30,86 @@ __global__ void Reduce1dInplace(float *input, float *output, const int numel) {
}
}

__global__ void Reduce1dInplaceBetterOrdering(float *input, float *output,
const int numel) {
const int i = threadIdx.x;
for (unsigned stride{blockDim.x}; stride >= 1; stride /= 2) {
if (i < stride) {
input[i] += input[i + stride];
}
__syncthreads();
}

if (threadIdx.x == 0) {
*output = input[0];
}
}
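For reference, both in-place variants assume the whole input fits in a single thread block with one thread per element pair, so numel must equal 2 * blockDim.x. A minimal launch sketch under that assumption (d_input and d_output are placeholder device pointers, mirroring the grid/block setup in main below):

// Sketch only: assumes numel == 2 * blockDim.x; d_input, d_output are placeholders.
const int numel = 2048;
dim3 grid(1);                      // single block; no cross-block combination
dim3 block(numel / 2);             // 1024 threads, each adds one pair per step
Reduce1dInplaceBetterOrdering<<<grid, block>>>(d_input, d_output, numel);
cudaDeviceSynchronize();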

__global__ void Reduce1dShared(float *input, float *output, const int numel) {
const int i = threadIdx.x;
extern __shared__ float shmem[];
// First iter pulled out of loop
shmem[i] = input[i] + input[i + blockDim.x];
__syncthreads();
for (unsigned stride{blockDim.x / 2}; stride >= 1; stride /= 2) {
if (i < stride) {
shmem[i] += shmem[i + stride];
}
__syncthreads();
}

if (threadIdx.x == 0) {
*output = shmem[0];
}
}
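Reduce1dShared makes the same single-block assumption but stages partial sums in dynamic shared memory, which is why Test gains an optional shmem argument: the launch has to pass blockDim.x * sizeof(float) as the third launch parameter. A minimal sketch with the same placeholder pointers:

// Sketch only: dynamic shared memory sized to one float per thread.
const int numel = 2048;
dim3 grid(1);
dim3 block(numel / 2);
size_t shmem_bytes = block.x * sizeof(float);
Reduce1dShared<<<grid, block, shmem_bytes>>>(d_input, d_output, numel);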

__global__ void Reduce1dSharedGlobal(float *input, float *output,
const int numel) {
const int local_id = threadIdx.x;
const int global_id = local_id + 2 * blockDim.x * blockIdx.x;
extern __shared__ float shmem[]; // Size blockDim.x
// First iter pulled out of loop
shmem[local_id] = input[global_id] + input[global_id + blockDim.x];
__syncthreads();
for (unsigned stride{blockDim.x / 2}; stride >= 1; stride /= 2) {
if (local_id < stride) {
shmem[local_id] += shmem[local_id + stride];
}
__syncthreads();
}

if (local_id == 0) {
atomicAdd(output, shmem[0]);
}
}
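Reduce1dSharedGlobal drops the single-block restriction: each block reduces its own 2 * blockDim.x segment in shared memory and atomically adds the partial result into *output, so the output must start at zero (Test zeroes it on the host before copying). A sketch of the multi-block launch, assuming placeholder pointers and the project's ceil_div helper as used in main:

// Sketch only: one block per 2 * blockDim.x segment, partials combined via atomicAdd.
const int numel = 4096;
dim3 block(1024);
dim3 grid(ceil_div(numel, block.x * 2));      // 2 blocks for 4096 elements
size_t shmem_bytes = block.x * sizeof(float);
cudaMemset(d_output, 0, sizeof(float));       // atomicAdd accumulates into this
Reduce1dSharedGlobal<<<grid, block, shmem_bytes>>>(d_input, d_output, numel);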

template <int COARSE_FACTOR>
__global__ void Reduce1dSharedGlobalCoarse(float *input, float *output,
const int numel) {
const int local_id = threadIdx.x;
const int global_offset = COARSE_FACTOR * 2 * blockDim.x * blockIdx.x;
const int global_id = local_id + global_offset;
extern __shared__ float shmem[]; // Size blockDim.x
// First iter pulled out of loop
float sum = input[global_id];
#pragma unroll
for (int tile = 1; tile < COARSE_FACTOR * 2; tile++) {
sum += input[global_id + tile * blockDim.x];
}
shmem[local_id] = sum;
__syncthreads();
for (unsigned stride{blockDim.x / 2}; stride >= 1; stride /= 2) {
if (local_id < stride) {
shmem[local_id] += shmem[local_id + stride];
}
__syncthreads();
}

if (local_id == 0) {
atomicAdd(output, shmem[0]);
}
}
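With coarsening, each thread first accumulates COARSE_FACTOR * 2 elements strided by blockDim.x before the shared-memory tree reduction, so a block now covers COARSE_FACTOR * 2 * blockDim.x elements and the grid shrinks by the coarsening factor. A sketch matching the configuration in main (placeholder pointers again):

// Sketch only: coarsened blocks cover COARSE_FACTOR * 2 * blockDim.x elements each.
constexpr int coarse_factor = 2;
const int numel = 4096;
dim3 block(1024);
dim3 grid(ceil_div(numel, block.x * 2 * coarse_factor));   // 1 block here
size_t shmem_bytes = block.x * sizeof(float);
cudaMemset(d_output, 0, sizeof(float));
Reduce1dSharedGlobalCoarse<coarse_factor>
    <<<grid, block, shmem_bytes>>>(d_input, d_output, numel);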

float cpp_kernel(std::vector<float> &input) {
const auto n_elements = input.size();
std::vector<float> input_copy(input.size());
@@ -37,20 +118,28 @@ float cpp_kernel(std::vector<float> &input) {
return out;
}

void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block) {
void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block,
std::optional<size_t> shmem) {
one_d tensor_extents({numel});

HostTensor<float, one_d> input_vec(tensor_extents);
HostTensor<float, one_d> out_sum(one_d({1}));

fill_random(input_vec.data_);
// std::fill(input_vec.data_.begin(), input_vec.data_.end(), 1);
std::fill(out_sum.data_.begin(), out_sum.data_.end(), 0);

auto input_vec_d = input_vec.to_device();
auto out_sum_d = out_sum.to_device();

func<<<grid, block>>>(input_vec_d.data_ptr(), out_sum_d.data_ptr(),
tensor_extents.numel());
if (shmem.has_value()) {
func<<<grid, block, shmem.value()>>>(
input_vec_d.data_ptr(), out_sum_d.data_ptr(), tensor_extents.numel());

} else {
func<<<grid, block>>>(input_vec_d.data_ptr(), out_sum_d.data_ptr(),
tensor_extents.numel());
}
cudaCheckErrors("kernel launch failure");
cudaDeviceSynchronize();

@@ -62,7 +151,7 @@ void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block) {
const auto cpp_anwser = cpp_kernel(input_vector);

float diff = fabs(cpp_anwser - host_output_ptr[0]);
if (diff > 1e-3) {
if (diff > 5e-3) {
std::string error_string = "Houston we have a problem!\n";
error_string += fmt::format("Found a deviation of {}\n", diff);
error_string += fmt::format("Cpp anwser: {}, GPU anwser: {}\n", cpp_anwser,
@@ -74,14 +163,40 @@
}

int main() {
// Standard Matmul
constexpr int max_length = 1024;
constexpr int block_size = max_length/2;
constexpr int max_length = 2048;
constexpr int block_size = max_length / 2;

dim3 grid(1);
dim3 block(block_size);

Test(Reduce1dInplace, max_length, grid, block);
// Base case: in-place writes, poor access ordering
fmt::print("• Reduce1dInplace Test: ");
Test(Reduce1dInplace, max_length, grid, block, std::nullopt);

// In-place writes, better ordering (descending stride keeps active threads contiguous)
fmt::print("• Reduce1dInplaceBetterOrdering Test: ");
Test(Reduce1dInplaceBetterOrdering, max_length, grid, block, std::nullopt);

// Dynamic shmem version
fmt::print("• Reduce1dShared Test: ");
size_t shmem{block.x * sizeof(float)};
Test(Reduce1dShared, max_length, grid, block, shmem);

// Test reductions larger than a single thread block
constexpr int max_length_global = 2048 * 2;

block.x = 1024;
grid.x = ceil_div(max_length_global, block.x * 2);
shmem = block.x * sizeof(float);
fmt::print("• Reduce1dSharedGlobal Test: ");
Test(Reduce1dSharedGlobal, max_length_global, grid, block, shmem);

constexpr int coarse_factor = 2;
grid.x = ceil_div(max_length_global, block.x * 2 * coarse_factor);
shmem = block.x * sizeof(float);
fmt::print("• Reduce1dSharedGlobalCoarse Test: ");
Test(Reduce1dSharedGlobalCoarse<coarse_factor>, max_length_global, grid,
block, shmem);
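As a quick check on the launch arithmetic above (ceil_div_c is a constexpr stand-in for the project's ceil_div helper):

constexpr int ceil_div_c(int a, int b) { return (a + b - 1) / b; }
static_assert(ceil_div_c(4096, 1024 * 2) == 2, "Reduce1dSharedGlobal runs 2 blocks");
static_assert(ceil_div_c(4096, 1024 * 2 * 2) == 1, "coarse variant runs 1 block");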

// profile the relevant kernels:
// ncu -k "regex:reduce" ./bin/conv1d
2 changes: 1 addition & 1 deletion src/include/utils.h
@@ -29,7 +29,7 @@ template <typename T> void fill_random(T& input) {
std::random_device rnd_device;
// Specify the engine and distribution.
std::mt19937 mersenne_engine{rnd_device()}; // Generates random integers
std::normal_distribution<float> dist{2, 1};
std::normal_distribution<float> dist{0, 1};
auto gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };

std::generate(input.begin(), input.end(), gen);
