Reduction (#6)
* update

* add line info

* update arch

* add bad kernel

* add makefile

* few more <1024 impls
drisspg authored May 20, 2024
1 parent a727e31 commit 7783768
Showing 6 changed files with 240 additions and 10 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/cmake-single-platform.yml
@@ -12,20 +12,23 @@ env:
jobs:
  build:
    runs-on: ubuntu-latest
    container:
      image: nvidia/cuda:12.3.2-devel-ubuntu22.04

    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          submodules: 'recursive'

      - name: Set up Git
        uses: actions/setup-git@v1
        with:
          git-version: '2.30.0' # Specify the version of Git you need

      - name: Install CMake and git
        run: |
          apt-get update
          apt-get install -y git
          apt-get install -y cmake
      - name: Initialize and Update Git Submodules
13 changes: 10 additions & 3 deletions CMakeLists.txt
@@ -27,8 +27,15 @@ target_include_directories(SimpleCudaLib PUBLIC src/include)
# Add fmtlib
add_subdirectory(third_party/fmt)

# CUDA Flags
set(EXTRA_CUDA_FLAGS "")
# Check if building in Debug mode
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
  # Add debug-specific flags
  set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -G -g")
else()
  # Add line info flag only if not building in Debug mode
  set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -lineinfo")
endif()
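# Note: -lineinfo preserves source-to-SASS correlation for profilers such as
# Nsight Compute without disabling device optimizations, whereas -G generates
# debuggable device code and turns most device optimizations off.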

option (SHOW_PTXAS_INFO "Show ptxas info" OFF)
if(SHOW_PTXAS_INFO)
  set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -Xptxas -v")
@@ -44,7 +51,7 @@ foreach(EXAMPLE_SOURCE ${EXAMPLE_SOURCES})

  # CUDA properties provided by CMAKE
  set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-  set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_ARCHITECTURES 90)
+  set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_ARCHITECTURES 90a)
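  # 90a targets Hopper's architecture-specific feature set (e.g. wgmma/TMA);
  # unlike plain 90, sm_90a binaries are not forward compatible with later
  # GPU architectures.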

  # Convert the flags string into a list of flags
  separate_arguments(EXTRA_CUDA_FLAGS_LIST UNIX_COMMAND "${EXTRA_CUDA_FLAGS}")
16 changes: 16 additions & 0 deletions Makefile
@@ -0,0 +1,16 @@
.DEFAULT_GOAL := all

build_dir := build

all: release

release:
	cmake -B$(build_dir) -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo
	ninja -C $(build_dir)

debug:
	cmake -B$(build_dir) -GNinja -DCMAKE_BUILD_TYPE=Debug
	ninja -C $(build_dir)

clean:
	rm -rf $(build_dir)
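# Typical usage (assumes cmake, ninja, and a CUDA toolchain are on PATH):
#   make        -> RelWithDebInfo build, keeps -lineinfo for profiling
#   make debug  -> Debug build with device-side -G -g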
204 changes: 204 additions & 0 deletions examples/chapter10/reduce1d.cu
@@ -0,0 +1,204 @@
#include "src/include/tensors.h"
#include "src/include/utils.h"

#include <cstddef>
#include <fmt/core.h>
#include <fmt/ranges.h>

#include <cmath>
#include <numeric>
#include <optional>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>

using namespace simple_cuda;
using KernelFunc = void (*)(float *, float *, const int);

using one_d = Extent<1>;

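// Naive single-block tree reduction, done in place in global memory. The
// active threads on each pass (threadIdx.x % stride == 0) are scattered
// across warps, so this ordering maximizes control divergence.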
__global__ void Reduce1dInplace(float *input, float *output, const int numel) {
  const int i = 2 * threadIdx.x;
  for (unsigned stride{1}; stride <= blockDim.x; stride *= 2) {
    if (threadIdx.x % stride == 0) {
      input[i] += input[i + stride];
    }
    __syncthreads();
  }
  if (threadIdx.x == 0) {
    *output = input[0];
  }
}

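// Same in-place reduction with the stride loop reversed: active threads are
// packed into the contiguous range [0, stride), so whole warps stay active
// or idle together and divergence only appears once stride drops below the
// warp size.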
__global__ void Reduce1dInplaceBetterOrdering(float *input, float *output,
                                              const int numel) {
  const int i = threadIdx.x;
  for (unsigned stride{blockDim.x}; stride >= 1; stride /= 2) {
    if (i < stride) {
      input[i] += input[i + stride];
    }
    __syncthreads();
  }

  if (threadIdx.x == 0) {
    *output = input[0];
  }
}

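// Moves the working set into dynamically sized shared memory (blockDim.x
// floats, supplied as the third launch parameter), so the log2(n) reduction
// passes read and write shared memory instead of global memory.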
__global__ void Reduce1dShared(float *input, float *output, const int numel) {
  const int i = threadIdx.x;
  extern __shared__ float shmem[];
  // First iter pulled out of loop
  shmem[i] = input[i] + input[i + blockDim.x];
  __syncthreads();
  for (unsigned stride{blockDim.x / 2}; stride >= 1; stride /= 2) {
    if (i < stride) {
      shmem[i] += shmem[i + stride];
    }
    __syncthreads();
  }

  if (threadIdx.x == 0) {
    *output = shmem[0];
  }
}

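// Multi-block version for inputs larger than one block can cover: each block
// reduces its own 2 * blockDim.x slice in shared memory, then thread 0 folds
// the block's partial sum into *output with an atomicAdd.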
__global__ void Reduce1dSharedGlobal(float *input, float *output,
                                     const int numel) {
  const int local_id = threadIdx.x;
  const int global_id = local_id + 2 * blockDim.x * blockIdx.x;
  extern __shared__ float shmem[]; // Size blockDim.x
  // First iter pulled out of loop
  shmem[local_id] = input[global_id] + input[global_id + blockDim.x];
  __syncthreads();
  for (unsigned stride{blockDim.x / 2}; stride >= 1; stride /= 2) {
    if (local_id < stride) {
      shmem[local_id] += shmem[local_id + stride];
    }
    __syncthreads();
  }

  if (local_id == 0) {
    atomicAdd(output, shmem[0]);
  }
}

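// Thread-coarsened version: each thread first serially accumulates
// COARSE_FACTOR * 2 consecutive tiles of blockDim.x elements from global
// memory, shrinking the grid and the number of __syncthreads() passes before
// the shared-memory tree runs.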
template <int COARSE_FACTOR>
__global__ void Reduce1dSharedGlobalCoarse(float *input, float *output,
                                           const int numel) {
  const int local_id = threadIdx.x;
  const int global_offset = COARSE_FACTOR * 2 * blockDim.x * blockIdx.x;
  const int global_id = local_id + global_offset;
  extern __shared__ float shmem[]; // Size blockDim.x
  // First iter pulled out of loop
  float sum = input[global_id];
#pragma unroll
  for (int tile = 1; tile < COARSE_FACTOR * 2; tile++) {
    // Step by whole tiles of blockDim.x elements (blockDim.x, not
    // blockIdx.x, which would re-read the same element in block 0).
    sum += input[global_id + tile * blockDim.x];
  }
  shmem[local_id] = sum;
  __syncthreads();
  for (unsigned stride{blockDim.x / 2}; stride >= 1; stride /= 2) {
    if (local_id < stride) {
      shmem[local_id] += shmem[local_id + stride];
    }
    __syncthreads();
  }

  if (local_id == 0) {
    atomicAdd(output, shmem[0]);
  }
}

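// CPU reference: sequential std::reduce over the same data.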
float cpp_kernel(const std::vector<float> &input) {
  // std::reduce does not modify its range, so no defensive copy is needed.
  return std::reduce(input.begin(), input.end());
}

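// Launch harness: fills a random input, runs the kernel (optionally with
// dynamic shared memory), copies the result back, and checks it against the
// CPU reference within a small absolute tolerance.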
void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block,
          std::optional<size_t> shmem) {
  one_d tensor_extents({numel});

  HostTensor<float, one_d> input_vec(tensor_extents);
  HostTensor<float, one_d> out_sum(one_d({1}));

  fill_random(input_vec.data_);
  // std::fill(input_vec.data_.begin(), input_vec.data_.end(), 1);
  std::fill(out_sum.data_.begin(), out_sum.data_.end(), 0);

  auto input_vec_d = input_vec.to_device();
  auto out_sum_d = out_sum.to_device();

  if (shmem.has_value()) {
    func<<<grid, block, shmem.value()>>>(
        input_vec_d.data_ptr(), out_sum_d.data_ptr(), tensor_extents.numel());

  } else {
    func<<<grid, block>>>(input_vec_d.data_ptr(), out_sum_d.data_ptr(),
                          tensor_extents.numel());
  }
  cudaCheckErrors("kernel launch failure");
  cudaDeviceSynchronize();

  auto host_output = out_sum_d.to_host();
  auto host_output_ptr = host_output.data_ptr();

  std::vector<float> input_vector(input_vec.data_.begin(),
                                  input_vec.data_.end());
  const auto cpp_answer = cpp_kernel(input_vector);

  float diff = fabs(cpp_answer - host_output_ptr[0]);
  if (diff > 5e-3) {
    std::string error_string = "Houston we have a problem!\n";
    error_string += fmt::format("Found a deviation of {}\n", diff);
    error_string += fmt::format("Cpp answer: {}, GPU answer: {}\n", cpp_answer,
                                host_output_ptr[0]);
    std::cout << error_string;
    exit(1);
  }
  std::cout << "All good brother!\n";
}

int main() {
  constexpr int max_length = 2048;
  constexpr int block_size = max_length / 2;

  dim3 grid(1);
  dim3 block(block_size);

  // Base case: in-place writes with the bad (divergent) ordering
  fmt::print("• Reduce1dInplace Test: ");
  Test(Reduce1dInplace, max_length, grid, block, std::nullopt);

  // In-place writes with the improved contiguous ordering
  fmt::print("• Reduce1dInplaceBetterOrdering Test: ");
  Test(Reduce1dInplaceBetterOrdering, max_length, grid, block, std::nullopt);

  // Dynamic shmem version
  fmt::print("• Reduce1dShared Test: ");
  size_t shmem{block.x * sizeof(float)};
  Test(Reduce1dShared, max_length, grid, block, shmem);

  // Test reductions larger than a single block can hold
  constexpr int max_length_global = 2048 * 2;

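  // Each block consumes 2 * blockDim.x inputs (every thread loads two
  // elements before the tree phase), hence the divide-by-(block.x * 2) when
  // sizing the grid.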
  block.x = 1024;
  grid.x = ceil_div(max_length_global, block.x * 2);
  shmem = block.x * sizeof(float);
  fmt::print("• Reduce1dSharedGlobal Test: ");
  Test(Reduce1dSharedGlobal, max_length_global, grid, block, shmem);

  constexpr int coarse_factor = 2;
  grid.x = ceil_div(max_length_global, block.x * 2 * coarse_factor);
  shmem = block.x * sizeof(float);
  fmt::print("• Reduce1dSharedGlobalCoarse Test: ");
  Test(Reduce1dSharedGlobalCoarse<coarse_factor>, max_length_global, grid,
       block, shmem);

  // profile the relevant kernels:
  // ncu -k "regex:Reduce" ./bin/reduce1d
  return 0;
}
2 changes: 1 addition & 1 deletion examples/chapter7/conv2d.cu
@@ -156,7 +156,7 @@ int main() {
  constexpr int block_size = 32;

  // dim x is the inner (column) dim, dim y is the outer (row) dim
-  dim3 grid(ceil_div(num_rows, block_size), ceil_div(num_cols, block_size));
+  dim3 grid(ceil_div(num_cols, block_size), ceil_div(num_rows, block_size));
  dim3 block(block_size, block_size);

  Test(Conv2D<block_size, filter_radius>, num_rows, num_cols, filter_radius,
6 changes: 3 additions & 3 deletions src/include/utils.h
@@ -24,14 +24,14 @@ template <typename T, typename Y> T __host__ __device__ ceil_div(T a, Y b) {

float kernel_time(std::function<void()> kernelLauncher);

-template <typename T> void fill_random(T input) {
+template <typename T> void fill_random(T& input) {
  // First create an instance of an engine.
  std::random_device rnd_device;
  // Specify the engine and distribution.
  std::mt19937 mersenne_engine{rnd_device()}; // Generates random integers
-  std::normal_distribution<float> dist{2, 1};
+  std::normal_distribution<float> dist{0, 1};
  auto gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };

  std::generate(input.begin(), input.end(), gen);
}
} // namespace simple_cuda
