few more <1024 impls

drisspg committed May 20, 2024
1 parent fc4ded5 commit 4b4e65e
Showing 4 changed files with 140 additions and 20 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/cmake-single-platform.yml
@@ -12,20 +12,23 @@ env:
jobs:
build:
runs-on: ubuntu-latest
container:
container:
image: nvidia/cuda:12.3.2-devel-ubuntu22.04

steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: 'recursive'

- name: Set up Git
uses: actions/setup-git@v1
with:
git-version: '2.30.0' # Specify the version of Git you need

- name: Install CMake and git
run: |
apt-get update
apt-get install -y git
apt-get install -y cmake
- name: Initialize and Update Git Submodules
16 changes: 9 additions & 7 deletions CMakeLists.txt
@@ -27,8 +27,15 @@ target_include_directories(SimpleCudaLib PUBLIC src/include)
# Add fmtlib
add_subdirectory(third_party/fmt)

# CUDA Flags
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -lineinfo")
# Check if building in Debug mode
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# Add debug-specific flags
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -G -g")
else()
# Add line info flag only if not building in Debug mode
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -lineinfo")
endif()

option (SHOW_PTXAS_INFO "Show ptxas info" OFF)
if(SHOW_PTXAS_INFO)
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -Xptxas -v")
@@ -46,11 +53,6 @@ foreach(EXAMPLE_SOURCE ${EXAMPLE_SOURCES})
set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_ARCHITECTURES 90a)

# Check if building in Debug mode and add -G flag
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -G -g")
endif()

# Convert the flags string into a list of flags
separate_arguments(EXTRA_CUDA_FLAGS_LIST UNIX_COMMAND "${EXTRA_CUDA_FLAGS}")

133 changes: 124 additions & 9 deletions examples/chapter10/reduce1d.cu
@@ -5,8 +5,9 @@
#include <fmt/core.h>
#include <fmt/ranges.h>

#include <numeric>
#include <cmath>
#include <numeric>
#include <optional>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>
@@ -29,6 +30,86 @@ __global__ void Reduce1dInplace(float *input, float *output, const int numel) {
}
}

__global__ void Reduce1dInplaceBetterOrdering(float *input, float *output,
const int numel) {
const int i = threadIdx.x;
for (unsigned stride{blockDim.x}; stride >= 1; stride /= 2) {
if (i < stride) {
input[i] += input[i + stride];
}
__syncthreads();
}

if (threadIdx.x == 0) {
*output = input[0];
}
}
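For reference, both in-place variants assume the whole input fits in a single thread block with one thread per element pair, so numel must equal 2 * blockDim.x. A minimal launch sketch under that assumption (d_input and d_output are placeholder device pointers, mirroring the grid/block setup in main below):

// Sketch only: assumes numel == 2 * blockDim.x; d_input, d_output are placeholders.
const int numel = 2048;
dim3 grid(1);                      // single block; no cross-block combination
dim3 block(numel / 2);             // 1024 threads, each adds one pair per step
Reduce1dInplaceBetterOrdering<<<grid, block>>>(d_input, d_output, numel);
cudaDeviceSynchronize();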

__global__ void Reduce1dShared(float *input, float *output, const int numel) {
const int i = threadIdx.x;
extern __shared__ float shmem[];
// First iter pulled out of loop
shmem[i] = input[i] + input[i + blockDim.x];
__syncthreads();
for (unsigned stride{blockDim.x / 2}; stride >= 1; stride /= 2) {
if (i < stride) {
shmem[i] += shmem[i + stride];
}
__syncthreads();
}

if (threadIdx.x == 0) {
*output = shmem[0];
}
}
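Reduce1dShared makes the same single-block assumption but stages partial sums in dynamic shared memory, which is why Test gains an optional shmem argument: the launch has to pass blockDim.x * sizeof(float) as the third launch parameter. A minimal sketch with the same placeholder pointers:

// Sketch only: dynamic shared memory sized to one float per thread.
const int numel = 2048;
dim3 grid(1);
dim3 block(numel / 2);
size_t shmem_bytes = block.x * sizeof(float);
Reduce1dShared<<<grid, block, shmem_bytes>>>(d_input, d_output, numel);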

__global__ void Reduce1dSharedGlobal(float *input, float *output,
const int numel) {
const int local_id = threadIdx.x;
const int global_id = local_id + 2 * blockDim.x * blockIdx.x;
extern __shared__ float shmem[]; // Size blockDim.x
// First iter pulled out of loop
shmem[local_id] = input[global_id] + input[global_id + blockDim.x];
__syncthreads();
for (unsigned stride{blockDim.x / 2}; stride >= 1; stride /= 2) {
if (local_id < stride) {
shmem[local_id] += shmem[local_id + stride];
}
__syncthreads();
}

if (local_id == 0) {
atomicAdd(output, shmem[0]);
}
}
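Reduce1dSharedGlobal drops the single-block restriction: each block reduces its own 2 * blockDim.x segment in shared memory and atomically adds the partial result into *output, so the output must start at zero (Test zeroes it on the host before copying). A sketch of the multi-block launch, assuming placeholder pointers and the project's ceil_div helper as used in main:

// Sketch only: one block per 2 * blockDim.x segment, partials combined via atomicAdd.
const int numel = 4096;
dim3 block(1024);
dim3 grid(ceil_div(numel, block.x * 2));      // 2 blocks for 4096 elements
size_t shmem_bytes = block.x * sizeof(float);
cudaMemset(d_output, 0, sizeof(float));       // atomicAdd accumulates into this
Reduce1dSharedGlobal<<<grid, block, shmem_bytes>>>(d_input, d_output, numel);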

template <int COARSE_FACTOR>
__global__ void Reduce1dSharedGlobalCoarse(float *input, float *output,
const int numel) {
const int local_id = threadIdx.x;
const int global_offset = COARSE_FACTOR * 2 * blockDim.x * blockIdx.x;
const int global_id = local_id + global_offset;
extern __shared__ float shmem[]; // Size blockDim.x
// First iter pulled out of loop
float sum = input[global_id];
#pragma unroll
for (int tile = 1; tile < COARSE_FACTOR * 2; tile++) {
sum += input[global_id + tile * blockDim.x];
}
shmem[local_id] = sum;
__syncthreads();
for (unsigned stride{blockDim.x / 2}; stride >= 1; stride /= 2) {
if (local_id < stride) {
shmem[local_id] += shmem[local_id + stride];
}
__syncthreads();
}

if (local_id == 0) {
atomicAdd(output, shmem[0]);
}
}
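With coarsening, each thread first accumulates COARSE_FACTOR * 2 elements strided by blockDim.x before the shared-memory tree reduction, so a block now covers COARSE_FACTOR * 2 * blockDim.x elements and the grid shrinks by the coarsening factor. A sketch matching the configuration in main (placeholder pointers again):

// Sketch only: coarsened blocks cover COARSE_FACTOR * 2 * blockDim.x elements each.
constexpr int coarse_factor = 2;
const int numel = 4096;
dim3 block(1024);
dim3 grid(ceil_div(numel, block.x * 2 * coarse_factor));   // 1 block here
size_t shmem_bytes = block.x * sizeof(float);
cudaMemset(d_output, 0, sizeof(float));
Reduce1dSharedGlobalCoarse<coarse_factor>
    <<<grid, block, shmem_bytes>>>(d_input, d_output, numel);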

float cpp_kernel(std::vector<float> &input) {
const auto n_elements = input.size();
std::vector<float> input_copy(input.size());
@@ -37,20 +118,28 @@ float cpp_kernel(std::vector<float> &input) {
return out;
}

void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block) {
void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block,
std::optional<size_t> shmem) {
one_d tensor_extents({numel});

HostTensor<float, one_d> input_vec(tensor_extents);
HostTensor<float, one_d> out_sum(one_d({1}));

fill_random(input_vec.data_);
// std::fill(input_vec.data_.begin(), input_vec.data_.end(), 1);
std::fill(out_sum.data_.begin(), out_sum.data_.end(), 0);

auto input_vec_d = input_vec.to_device();
auto out_sum_d = out_sum.to_device();

func<<<grid, block>>>(input_vec_d.data_ptr(), out_sum_d.data_ptr(),
tensor_extents.numel());
if (shmem.has_value()) {
func<<<grid, block, shmem.value()>>>(
input_vec_d.data_ptr(), out_sum_d.data_ptr(), tensor_extents.numel());

} else {
func<<<grid, block>>>(input_vec_d.data_ptr(), out_sum_d.data_ptr(),
tensor_extents.numel());
}
cudaCheckErrors("kernel launch failure");
cudaDeviceSynchronize();

@@ -62,7 +151,7 @@ void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block) {
const auto cpp_anwser = cpp_kernel(input_vector);

float diff = fabs(cpp_anwser - host_output_ptr[0]);
if (diff > 1e-3) {
if (diff > 5e-3) {
std::string error_string = "Houston we have a problem!\n";
error_string += fmt::format("Found a deviation of {}\n", diff);
error_string += fmt::format("Cpp anwser: {}, GPU anwser: {}\n", cpp_anwser,
@@ -74,14 +163,40 @@
}

int main() {
// Standard Matmul
constexpr int max_length = 1024;
constexpr int block_size = max_length/2;
constexpr int max_length = 2048;
constexpr int block_size = max_length / 2;

dim3 grid(1);
dim3 block(block_size);

Test(Reduce1dInplace, max_length, grid, block);
// Base case: in-place writes, poor access ordering
fmt::print("• Reduce1dInplace Test: ");
Test(Reduce1dInplace, max_length, grid, block, std::nullopt);

// In-place writes, better ordering (descending stride keeps active threads contiguous)
fmt::print("• Reduce1dInplaceBetterOrdering Test: ");
Test(Reduce1dInplaceBetterOrdering, max_length, grid, block, std::nullopt);

// Dynamic shmem version
fmt::print("• Reduce1dShared Test: ");
size_t shmem{block.x * sizeof(float)};
Test(Reduce1dShared, max_length, grid, block, shmem);

// Test reductions larger than a single thread block
constexpr int max_length_global = 2048 * 2;

block.x = 1024;
grid.x = ceil_div(max_length_global, block.x * 2);
shmem = block.x * sizeof(float);
fmt::print("• Reduce1dSharedGlobal Test: ");
Test(Reduce1dSharedGlobal, max_length_global, grid, block, shmem);

constexpr int coarse_factor = 2;
grid.x = ceil_div(max_length_global, block.x * 2 * coarse_factor);
shmem = block.x * sizeof(float);
fmt::print("• Reduce1dSharedGlobalCoarse Test: ");
Test(Reduce1dSharedGlobalCoarse<coarse_factor>, max_length_global, grid,
block, shmem);
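As a quick check on the launch arithmetic above (ceil_div_c is a constexpr stand-in for the project's ceil_div helper):

constexpr int ceil_div_c(int a, int b) { return (a + b - 1) / b; }
static_assert(ceil_div_c(4096, 1024 * 2) == 2, "Reduce1dSharedGlobal runs 2 blocks");
static_assert(ceil_div_c(4096, 1024 * 2 * 2) == 1, "coarse variant runs 1 block");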

// profile the relevant kernels:
// ncu -k "regex:reduce" ./bin/conv1d
2 changes: 1 addition & 1 deletion src/include/utils.h
@@ -29,7 +29,7 @@ template <typename T> void fill_random(T& input) {
std::random_device rnd_device;
// Specify the engine and distribution.
std::mt19937 mersenne_engine{rnd_device()}; // Generates random integers
std::normal_distribution<float> dist{2, 1};
std::normal_distribution<float> dist{0, 1};
auto gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };

std::generate(input.begin(), input.end(), gen);
