Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup CUDA, Reuse Memory, Add Serial Model, Cleanup Std Parallelism #202

Open
wants to merge 14 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,18 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG))
message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`")
endif ()

option(BUILD_NATIVE "Builds for the current systems CPU and GPU architecture." ON)

# setup some defaults flags for everything
set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer)
set(DEFAULT_RELEASE_FLAGS -O3 -march=native)
set(DEFAULT_RELEASE_FLAGS -O3)
if (BUILD_NATIVE)
if(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64)
set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -mcpu=native)
else()
set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native)
endif()
endif()

macro(hint_flag FLAG DESCRIPTION)
if (NOT DEFINED ${FLAG})
Expand Down Expand Up @@ -146,17 +155,15 @@ endif ()
include(cmake/register_models.cmake)

# register out models <model_name> <preprocessor_def_name> <source files...>
register_model(serial SERIAL SerialStream.cpp)
register_model(omp OMP OMPStream.cpp)
register_model(ocl OCL OCLStream.cpp)
register_model(std-data STD_DATA STDDataStream.cpp)
register_model(std-indices STD_INDICES STDIndicesStream.cpp)
register_model(std-ranges STD_RANGES STDRangesStream.cpp)
register_model(std STD STDStream.cpp)
register_model(hip HIP HIPStream.cpp)
register_model(cuda CUDA CUDAStream.cu)
register_model(kokkos KOKKOS KokkosStream.cpp)
register_model(sycl SYCL SYCLStream.cpp)
register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp)
register_model(sycl2020-usm SYCL2020 SYCLStream2020.cpp)
register_model(sycl2020 SYCL2020 SYCLStream2020.cpp)
register_model(acc ACC ACCStream.cpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(raja USE_RAJA RAJAStream.cpp)
Expand Down
14 changes: 6 additions & 8 deletions src/Stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,13 @@

#pragma once

#include <cstdint>
#include <array>
#include <vector>
#include <string>
#include "benchmark.h"

// Array values
#define startA (0.1)
#define startB (0.2)
#define startC (0.0)
#define startScalar (0.4)
using std::intptr_t;

template <class T>
class Stream
Expand All @@ -31,9 +30,8 @@ class Stream
virtual void nstream() = 0;
virtual T dot() = 0;

// Copy memory between host and device
virtual void init_arrays(T initA, T initB, T initC) = 0;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
// Set pointers to read from arrays
virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
};

// Implementation specific device functions
Expand Down
54 changes: 24 additions & 30 deletions src/StreamModels.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,8 @@

#if defined(CUDA)
#include "CUDAStream.h"
#elif defined(STD_DATA)
#include "STDDataStream.h"
#elif defined(STD_INDICES)
#include "STDIndicesStream.h"
#elif defined(STD_RANGES)
#include "STDRangesStream.hpp"
#elif defined(STD)
#include "STDStream.h"
#elif defined(TBB)
#include "TBBStream.hpp"
#elif defined(THRUST)
Expand All @@ -31,71 +27,69 @@
#include "SYCLStream2020.h"
#elif defined(OMP)
#include "OMPStream.h"
#elif defined(SERIAL)
#include "SerialStream.h"
#elif defined(FUTHARK)
#include "FutharkStream.h"
#endif

template <typename T>
std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
template <typename T, typename...Args>
std::unique_ptr<Stream<T>> make_stream(Args... args) {
#if defined(CUDA)
// Use the CUDA implementation
return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
return std::make_unique<CUDAStream<T>>(args...);

#elif defined(HIP)
// Use the HIP implementation
return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
return std::make_unique<HIPStream<T>>(args...);

#elif defined(HC)
// Use the HC implementation
return std::make_unique<HCStream<T>>(array_size, deviceIndex);
return std::make_unique<HCStream<T>>(args...);

#elif defined(OCL)
// Use the OpenCL implementation
return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
return std::make_unique<OCLStream<T>>(args...);

#elif defined(USE_RAJA)
// Use the RAJA implementation
return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
return std::make_unique<RAJAStream<T>>(args...);

#elif defined(KOKKOS)
// Use the Kokkos implementation
return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
return std::make_unique<KokkosStream<T>>(args...);

#elif defined(STD_DATA)
#elif defined(STD)
// Use the C++ STD data-oriented implementation
return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);

#elif defined(STD_INDICES)
// Use the C++ STD index-oriented implementation
return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);

#elif defined(STD_RANGES)
// Use the C++ STD ranges implementation
return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
return std::make_unique<STDStream<T>>(args...);

#elif defined(TBB)
// Use the C++20 implementation
return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
return std::make_unique<TBBStream<T>>(args...);

#elif defined(THRUST)
// Use the Thrust implementation
return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
return std::make_unique<ThrustStream<T>>(args...);

#elif defined(ACC)
// Use the OpenACC implementation
return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
return std::make_unique<ACCStream<T>>(args...);

#elif defined(SYCL) || defined(SYCL2020)
// Use the SYCL implementation
return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
return std::make_unique<SYCLStream<T>>(args...);

#elif defined(OMP)
// Use the OpenMP implementation
return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
return std::make_unique<OMPStream<T>>(args...);

#elif defined(SERIAL)
// Use the Serial implementation
return std::make_unique<SerialStream<T>>(args...);

#elif defined(FUTHARK)
// Use the Futhark implementation
return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
return std::make_unique<FutharkStream<T>>(args...);

#else

Expand Down
20 changes: 10 additions & 10 deletions src/acc/ACCStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
#include "ACCStream.h"

template <class T>
ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE}
ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
T initA, T initB, T initC)
: array_size{array_size}
{
acc_device_t device_type = acc_get_device_type();
acc_set_device_num(device, device_type);
acc_set_device_num(device_id, device_type);

// Set up data region on device
this->a = new T[array_size];
Expand All @@ -25,6 +26,8 @@ ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)

#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
{}

init_arrays(initA, initB, initC);
}

template <class T>
Expand Down Expand Up @@ -62,20 +65,17 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
}

template <class T>
void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
{
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
{}

for (intptr_t i = 0; i < array_size; i++)
{
h_a[i] = a[i];
h_b[i] = b[i];
h_c[i] = c[i];
}
h_a = a;
h_b = b;
h_c = c;
}

template <class T>
Expand Down
33 changes: 13 additions & 20 deletions src/acc/ACCStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,32 +19,25 @@
template <class T>
class ACCStream : public Stream<T>
{
struct A{
T *a;
T *b;
T *c;
};

protected:
// Size of arrays
intptr_t array_size;
A aa;
// Device side pointers
T *a;
T *b;
T *c;
T* restrict a;
T* restrict b;
T* restrict c;

public:
ACCStream(const intptr_t, int);
ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
T initA, T initB, T initC);
~ACCStream();

virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
void copy() override;
void add() override;
void mul() override;
void triad() override;
void nstream() override;
T dot() override;

virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
void get_arrays(T const*& a, T const*& b, T const*& c) override;
void init_arrays(T initA, T initB, T initC);
};
66 changes: 66 additions & 0 deletions src/benchmark.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#pragma once

#include <algorithm>
#include <array>
#include <cstdlib>
#include <initializer_list>
#include <iostream>

// Array values
#define startA (0.1)
#define startB (0.2)
#define startC (0.0)
#define startScalar (0.4)

// Benchmark Identifier: names an individual benchmark or a group of benchmarks:
// - Classic: the 5 classic kernels: Copy, Mul, Add, Triad, Dot.
// - All: all kernels.
// - Otherwise: one individual kernel.
enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};

// Metadata for one kernel benchmark: identity, display label, and the
// information needed to compute its achieved memory bandwidth.
struct Benchmark {
  // Which kernel this entry describes.
  BenchId id;
  // Human-readable kernel name used when reporting results.
  char const* label;
  // Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW:
  // bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
  size_t weight;
  // Is it one of: Copy, Mul, Add, Triad, Dot?
  bool classic = false;
};

// Benchmarks listed in the order in which - when selected - they should be run,
// for validation purposes:
constexpr size_t num_benchmarks = 6;
constexpr std::array<Benchmark, num_benchmarks> bench = {
  Benchmark { .id = BenchId::Copy,    .label = "Copy",    .weight = 2, .classic = true },
  Benchmark { .id = BenchId::Mul,     .label = "Mul",     .weight = 2, .classic = true },
  Benchmark { .id = BenchId::Add,     .label = "Add",     .weight = 3, .classic = true },
  Benchmark { .id = BenchId::Triad,   .label = "Triad",   .weight = 3, .classic = true },
  Benchmark { .id = BenchId::Dot,     .label = "Dot",     .weight = 2, .classic = true },
  Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
};

// Which buffers are needed by each benchmark.
// Returns true when benchmark `id` touches buffer `n` ('a', 'b' or 'c'),
// so callers can allocate/validate only the arrays a kernel actually uses.
inline bool needs_buffer(BenchId id, char n) {
  // True when `n` appears in the given list of buffer names.
  auto in = [n](std::initializer_list<char> values) {
    return std::find(values.begin(), values.end(), n) != values.end();
  };
  switch(id) {
    case BenchId::All:     return in({'a','b','c'});
    case BenchId::Classic: return in({'a','b','c'});
    case BenchId::Copy:    return in({'a','c'});
    case BenchId::Mul:     return in({'b','c'});
    case BenchId::Add:     return in({'a','b','c'});
    case BenchId::Triad:   return in({'a','b','c'});
    case BenchId::Dot:     return in({'a','b'});
    case BenchId::Nstream: return in({'a','b','c'});
  }
  // Unreachable for valid BenchId values; no `default:` case above so that
  // -Wswitch flags any enumerator added without a corresponding entry here.
  std::cerr << "Unknown benchmark" << std::endl;
  std::abort(); // was bare abort(); std::abort is declared in <cstdlib>
}

// Returns true if the benchmark needs to be run:
// - All selects every benchmark,
// - Classic selects the classic kernels,
// - otherwise only the exact matching benchmark runs.
inline bool run_benchmark(BenchId selection, Benchmark const& b) {
  switch (selection) {
    case BenchId::All:
      return true;
    case BenchId::Classic:
      return b.classic || b.id == BenchId::Classic;
    default:
      return selection == b.id;
  }
}
Loading
Loading