introduction_example.cu
#include <iostream>
#include <vector>
#include <cuda_runtime_api.h>
#include <cublasdx.hpp>
#include "common.hpp"
#include "reference.hpp"
template<class GEMM>
__global__ void gemm_kernel(const typename GEMM::c_value_type  alpha,
                            const typename GEMM::a_value_type* a,
                            const typename GEMM::b_value_type* b,
                            const typename GEMM::c_value_type  beta,
                            typename GEMM::c_value_type*       c) {
    extern __shared__ __align__(16) char smem[];

    // Make global memory tensors
    auto a_global_tensor = cublasdx::make_tensor(a, GEMM::get_layout_gmem_a());
    auto b_global_tensor = cublasdx::make_tensor(b, GEMM::get_layout_gmem_b());
    auto c_global_tensor = cublasdx::make_tensor(c, GEMM::get_layout_gmem_c());

    // Make shared memory tensors
    auto [smem_a, smem_b, smem_c] = GEMM::slice_shared_memory(smem);
    auto a_shared_tensor = cublasdx::make_tensor(smem_a, GEMM::get_layout_smem_a());
    auto b_shared_tensor = cublasdx::make_tensor(smem_b, GEMM::get_layout_smem_b());
    auto c_shared_tensor = cublasdx::make_tensor(smem_c, GEMM::get_layout_smem_c());

    // Load data from global memory tensors to shared memory tensors
    using alignment = cublasdx::alignment_of<GEMM>;
    cublasdx::copy<GEMM, alignment::a>(a_global_tensor, a_shared_tensor);
    cublasdx::copy<GEMM, alignment::b>(b_global_tensor, b_shared_tensor);
    cublasdx::copy<GEMM, alignment::c>(c_global_tensor, c_shared_tensor);
    cublasdx::copy_wait();

    // Execute GEMM
    GEMM().execute(alpha, a_shared_tensor, b_shared_tensor, beta, c_shared_tensor);
    __syncthreads();

    // Store data from shared memory tensor to global memory tensor
    cublasdx::copy<GEMM, alignment::c>(c_shared_tensor, c_global_tensor);
}
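
// Host-side driver: describes the GEMM by composing cuBLASDx operators
// (size, precision, type, arrangement, function, SM target, block execution),
// fills A, B, and C in managed memory, and launches gemm_kernel once.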
template<unsigned int Arch>
int introduction_example() {
    using GEMM = decltype(cublasdx::Size<32, 32, 32>()
                          + cublasdx::Precision<double>()
                          + cublasdx::Type<cublasdx::type::real>()
                          + cublasdx::Arrangement<cublasdx::row_major, cublasdx::col_major>()
                          + cublasdx::Function<cublasdx::function::MM>()
                          + cublasdx::SM<Arch>()
                          + cublasdx::Block()
                          + cublasdx::BlockDim<256>());

    using value_type = typename example::uniform_value_type_t<GEMM>;
    constexpr auto global_a_size = example::global_memory_size_of<GEMM>::a_size;
    constexpr auto global_b_size = example::global_memory_size_of<GEMM>::b_size;
    constexpr auto global_c_size = example::global_memory_size_of<GEMM>::c_size;

    // Allocate managed memory for A, B, C matrices in one go
    value_type* abc;
    auto size       = global_a_size + global_b_size + global_c_size;
    auto size_bytes = size * sizeof(value_type);
    CUDA_CHECK_AND_EXIT(cudaMallocManaged(&abc, size_bytes));
    // Generate data: fill A, B, and C with values in [0, 1)
    for (size_t i = 0; i < size; i++) {
        abc[i] = double(i) / double(size);
    }
    value_type* a = abc;
    value_type* b = abc + global_a_size;
    value_type* c = abc + global_a_size + global_b_size;

    // Launch the kernel with GEMM::block_dim threads in a single CUDA block
    // and GEMM::shared_memory_size bytes of dynamic shared memory
    gemm_kernel<GEMM><<<1, GEMM::block_dim, GEMM::shared_memory_size>>>(1.0, a, b, 1.0, c);
    CUDA_CHECK_AND_EXIT(cudaPeekAtLastError());
    CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());

    CUDA_CHECK_AND_EXIT(cudaFree(abc));
    std::cout << "Success" << std::endl;
    return 0;
}
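
// example::sm_runner (from common.hpp) is expected to detect the SM
// architecture of the current device and run the functor instantiated for it.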
template<unsigned int Arch>
struct introduction_example_functor {
    int operator()() { return introduction_example<Arch>(); }
};

int main(int, char**) {
    return example::sm_runner<introduction_example_functor>();
}