-
Notifications
You must be signed in to change notification settings - Fork 0
/
matrix_multiply.cu
189 lines (139 loc) · 4.73 KB
/
matrix_multiply.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/*
* CUDA matrix multiply
*/
#include <vector>
#include <iostream>
#include <sstream>
#include <cassert>
#include <chrono>
#include "utils.hxx"
#include "bits.hxx"
#include "matrix.hxx"
using namespace std;
const int BS = 32; // The blocks are BS x BS.
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
// Simple algorithm for multiplying two matrices.
void Matrix::matmul_naive(const Matrix &A, const Matrix &B) {
int N = this->N();
for (int r=0; r<N; ++r){
for (int k=0; k<N; ++k){
for (int c=0; c<N; ++c) {
if (k==0){
(*this)(r,c) = 0.0F;
}
(*this)(r,c) += (A(r,k) * B(k,c));
}
}
}
}
// CUDA kernel function
__global__ void matmul_kernel (float *d_A, float *d_B, float *d_C, int N) {
int rB = blockIdx.x;
int cB = blockIdx.y;
int ri = threadIdx.x;
int ci = threadIdx.y;
__shared__ float SA[BS][BS];
__shared__ float SB[BS][BS];
// printf("In thread with r=(%d,%d) c=(%d,%d)\n", rB,ri,cB,ci);
// get row and column locations
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int sum = 0;
for (int kB = 0; kB < gridDim.x; kB++){
// coalesced memory access
SA[ci][ri] = d_A[row*N + (kB*BS + ri)];
SB[ci][ri] = d_B[(kB*BS + ci)*N + col];
__syncthreads();
for (int ki = 0; ki < BS; ki++){
// shared-memory bank access
sum += SA[ci][ki] * SB[ki][ri];
}
__syncthreads();
}
// store results back into device memory
d_C[row * N + col] = sum;
}
// host function
// allocates memory and moves data between CPU and GPU
void Matrix::matmul_host (const Matrix &A, const Matrix &B, int BS) {
int numElem=N()*N();
int sizeBytes = numElem*4;
// Copy A from host memory to device memory.
float *d_A = NULL;
cudaError_t err = cudaMalloc((void **)&d_A, sizeBytes);
ERR_CHK (err, "Failed to allocate device matrix A");
err = cudaMemcpy (d_A, A.data.data(), sizeBytes, cudaMemcpyHostToDevice);
ERR_CHK (err, "Failed to copy matrix A from host to device");
// Allocate device memory for B.
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, sizeBytes);
ERR_CHK (err, "Failed to allocate device matrix B");
// Copy B from host memory to device memory.
err = cudaMemcpy(d_B, B.data.data(), sizeBytes, cudaMemcpyHostToDevice);
ERR_CHK(err, "Failed to copy matrix B from host to device");
// Allocate device memory for C.
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, sizeBytes);
ERR_CHK (err, "Failed to allocate device matrix C");
// Launch the CUDA Kernel
dim3 thBlocks(N()/BS, N()/BS);
dim3 threads(BS, BS);
// matrix multiplication
matmul_kernel <<< thBlocks, threads >>> (d_A, d_B, d_C, N());
// Copy the result from device memory to host memory.
err = cudaMemcpy(this->data.data(), d_C, sizeBytes, cudaMemcpyDeviceToHost);
ERR_CHK (err, "Failed to copy result C from device to host");
// Free device memory.
err = cudaFree(d_A);
ERR_CHK (err, "Failed to free CUDA matrix A");
err = cudaFree(d_B);
ERR_CHK (err, "Failed to free CUDA matrix B");
err = cudaFree(d_C);
ERR_CHK (err, "Failed to free CUDA matrix C");
}
// This function executes the various pieces for a given matrix size.
static void run (int N) {
Matrix a(N), b(N), c(N), d(N);
// initialize A and B
a.init_random(5);
b.init_random(10);
//a.init_cyclic_order();
//b.init_count_order();
//b.init_identity();
LOG (endl<<"Working on "<<N<<"x"<<N<<" matrices with BS="<<BS);
//LOG ("A="<<a.str());
//LOG ("B="<<b.str());
// Compute the reference solution
auto start = start_time();
c.matmul_naive(b, a);
long int time = delta_usec (start);
LOG ("Naive matrix multiplication took "<<(time/1000000.0)<<"sec");
long int total_time=0;
for (int i=0; i<4; ++i) {
auto start = start_time();
d.matmul_host (b, a, BS);
long int time = delta_usec (start);
total_time += time;
c.compare (d);
}
LOG ("matmul_host averaged "<<(total_time/4000000.0)<<"sec");
}
// Main() lives on the CPU.
int main() {
// Check the compute capability
cudaDeviceProp prop;
int device=0;
cudaError_t err = cudaGetDeviceProperties (&prop, device);
ERR_CHK (err, "Failed to get compute capability");
int major = prop.major, minor=prop.minor;
LOG ("Compute capability = "<<major<<"."<<minor);
// Test a bunch of different matrix sizes
run (32);
run (64);
run (256);
run (1024);
run (2048);
run (4096);
return (0);
}