-
Notifications
You must be signed in to change notification settings - Fork 1
/
tune_kernel_array_beam_slave_sincos.py
executable file
·105 lines (82 loc) · 3.57 KB
/
tune_kernel_array_beam_slave_sincos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
import numpy as np
import kernel_tuner
from collections import OrderedDict
def power_bit_length(x):
    """Return the smallest power of two that is not less than ``x``.

    Intended for positive integers (e.g. 500 -> 512, 1024 -> 1024).
    ``x - 1`` is truncated with ``int`` before its bit length is taken;
    values below 1 are not treated specially.
    """
    # The bit length of x - 1 is the exponent of the next power of two.
    exponent = int(x - 1).bit_length()
    return 1 << exponent
def get_input_data(Nelem):
    """Build the deterministic argument tuple used to run the kernels.

    Evenly spaced values are used instead of random ones so that every
    run works on the same inputs.

    Args:
        Nelem: number of elements in each of the ``x``, ``y`` and ``z``
            arrays.

    Returns:
        An 8-tuple ``(Nelem, r1, r2, r3, x, y, z, tar)`` where

        * ``Nelem`` is converted to ``np.int32``,
        * ``r1``, ``r2``, ``r3`` are ``np.float32`` scalars
          (0.1, 0.4 and 0.7),
        * ``x``, ``y``, ``z`` are ``np.float32`` arrays of length
          ``Nelem``: three consecutive slices of one evenly spaced ramp
          over [0.1, 1),
        * ``tar`` is a zero-initialised ``np.float32`` array of length 2
          that serves as the output buffer.
    """
    r1, r2, r3 = np.linspace(0.1, 1, 3, endpoint=False).astype(np.float32)

    # One ramp of 3 * Nelem samples, split row-wise into x, y and z.
    ramp = np.linspace(0.1, 1, 3 * Nelem, endpoint=False).astype(np.float32)
    x, y, z = ramp.reshape(3, Nelem)

    # Zero-fill the output buffer: np.empty would hand the kernels
    # arbitrary memory contents and make the inputs non-reproducible.
    tar = np.zeros(2, dtype=np.float32)

    return np.int32(Nelem), r1, r2, r3, x, y, z, tar
def call_reference_kernel(Nelem, r1, r2, r3, x, y, z, tar):
    """Run the original kernel once and return its result for ``tar``.

    The kernel source is read from ``predict_model_snippet.cu``. The
    kernel is launched with a block size of ``Nelem`` threads and is
    given one extra trailing argument: ``Nelem`` rounded up to a power
    of two.

    Args:
        Nelem, r1, r2, r3, x, y, z, tar: the values produced by
            ``get_input_data``, in the same order.

    Returns:
        Element 7 of what ``kernel_tuner.run_kernel`` returns, i.e. the
        slot that corresponds to ``tar`` in the argument list.
    """
    with open('predict_model_snippet.cu', 'r') as source_file:
        cuda_source = source_file.read()

    padded_block_dim = np.int32(power_bit_length(Nelem))
    kernel_args = [np.int32(Nelem), r1, r2, r3, x, y, z, tar,
                   padded_block_dim]
    launch_params = {"block_size_x": int(Nelem)}

    outputs = kernel_tuner.run_kernel(
        "kernel_array_beam_slave_sincos_original", cuda_source, 1,
        kernel_args, launch_params, grid_div_x=[])
    return outputs[7]
def test_manual_kernel():
    """Check that ``sincos_manual`` reproduces the original kernel's output.

    Both kernels are run on the same 500-element input; ``sincos_manual``
    uses a block size of 32. The ``tar`` results must agree within an
    absolute tolerance of 1e-6.
    """
    with open('predict_model_snippet.cu', 'r') as source_file:
        cuda_source = source_file.read()

    kernel_args = get_input_data(500)
    expected = call_reference_kernel(*kernel_args)

    launch_params = {"block_size_x": 32}
    outputs = kernel_tuner.run_kernel(
        "sincos_manual", cuda_source, 1,
        kernel_args, launch_params, grid_div_x=[])
    actual = outputs[7]  # slot of ``tar`` in the argument list

    assert np.allclose(expected, actual, atol=1e-6)
def test_cub_kernel():
    """Check that ``sincos_cub`` reproduces the original kernel's output.

    Both kernels are run on the same 500-element input; ``sincos_cub``
    uses a block size of 192. The ``tar`` results must agree within an
    absolute tolerance of 1e-6.
    """
    with open('predict_model_snippet.cu', 'r') as source_file:
        cuda_source = source_file.read()

    kernel_args = get_input_data(500)
    expected = call_reference_kernel(*kernel_args)

    launch_params = {"block_size_x": 192}
    outputs = kernel_tuner.run_kernel(
        "sincos_cub", cuda_source, 1,
        kernel_args, launch_params, grid_div_x=[])
    actual = outputs[7]  # slot of ``tar`` in the argument list

    assert np.allclose(expected, actual, atol=1e-6)
def tune_kernels():
    """Time the original kernel and tune the manual and CUB variants.

    All three kernels are read from ``predict_model_snippet.cu`` and run
    on the same 500-element input. The output of the original kernel,
    computed once up front, is passed to Kernel Tuner as the expected
    answer for ``tar`` so that every configuration is verified.

    Returns:
        A pair ``(manual_kernel, cub_kernel)``: the first element of
        what ``kernel_tuner.tune_kernel`` returns for ``sincos_manual``
        and for ``sincos_cub`` respectively.
    """
    with open('predict_model_snippet.cu', 'r') as f:
        kernel_string = f.read()

    Nelem = 500
    args = get_input_data(Nelem)

    # Tell the Kernel Tuner how the grid dimensions are to be computed.
    # With an empty grid_div_x the problem size is not divided by the
    # block size, so it is used directly as the number of thread blocks.
    problem_size = 100000
    grid_div_x = []

    # Compute the reference answer using the original kernel. Only the
    # last argument (tar) is checked; None entries are not verified.
    reference = call_reference_kernel(*args)
    answer = [None, None, None, None, None, None, None, reference]

    tune_params = OrderedDict()

    # "Tune" the original kernel. Actually, run it once, to compare the
    # run times with the other kernels. It only gives the correct answer
    # for a block size equal to Nelem, so there is nothing to be tuned.
    tune_params["block_size_x"] = [int(Nelem)]
    # The original kernel is launched with one extra trailing argument
    # (Nelem rounded up to a power of two), exactly as in
    # call_reference_kernel; the answer list gets a matching None.
    original_args = list(args) + [np.int32(power_bit_length(Nelem))]
    original_answer = answer + [None]
    kernel_tuner.tune_kernel(
        "kernel_array_beam_slave_sincos_original", kernel_string,
        problem_size, original_args, tune_params,
        grid_div_x=grid_div_x, verbose=True, answer=original_answer)

    # Tune the kernel with the manual reduction loop: powers of two
    # from 32 up to 1024.
    tune_params["block_size_x"] = [2**i for i in range(5, 11)]
    manual_kernel, _ = kernel_tuner.tune_kernel(
        "sincos_manual", kernel_string, problem_size,
        args, tune_params,
        grid_div_x=grid_div_x, verbose=True, answer=answer)

    # Tune the kernel that uses CUB for reductions: multiples of 32
    # from 32 up to 1024.
    tune_params["block_size_x"] = [32*i for i in range(1, 33)]
    cub_kernel, _ = kernel_tuner.tune_kernel(
        "sincos_cub", kernel_string, problem_size,
        args, tune_params,
        grid_div_x=grid_div_x, verbose=True, answer=answer)

    return manual_kernel, cub_kernel
if __name__ == "__main__":
    # Verify both rewritten kernels against the original one first,
    # then run the tuning session.
    for step in (test_manual_kernel, test_cub_kernel, tune_kernels):
        step()