# These examples show how to implement neural networks with RISC-V assembly code
import numpy as np
import keras as kr
from keras.layers import Conv2D, DepthwiseConv2D
from keras.initializers import constant
from machine import machine
# alternatively, uncomment the line below to use the tinyfive package
# from tinyfive.machine import machine
from layers import *
np.random.seed(5) # fix seed for reproducible results
m = machine(mem_size=10000000) # instantiate RISC-V machine with 10MB of memory
# TODO: reduce to 500KB once we use branches to reduce image size
# abbreviations for shape dimensions:
# C : input channels (and output channels if the same as input channels)
# F : output channels (or filters), only used if F is not the same as C
# R : input resolution (and output resolution if the same as input).
# Q : output resolution, only used if Q is not the same as R
#-------------------------------------------------------------------------------
# Example 1: 4x4 Dense layer
#-------------------------------------------------------------------------------
print('-------------- Example 1: Dense layer ---------------------------------')
# This is a very small dense layer example using floating point
# generate 4x4 matrices A and B (float32) and store them in memory
a = np.random.normal(size=(4, 4)).astype(np.float32)
b = np.random.normal(size=(4, 4)).astype(np.float32)
m.write_f32_vec(a.flatten(), 0) # write matrix A to mem[0]
m.write_f32_vec(b.flatten(), 4*16) # write matrix B to mem[4*16]
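# Resulting address map (hedged note, for orientation only): A[i, j] is at byte
# address 4*(4*i + j), B[i, j] at 4*(16 + 4*i + j), and the result matrix will be
# stored starting at byte 4*32 = 128; these are the offsets used by the loads and
# stores below.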
# TODO: merge this with conv_1x1 or parameterize it, move into def
# store assembly program starting at address 4*128
m.pc = 4*128
m.lbl('start')
# load the entire B matrix into registers f[16] ... f[31]
for i in range(4):
  for j in range(4):
    m.asm('flw.s', 16+4*i+j, 4*(16+4*i+j), 0)  # load f[16+4*i+j] with B[i, j]
# perform matmul in row-major order
for i in range(4):
  for k in range(4):                            # load f[10] ... f[13] with row i of A
    m.asm('flw.s', 10+k, 4*(4*i+k), 0)          # load f[10+k] with A[i, k]
  for j in range(4):
    m.asm('fmul.s', 15, 10, 16+j)               # f[15] = f[10] * f[16+j] = A[i, 0] * B[0, j]
    for k in range(1, 4):
      m.asm('fmadd.s', 15, 10+k, 16+4*k+j, 15)  # f[15] += A[i, k] * B[k, j]
    m.asm('fsw.s', 15, 4*(32+i*4+j), 0)         # store res[i, j] from f[15]
m.lbl('end')
# execute program from 'start' to 'end'
m.exe(start='start', end='end')
m.print_perf()
# compare results against np.matmul(A, B)
res = m.read_f32_vec(4*32, size=4*4).reshape(4, 4) # read result matrix
m.print_rel_err(res, np.matmul(a, b))
# Output: the relative error should be very small, e.g. below 1e-06, but it can be somewhat larger
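# For reference, a minimal NumPy sketch (hedged, illustrative only) of the same
# row-major multiply-accumulate loop that the assembly program above implements;
# res_ref and ref_err are hypothetical local variables not used elsewhere:
res_ref = np.zeros((4, 4), dtype=np.float32)
for i in range(4):
  for j in range(4):
    acc = a[i, 0] * b[0, j]      # fmul.s
    for k in range(1, 4):
      acc += a[i, k] * b[k, j]   # fmadd.s (fmadd fuses the rounding, so a few ULPs may differ)
    res_ref[i, j] = acc          # fsw.s
ref_err = np.abs(res_ref - res).max()  # should be extremely small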
#-------------------------------------------------------------------------------
# Example 2: Conv2D 1x1, 32 input and 32 output channels, 6x6 image
#-------------------------------------------------------------------------------
print('-------------- Example 2: Conv2D 1x1 layer ----------------------------')
C = 32 # input channels (tested up to 128)
F = C  # output channels (tested up to 128)
R = 6  # resolution of image (tested up to 48 with C = 8)
#-------------------------------------------------------------------------------
# generate activations and weights for keras (suffix *k is for keras)
a_k = np.random.normal(size=(1, R, R, C)).astype(np.float32)
w_k = np.random.normal(size=(1, 1, C, F)).astype(np.float32)
# input shape:  (1, R, R, C) : batch-size, RxR image, channels
# output shape: (1, R, R, F) : batch-size, RxR image, channels
# kernel shape: (1, 1, C, F) : 1x1 kernel, in-channels, out-channels
# run inference with keras (golden reference)
y_k = Conv2D(F, 1, kernel_initializer=constant(w_k))(a_k)
# TODO: use below if you want to add a bias and a ReLU activation (note: b_k is a
# bias vector of shape (F,) that would have to be generated first):
# layer = Conv2D(F, 1, activation="relu", name="layer1", input_shape=(R, R, C),
#                kernel_initializer=constant(w_k),
#                bias_initializer=constant(b_k))
# Instead of using kr.initializers, you could use set_weights() as follows:
# y_k = layer(a_k)  # dummy run with random weights, needed before set_weights()
# layer.set_weights([w_k, b_k])
# y_k = layer(a_k)
# print(layer.get_weights()[0].shape)  # print weight shape
# print(layer.get_weights()[1].shape)  # print bias shape
#-------------------------------------------------------------------------------
# flatten keras tensors
a = a_k.reshape(R*R, C) # a_k (1, R, R, C) -> A (R*R, C)
w = w_k.reshape(C, F) # w_k (1, 1, C, F) -> W (C, F)
y = y_k.numpy().reshape(R*R, F)  # y_k (1, R, R, F) -> Y (R*R, F)
#-------------------------------------------------------------------------------
# proof of concept
# first, compare keras conv2D with simple numpy matmul
m.print_rel_err(np.matmul(a, w), y)
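# Hedged illustration: a 1x1 conv applies the same (C, F) weight matrix to every
# pixel independently, so for a single pixel index p (the value 7 below is an
# arbitrary, hypothetical choice) the output row is just a vector-matrix product:
p = 7
per_pixel_err = np.abs(np.matmul(a[p], w) - y[p]).max()  # should be tiny, e.g. < 1e-5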
# second, run the conv_1x1 proof-of-concept implementation
conv_1x1_concept(m, C, F, R, 4, w, a, y) # set S=4
#-------------------------------------------------------------------------------
# run assembly and compare for various implementations
m.clear_mem()
m.clear_cpu()
# write A and W to memory
a_base = 0
w_base = a_base + R*R*C * 4
y_base = w_base + C*F * 4
code_start = y_base + R*R*F * 4
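# With C = F = 32 and R = 6 these bases work out to (hedged arithmetic, shown only
# for orientation): a_base = 0, w_base = 4608, y_base = 8704, code_start = 13312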
m.write_f32_vec(a.flatten(), a_base) # write A to mem[a_base]
m.write_f32_vec(w.flatten(), w_base) # write W to mem[w_base]
# S=4, trans=False
conv_1x1(m, C, F, R, a_base, w_base, y_base, code_start)
y_asm = m.read_f32_vec(y_base, size=R*R*F).reshape(R*R, F)
m.print_rel_err(y_asm, y)
m.print_perf()
# S=3, trans=False
conv_1x1(m, C, F, R, a_base, w_base, y_base, code_start, S=3)
y_asm = m.read_f32_vec(y_base, size=R*R*F).reshape(R*R, F)
m.print_rel_err(y_asm, y)
#m.print_perf()
# S=4, trans=True (the result is stored transposed as (F, R*R), hence the transpose on read-back)
conv_1x1(m, C, F, R, a_base, w_base, y_base, code_start, trans=True)
y_asm = np.transpose(m.read_f32_vec(y_base, size=R*R*F).reshape(F, R*R), axes=[1, 0])
m.print_rel_err(y_asm, y)
#m.print_perf()
# S=3, trans=True
conv_1x1(m, C, F, R, a_base, w_base, y_base, code_start, trans=True, S=3)
y_asm = np.transpose(m.read_f32_vec(y_base, size=R*R*F).reshape(F, R*R), axes=[1, 0])
m.print_rel_err(y_asm, y)
# S=4, but use conv_1x1_big
conv_1x1_big(m, C, F, R, a_base, w_base, y_base, code_start)
y_asm = m.read_f32_vec(y_base, size=R*R*F).reshape(R*R, F)
m.print_rel_err(y_asm, y)
# S=4, trans=True, but use conv_1x1_big
conv_1x1_big(m, C, F, R, a_base, w_base, y_base, code_start, trans=True)
y_asm = np.transpose(m.read_f32_vec(y_base, size=R*R*F).reshape(F, R*R), axes=[1, 0])
m.print_rel_err(y_asm, y)
# S=3, trans=True, but use conv_1x1_big
conv_1x1_big(m, C, F, R, a_base, w_base, y_base, code_start, trans=True, S=3)
y_asm = np.transpose(m.read_f32_vec(y_base, size=R*R*F).reshape(F, R*R), axes=[1, 0])
m.print_rel_err(y_asm, y)
#-------------------------------------------------------------------------------
# Example 3: Depthwise Conv2D 3x3 with 4 channels, stride=1,2, 6x6 image
#-------------------------------------------------------------------------------
print('-------------- Example 3: Depthwise Conv2D 3x3 layer, stride=1,2 ------')
C = 4 # channels
R = 6 # resolution
Q = R//2 # output resolution for stride=2 only ('same' padding with even R gives an R//2 x R//2 output)
#-------------------------------------------------------------------------------
# generate activations and weights, run inference
a = np.random.normal(size=(1, R, R, C)).astype(np.float32)
w = np.random.normal(size=(3, 3, C)).astype(np.float32)
# activation shape: (1, R, R, C) : batch-size, RxR image, channels
# output shape for stride=2: (1, Q, Q, C) : batch-size, QxQ image, channels
# kernel shape: (3, 3, C) : 3x3 kernel, channels
# run inference with keras (golden reference) for strides 1 and 2:
# y1 refers to stride=1; y2 refers to stride=2
y1_k = DepthwiseConv2D(3, padding='same', depthwise_initializer=constant(w))(a)
y2_k = DepthwiseConv2D(3, padding='same', strides=2, depthwise_initializer=constant(w))(a)
y1 = y1_k.numpy().reshape(R, R, C) # flatten
y2 = y2_k.numpy().reshape(Q, Q, C) # flatten
#-------------------------------------------------------------------------------
# run assembly and compare
m.clear_mem()
m.clear_cpu()
# write A and W to memory
a_base = 0
w_base = a_base + R*R*C *4
y1_base = w_base + 3*3*C *4 # y_base for stride=1
y2_base = y1_base + R*R*C *4 # y_base for stride=2
m.write_f32_vec(np.transpose(a, axes=[3, 0, 1, 2]).flatten(), a_base)
m.write_f32_vec(np.transpose(w, axes=[2, 0, 1]).flatten(), w_base)
# Note on the transpose in the two lines above: we rearrange the tensors so that
# the last axis (channels) becomes the first axis (aka 'channel-first order').
# That matters because flattening in row-major order then places all pixels of each
# channel contiguously in memory, and we process one channel at a time.
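# Hedged illustration of that layout: element a[0, y, x, c] ends up at byte address
# a_base + 4*(c*R*R + y*R + x), so each channel occupies one contiguous block of
# R*R floats. Spot-check for channel 2, row 3, column 1 (arbitrary indices):
spot = m.read_f32_vec(a_base + 4*(2*R*R + 3*R + 1), size=1)[0]
spot_err = abs(spot - a[0, 3, 1, 2])  # should be 0.0 (same float32 value written and read)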
# run assembly code for strides 1 and 2
dw_conv_3x3_stride1(m, C, R, a_base, w_base, y1_base)
dw_conv_3x3_stride2(m, C, R, a_base, w_base, y2_base)
# compare results against keras
y1_asm = np.transpose(m.read_f32_vec(y1_base, size=R*R*C).reshape(C, R, R), axes=[1, 2, 0])
y2_asm = np.transpose(m.read_f32_vec(y2_base, size=Q*Q*C).reshape(C, Q, Q), axes=[1, 2, 0])
m.print_rel_err(y1_asm, y1)
m.print_rel_err(y2_asm, y2)
# now rerun both cases with 'out_chan_first=False' and compare against previous runs
dw_conv_3x3_stride1(m, C, R, a_base, w_base, y1_base, out_chan_first=False)
dw_conv_3x3_stride2(m, C, R, a_base, w_base, y2_base, out_chan_first=False)
y1_asm_t = m.read_f32_vec(y1_base, size=R*R*C).reshape(R, R, C)
y2_asm_t = m.read_f32_vec(y2_base, size=Q*Q*C).reshape(Q, Q, C)
m.print_rel_err(y1_asm_t, y1_asm)
m.print_rel_err(y2_asm_t, y2_asm)
#-------------------------------------------------------------------------------
# Example 4: Conv2D 3x3, 3 in-channels, 8 out-channels, 12x12 image, stride=1,2
#-------------------------------------------------------------------------------
print('-------------- Example 4: Conv2D 3x3 layer, stride=1,2 ----------------')
F = 8 # output-channels
R = 12 # image resolution
Q = R//2 # output resolution for stride 2 only
#-------------------------------------------------------------------------------
# generate activations and weights, run inference
a = np.random.normal(size=(1, R, R, 3)).astype(np.float32)
w = np.random.normal(size=(3, 3, 3, F)).astype(np.float32)
# input shape: (1, R, R, 3) : batch-size, RxR image, channels
# kernel shape: (3, 3, 3, F) : 3x3 kernel, in-channels, out-channels
# output shape for stride=1: (1, R, R, F) : batch-size, RxR image, channels
# output shape for stride=2: (1, Q, Q, F) : batch-size, QxQ image, channels
# run inference with keras (golden reference) for strides 1 and 2:
# y1 refers to stride=1; y2 refers to stride=2
y1_k = Conv2D(F, 3, padding='same', kernel_initializer=constant(w))(a)
y2_k = Conv2D(F, 3, padding='same', strides=2, kernel_initializer=constant(w))(a)
y1 = y1_k.numpy().reshape(R, R, F)
y2 = y2_k.numpy().reshape(Q, Q, F)
#-------------------------------------------------------------------------------
# run assembly and compare
m.clear_mem()
m.clear_cpu()
# write to memory
a_base = 0
w_base = a.size * 4
y1_base = (a.size + w.size) * 4 # y_base for stride=1
y2_base = y1_base + F*R*R * 4 # y_base for stride=2
m.write_f32_vec(a.flatten(), a_base)
m.write_f32_vec(np.transpose(w, axes=[3, 0, 1, 2]).flatten(), w_base)
# transpose W so that the output channel is the first axis
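# Hedged illustration of that weight layout: after the transpose, w[ky, kx, ci, f]
# lands at byte address w_base + 4*(f*27 + ky*9 + kx*3 + ci), so the 27 coefficients
# of each output channel f are contiguous in memory. Spot-check for f=5, ky=1, kx=2,
# ci=0 (arbitrary indices):
spot = m.read_f32_vec(w_base + 4*(5*27 + 1*9 + 2*3 + 0), size=1)[0]
spot_err = abs(spot - w[1, 2, 0, 5])  # should be 0.0 (same float32 value written and read)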
# run assembly code for strides 1 and 2
conv_3x3x3_stride1(m, F, R, a_base, w_base, y1_base)
conv_3x3x3_stride2(m, F, R, a_base, w_base, y2_base)
# compare results against keras (golden reference)
y1_asm = np.transpose(m.read_f32_vec(y1_base, size=R*R*F).reshape(F, R, R), axes=[1, 2, 0])
y2_asm = np.transpose(m.read_f32_vec(y2_base, size=Q*Q*F).reshape(F, Q, Q), axes=[1, 2, 0])
m.print_rel_err(y1_asm, y1)
m.print_rel_err(y2_asm, y2)