-
Notifications
You must be signed in to change notification settings - Fork 6
/
memset64.asm
368 lines (310 loc) · 13.1 KB
/
memset64.asm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
;************************* memset64.asm *************************************
; Author: Agner Fog
; Date created: 2008-07-19
; Last modified: 2013-08-04
; Description:
; Faster version of the standard memset function:
; void * A_memset(void * dest, int c, size_t count);
; Sets 'count' bytes from 'dest' to the 8-bit value 'c'
;
; Overriding standard function memset:
; The alias ?OVR_memset is changed to _memset in the object file if
; it is desired to override the standard library function memset.
;
; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
; extern "C" void SetMemsetCacheLimit(size_t limit); // Change the limit returned by GetMemsetCacheLimit. limit = 0 selects the default
;
; Optimization:
; Uses XMM registers to set 16 bytes at a time (SSE2 version), or YMM
; registers to set 32 bytes at a time (AVX version), with aligned stores.
;
; The latest version of this file is available at:
; www.agner.org/optimize/asmexamples.zip
; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
;******************************************************************************
default rel
global A_memset: function ; Function memset
global ?OVR_memset: function ; ?OVR removed if standard function memset overridden
global memsetSSE2: function ; SSE2 version
global memsetAVX: function ; version for CPUs with fast 256-bit store
global GetMemsetCacheLimit: function ; Data blocks bigger than this will be stored uncached by memset
global SetMemsetCacheLimit: function ; Change limit in GetMemsetCacheLimit
; Imported from cachesize64.asm:
extern DataCacheSize ; Get size of data cache
; Imported from unalignedisfaster64.asm:
extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
; PROLOGM - common entry sequence of memsetAVX and memsetSSE2.
; Takes no macro parameters. Expanding it
;   * leaves the fill value c, zero-extended from 8 bits, in eax
;   * leaves count in rdx and a second copy of count in Rcount2
;   * binds the symbolic register names used by the function bodies:
;       Rdest   = dest, still in its incoming argument register
;       Rcount  = count (rdx on both platforms)
;       Rdest2  = register the function body uses to keep a copy of dest
;       Rcount2 = second copy of count
%macro PROLOGM 0
%define Rcount  rdx                     ; count is kept in rdx on both platforms
%ifdef WINDOWS
; Windows: dest = rcx, c = dl, count = r8
%define Rdest   rcx
%define Rdest2  r9
%define Rcount2 r8
        movzx   eax, dl                 ; fetch c first: rdx is overwritten next
        mov     Rcount, Rcount2         ; rdx = count
%else
; Unix: dest = rdi, c = sil, count = rdx
%define Rdest   rdi
%define Rdest2  rcx
%define Rcount2 rsi
        movzx   eax, sil                ; fetch c first: rsi is overwritten next
        mov     Rcount2, Rcount         ; rsi = copy of count
%endif
%endmacro
SECTION .text align=16
; extern "C" void * A_memset(void * dest, int c, size_t count);
; Public entry point, also reachable under the alias ?OVR_memset.
; No argument register is touched here: control goes straight to whichever
; routine the memsetDispatch pointer currently designates.
A_memset:
?OVR_memset:
        jmp     qword [memsetDispatch]  ; indirect jump through the dispatch pointer
;-----------------------------------------------------------------------
; void * memsetAVX(void * dest, int c, size_t count);
; AVX version: fills large blocks with aligned 256-bit stores.
; In:    dest, c, count in the argument registers decoded by PROLOGM
; Out:   rax = dest
; Roles: eax / xmm0 / ymm0 = fill pattern
;        Rdest  = write pointer, Rcount = count (roles change in the loops)
;        Rdest2 = saved dest for the return value
;        r10    = jump table base (small counts only)
; The small-count code at B050 .. M16 is shared with memsetSSE2.
;-----------------------------------------------------------------------
memsetAVX:                              ; AVX version. Use ymm register
memsetAVX@:                             ; local label
        PROLOGM
        imul    eax, 01010101H          ; Broadcast c into all bytes of eax
        mov     Rdest2, Rdest           ; save dest
        cmp     Rcount, 16
        ja      B100                    ; count > 16: use vector stores

B050:   ; count <= 16. SSE2 version comes in here
        lea     r10, [MemsetJTab]
        jmp     qword [r10+Rcount*8]    ; jump table for small counts

; Separate code for each count from 0 to 16.
; Every entry point falls through the stores below it, so the chain
; entered at Mnn writes exactly nn bytes.
M16:    mov     [Rdest+12], eax
M12:    mov     [Rdest+8],  eax
M08:    mov     [Rdest+4],  eax
M04:    mov     [Rdest],    eax
M00:    mov     rax, Rdest2             ; return dest
        ret

M15:    mov     [Rdest+11], eax
M11:    mov     [Rdest+7],  eax
M07:    mov     [Rdest+3],  eax
M03:    mov     [Rdest+1],  ax
M01:    mov     [Rdest],    al
        mov     rax, Rdest2             ; return dest
        ret

M14:    mov     [Rdest+10], eax
M10:    mov     [Rdest+6],  eax
M06:    mov     [Rdest+2],  eax
M02:    mov     [Rdest],    ax
        mov     rax, Rdest2             ; return dest
        ret

M13:    mov     [Rdest+9],  eax
M09:    mov     [Rdest+5],  eax
M05:    mov     [Rdest+1],  eax
        mov     [Rdest],    al
        mov     rax, Rdest2             ; return dest
        ret

B100:   ; AVX version, Rcount > 16
        movd    xmm0, eax
        pshufd  xmm0, xmm0, 0           ; Broadcast c into all bytes of xmm0
        lea     rax, [Rdest+Rcount]     ; point to end
        cmp     Rcount, 20H
        jbe     K600                    ; faster to use xmm registers if small

        ; Store the first possibly unaligned 16 bytes
        ; It is faster to always write 16 bytes, possibly overlapping
        ; with the subsequent regular part, than to make possibly mispredicted
        ; branches depending on the size of the first part.
        movups  oword [Rdest], xmm0

        ; store another 16 bytes, aligned
        add     Rdest, 10H
        and     Rdest, -10H
        movaps  oword [Rdest], xmm0

        ; go to next 32 bytes boundary
        add     Rdest, 10H
        and     Rdest, -20H

        ; Check if count very big
        cmp     Rcount, [MemsetCacheLimit]
        ja      K300                    ; Use non-temporal store if count > MemsetCacheLimit

        ; find last 32 bytes boundary
        mov     Rcount, rax
        and     Rcount, -20H

        ; - size of 32-bytes blocks
        sub     Rdest, Rcount
        jnb     K200                    ; Jump if not negative (no whole 32-bytes block)

        ; extend value to 256 bits
        vinsertf128 ymm0,ymm0,xmm0,1

        align   16
K100:   ; Loop through 32-bytes blocks. Register use is swapped
        ; Rcount = end of 32-bytes blocks part
        ; Rdest = negative index from the end, counting up to zero
        vmovaps [Rcount+Rdest], ymm0
        add     Rdest, 20H
        jnz     K100
        vzeroupper                      ; clear upper ymm state before the SSE stores below

K200:   ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
        movups  [rax-20H], xmm0
        movups  [rax-10H], xmm0
        mov     rax, Rdest2             ; return dest
        ret

K300:   ; Use non-temporal moves, same code as above:
        ; find last 32 bytes boundary
        mov     Rcount, rax
        and     Rcount, -20H

        ; - size of 32-bytes blocks
        sub     Rdest, Rcount
        jnb     K500                    ; Jump if not negative (no non-temporal store is executed)

        ; extend value to 256 bits
        vinsertf128 ymm0,ymm0,xmm0,1

        align   16
K400:   ; Loop through 32-bytes blocks. Register use is swapped
        ; Rcount = end of 32-bytes blocks part
        ; Rdest = negative index from the end, counting up to zero
        vmovntps [Rcount+Rdest], ymm0
        add     Rdest, 20H
        jnz     K400
        ; Non-temporal stores are weakly ordered. Fence them so that they are
        ; globally visible before any store the caller executes after memset
        ; returns (e.g. a flag that publishes the buffer to another thread).
        sfence
        vzeroupper                      ; clear upper ymm state before the SSE stores below

K500:   ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
        movups  [rax-20H], xmm0
        movups  [rax-10H], xmm0
        mov     rax, Rdest2             ; return dest
        ret

K600:   ; 16 < count <= 32: two possibly overlapping 16-bytes stores
        movups  [Rdest], xmm0
        movups  [rax-10H], xmm0
        mov     rax, Rdest2             ; return dest
        ret
;-----------------------------------------------------------------------
; void * memsetSSE2(void * dest, int c, size_t count);
; SSE2 version: fills large blocks with aligned 128-bit stores.
; In:    dest, c, count in the argument registers decoded by PROLOGM
; Out:   rax = dest
; Roles: eax / xmm0 = fill pattern
;        Rdest   = write pointer, Rcount = count (roles change in the loops)
;        Rdest2  = saved dest, Rcount2 = saved count (used for the tail)
; Counts of 16 or less are handled by the shared small-count code at B050.
;-----------------------------------------------------------------------
memsetSSE2:                             ; SSE2 version
memsetSSE2@:                            ; local label
        PROLOGM
        imul    eax, 01010101H          ; Broadcast c into all bytes of eax
        mov     Rdest2, Rdest           ; save dest
        cmp     Rcount, 16
        jna     B050                    ; count <= 16: shared small-count code

        ; count > 16. Use SSE2 instruction set
        movd    xmm0, eax
        pshufd  xmm0, xmm0, 0           ; Broadcast c into all bytes of xmm0

        ; Store the first unaligned part.
        ; The size of this part is 1 - 16 bytes.
        ; It is faster to always write 16 bytes, possibly overlapping
        ; with the subsequent regular part, than to make possibly mispredicted
        ; branches depending on the size of the first part.
        movq    qword [Rdest],   xmm0
        movq    qword [Rdest+8], xmm0

        ; Check if count very big
M150:   mov     rax, [MemsetCacheLimit]
        cmp     Rcount, rax
        ja      M500                    ; Use non-temporal store if count > MemsetCacheLimit

        ; Point to end of regular part:
        ; Round down dest+count to nearest preceding 16-bytes boundary
        lea     Rcount, [Rdest+Rcount-1]
        and     Rcount, -10H

        ; Point to start of regular part:
        ; Round up dest to next 16-bytes boundary
        add     Rdest, 10H
        and     Rdest, -10H

        ; -(size of regular part)
        sub     Rdest, Rcount
        jnb     M300                    ; Jump if not negative (regular part is empty)

        align   16
M200:   ; Loop through regular part
        ; Rcount = end of regular part
        ; Rdest = negative index from the end, counting up to zero
        movdqa  [Rcount+Rdest], xmm0
        add     Rdest, 10H
        jnz     M200

M300:   ; Do the last irregular part
        ; The size of this part is 1 - 16 bytes.
        ; It is faster to always write 16 bytes, possibly overlapping
        ; with the preceding regular part, than to make possibly mispredicted
        ; branches depending on the size of the last part.
        mov     rax, Rdest2             ; dest (also the return value)
        movq    qword [rax+Rcount2-10H], xmm0
        movq    qword [rax+Rcount2-8],   xmm0
        ret

M500:   ; Use non-temporal moves, same code as above:
        ; End of regular part:
        ; Round down dest+count to nearest preceding 16-bytes boundary
        lea     Rcount, [Rdest+Rcount-1]
        and     Rcount, -10H

        ; Start of regular part:
        ; Round up dest to next 16-bytes boundary
        add     Rdest, 10H
        and     Rdest, -10H

        ; -(size of regular part)
        sub     Rdest, Rcount
        jnb     M700                    ; Jump if not negative (no non-temporal store is executed)

        align   16
M600:   ; Loop through regular part
        ; Rcount = end of regular part
        ; Rdest = negative index from the end, counting up to zero
        movntdq [Rcount+Rdest], xmm0
        add     Rdest, 10H
        jnz     M600
        ; Non-temporal stores are weakly ordered. Fence them so that they are
        ; globally visible before any store the caller executes after memset
        ; returns (e.g. a flag that publishes the buffer to another thread).
        sfence

M700:   ; Do the last irregular part
        ; The size of this part is 1 - 16 bytes.
        ; It is faster to always write 16 bytes, possibly overlapping
        ; with the preceding regular part, than to make possibly mispredicted
        ; branches depending on the size of the last part.
        mov     rax, Rdest2             ; dest (also the return value)
        movq    qword [rax+Rcount2-10H], xmm0
        movq    qword [rax+Rcount2-8],   xmm0
        ret
;-----------------------------------------------------------------------
; memsetCPUDispatch - CPU dispatcher, checks which method is fastest.
; Reached through the memsetDispatch pointer with the memset arguments
; still in their argument registers. This part is executed only once:
; it initializes MemsetCacheLimit, stores the address of memsetAVX or
; memsetSSE2 in memsetDispatch, restores the arguments and jumps to the
; selected version.
;-----------------------------------------------------------------------
memsetCPUDispatch:
        push    rbx                     ; callee-saved, used for the chosen pointer
        push    rcx                     ; rcx, rdx, rsi, rdi, r8 hold the memset
        push    rdx                     ; arguments on one platform or the other
        push    rsi
        push    rdi
        push    r8
        ; rsp is 8 mod 16 at function entry (return address of the memset
        ; call) and the six pushes keep it there. Reserve 40 bytes: 32 bytes
        ; of shadow space, which Windows x64 callees are allowed to use, plus
        ; 8 bytes so that rsp is 16-bytes aligned at each call below.
        sub     rsp, 28H
        ; set MemsetCacheLimit to half the size of the largest level cache
        call    GetMemsetCacheLimit@
        lea     rbx, [memsetSSE2@]      ; default choice
        call    Store256BitIsFaster     ; Test if 256-bit read/write is available and faster than 128-bit read/write
        test    eax, eax
        jz      Q100
        lea     rbx, [memsetAVX@]
Q100:
        ; Insert appropriate pointer
        mov     [memsetDispatch], rbx
        mov     rax, rbx
        add     rsp, 28H                ; release the call frame
        pop     r8
        pop     rdi
        pop     rsi
        pop     rdx
        pop     rcx
        pop     rbx
        ; Jump according to the replaced function pointer
        jmp     rax

;-----------------------------------------------------------------------
; extern "C" size_t GetMemsetCacheLimit();
; Data blocks bigger than the returned value will be stored uncached by memset.
; Out:   rax = MemsetCacheLimit
; If MemsetCacheLimit is zero it is first set to half the value returned by
; DataCacheSize(0), or to 4 Mbytes if that half is zero.
;-----------------------------------------------------------------------
GetMemsetCacheLimit:
GetMemsetCacheLimit@:
        mov     rax, [MemsetCacheLimit]
        test    rax, rax
        jnz     U200                    ; limit already known
        ; Get half the size of the largest level cache
%ifdef  WINDOWS
        xor     ecx, ecx                ; 0 means largest level cache
%else
        xor     edi, edi                ; 0 means largest level cache
%endif
        ; rsp is 8 mod 16 here. 40 bytes = 32 bytes shadow space for
        ; Windows x64 + 8 bytes to align rsp to 16 at the call.
        sub     rsp, 28H
        call    DataCacheSize           ; get cache size
        add     rsp, 28H
        ; The limit is a size_t: halve and store all 64 bits of the result
        shr     rax, 1                  ; half the size
        jnz     U100
        mov     eax, 400000H            ; cannot determine cache size. use 4 Mbytes
U100:   mov     [MemsetCacheLimit], rax
U200:   ret

;-----------------------------------------------------------------------
; extern "C" void SetMemsetCacheLimit(size_t limit);
; Change the limit returned by GetMemsetCacheLimit.
; In:    limit = new value of MemsetCacheLimit. 0 means default: the value
;        is recomputed by GetMemsetCacheLimit.
; rax holds the value stored in MemsetCacheLimit on return.
;-----------------------------------------------------------------------
SetMemsetCacheLimit:
%ifdef  WINDOWS
        mov     rax, rcx
%else
        mov     rax, rdi
%endif
        test    rax, rax
        jnz     U400
        ; zero, means default
        mov     [MemsetCacheLimit], rax ; clear, so that the limit is recomputed
        sub     rsp, 8                  ; align rsp to 16 at the call
        call    GetMemsetCacheLimit@
        add     rsp, 8
U400:   mov     [MemsetCacheLimit], rax
        ret
SECTION .data
align 16

; Jump table for the small-count code: entry n holds the address of the
; label that stores exactly n bytes, for n = 0 to 16.
MemsetJTab:
        dq      M00, M01, M02, M03
        dq      M04, M05, M06, M07
        dq      M08, M09, M10, M11
        dq      M12, M13, M14, M15
        dq      M16

; Pointer to the memset version in use, followed by the indirect jump at
; the function entry. It starts out pointing to memsetCPUDispatch, which
; overwrites it with the address of the selected version, so that
; memsetCPUDispatch is only executed once.
memsetDispatch:
        dq      memsetCPUDispatch

; Non-temporal stores, which bypass the cache, are used if
; count > MemsetCacheLimit. Zero means that the limit has not been set yet.
; The optimal value is difficult to estimate, but a reasonable value is
; half the size of the largest cache.
MemsetCacheLimit:
        dq      0