; (web-page navigation text and line-number gutter removed; assembly source follows)
;************************* unalignedisfaster32.asm ******************************
; Author: Agner Fog
; Date created: 2011-07-09
; Last modified: 2013-08-30
; Source URL: www.agner.org/optimize
; Project: asmlib.zip
; Language: assembly, NASM/YASM syntax, 32 bit
;
; C++ prototype:
; extern "C" int UnalignedIsFaster(void);
;
; Description:
; This function finds out whether an unaligned 16-byte memory read is
; faster than an aligned read followed by an alignment shift (PALIGNR) on
; the current CPU.
;
; Return value:
; 0: Unaligned read is probably slower than alignment shift
; 1: Unknown
; 2: Unaligned read is probably faster than alignment shift
;
;
; C++ prototype:
; extern "C" int Store256BitIsFaster(void);
;
; Description:
; This function finds out whether a 32-byte memory write is
; faster than two 16-byte writes on the current CPU.
;
; Return value:
; 0: 32-bytes memory write is slower or AVX not supported
; 1: Unknown
; 2: 32-bytes memory write is faster
;
; Copyright (c) 2011 - 2013 GNU General Public License www.gnu.org/licenses
;******************************************************************************
;
; C++ prototype:
; extern "C" int UnalignedIsFaster(void);
; Exported entry points. The ": function" suffix is the NASM/YASM ELF
; extension that marks the symbol type as a function.
global _UnalignedIsFaster: function
global _Store256BitIsFaster: function
; Helpers defined in other modules:
;   _CpuType        - called below with three pointers (&vendor, &family,
;                     &model) to dword stack slots that it fills in
;   _InstructionSet - called below with no arguments; its result in eax is
;                     compared against 11 (AVX supported)
extern _CpuType
extern _InstructionSet
; All code in this file goes into the code section.
SECTION .text
;-----------------------------------------------------------------------
; extern "C" int UnalignedIsFaster(void);
;
; Decides from the CPU vendor/family/model whether an unaligned 16-byte
; read is likely to beat an aligned read followed by PALIGNR.
;
; ABI:   32-bit, arguments on the stack, caller cleans up, result in eax
; In:    nothing
; Out:   eax = 0  unaligned read is probably slower than alignment shift
;              1  unknown
;              2  unaligned read is probably faster than alignment shift
; Calls: _CpuType(&vendor, &family, &model)
; Regs:  ebx is saved and restored; eax, ecx, edx and flags are overwritten
;-----------------------------------------------------------------------
_UnalignedIsFaster:
        push    ebx                     ; ebx is used for the vendor code below
        push    0                       ; vendor slot
        push    0                       ; family slot
        push    0                       ; model slot
        mov     eax, esp
        push    eax                     ; arg 3: &model
        add     eax, 4
        push    eax                     ; arg 2: &family
        add     eax, 4
        push    eax                     ; arg 1: &vendor
        call    _CpuType                ; get vendor, family, model
        add     esp, 12                 ; drop the three pointer arguments
        pop     edx                     ; edx = model
        pop     ecx                     ; ecx = family
        pop     ebx                     ; ebx = vendor (1 = Intel, 2 = AMD, 3 = VIA)

        xor     eax, eax                ; default return value: 0
        dec     ebx
        jz      .intel
        dec     ebx
        jz      .amd
        dec     ebx
        jz      .via
        jmp     .unknown                ; unknown vendor

.intel: ; Unaligned read is faster on Intel Nehalem and later, but not Atom
        ; Nehalem  = family 6, model 1AH
        ; Atom     = family 6, model 1CH
        ; Netburst = family 0FH
        ; Future models are likely to be family 6, maybe > 6, model > 1C
        cmp     ecx, 0FH
        je      .done                   ; old Netburst architecture
        cmp     ecx, 6
        jb      .done                   ; old Pentium 1, etc.
        ; The model thresholds below are only meaningful within family 6.
        ; A family beyond 6 (other than Netburst) is a future design whose
        ; model numbering may restart low, so it must not be mistaken for a
        ; pre-Nehalem or Atom part. Flags are still those of "cmp ecx, 6".
        ja      .faster
        cmp     edx, 1AH
        jb      .done                   ; family 6, earlier than Nehalem
        cmp     edx, 1CH
        je      .done                   ; Intel Atom
        jmp     .faster                 ; Nehalem and later, except Atom

.amd:   ; AMD processors:
        ; The PALIGNR instruction is slow on AMD Bobcat but fast on Jaguar
        ; K10/Opteron = family 10H      ; Use unaligned
        ; Bobcat      = family 14H      ; PALIGNR is very slow. Use unaligned
        ; Piledriver  = family 15H      ; Use unaligned
        ; Jaguar      = family 16H      ; PALIGNR is fast. Use aligned
        ;                                 (aligned is faster in most cases, but not all)
        cmp     ecx, 10H
        jb      .done                   ; AMD K8 or earlier: use aligned
        cmp     ecx, 16H
        je      .done                   ; Jaguar: use aligned
        ; AMD K10 or later: use unaligned (falls through)

.faster:
        mov     eax, 2                  ; return 2
        jmp     .done

.via:   ; Unaligned read is not faster than PALIGNR on VIA Nano 2000 and 3000
        cmp     ecx, 0FH
        jna     .done                   ; VIA Nano: return 0
        ; Future VIA versions: unknown (falls through)

.unknown:
        inc     eax                     ; eax was 0 here: return 1

.done:  pop     ebx
        ret
;_UnalignedIsFaster ENDP
;-----------------------------------------------------------------------
; extern "C" int Store256BitIsFaster(void);
;
; Decides from the instruction-set level and the CPU vendor/family/model
; whether one 32-byte store is likely to beat two 16-byte stores.
;
; ABI:   32-bit, arguments on the stack, caller cleans up, result in eax
; In:    nothing
; Out:   eax = 0  32-byte write is slower, or AVX is not supported
;              1  unknown
;              2  32-byte write is faster
; Calls: _InstructionSet(), _CpuType(&vendor, &family, &model)
; Regs:  eax, ecx, edx and flags are overwritten
;-----------------------------------------------------------------------
_Store256BitIsFaster:
        call    _InstructionSet
        cmp     eax, 11                 ; 11 = AVX supported
        jb      .slower                 ; no AVX: return 0

        ; Reserve three zero-initialised dword slots for the results.
        push    dword 0                 ; [edx + 8] vendor
        push    dword 0                 ; [edx + 4] family
        push    dword 0                 ; [edx]     model
        mov     edx, esp                ; edx -> model slot

        ; Arguments are pushed last to first.
        push    edx                     ; arg 3: &model
        lea     eax, [edx + 4]
        push    eax                     ; arg 2: &family
        lea     eax, [edx + 8]
        push    eax                     ; arg 1: &vendor
        call    _CpuType                ; get vendor, family, model

        ; Fetch the results, then release arguments and slots in one step.
        mov     edx, [esp + 12]         ; edx = model
        mov     ecx, [esp + 16]         ; ecx = family
        mov     eax, [esp + 20]         ; eax = vendor
        add     esp, 24

        ; Vendor dispatch: 1 = Intel, 2 = AMD. Everything else, VIA (3)
        ; included, is reported as unknown.
        dec     eax
        jz      .intel
        dec     eax
        jz      .amd
        jmp     .unknown

.intel: ; Family 6 models:
        ;   2AH Sandy Bridge, 3AH Ivy Bridge, 3CH Haswell
        ; Sandy Bridge and Ivy Bridge are slightly faster with 128-bit than
        ; with 256-bit moves on large data blocks; Haswell is much faster
        ; with 256-bit moves.
        cmp     ecx, 6
        jne     .faster                 ; unknown family, possibly a future model
        cmp     edx, 3AH
        ja      .faster                 ; later than Ivy Bridge
        jmp     .slower                 ; Ivy Bridge or earlier

.amd:   ; Family 15H = Bulldozer, Piledriver:
        ;   model 1 = Bulldozer is a little slower on 256-bit writes
        ;   model 2 = Piledriver is terribly slow on 256-bit writes
        ;   future models 3-4 are assumed to be like Bulldozer
        cmp     ecx, 15H
        ja      .faster                 ; assume future AMD families are faster
        cmp     edx, 4
        ja      .unknown                ; later models: don't know
        ; models 0-4 fall through: return 0

.slower:
        xor     eax, eax                ; return 0
        ret

.unknown:
        mov     eax, 1                  ; return 1
        ret

.faster:
        mov     eax, 2                  ; return 2
        ret
; _Store256BitIsFaster ENDP