-
Notifications
You must be signed in to change notification settings - Fork 2
/
mmxbfly29.s
155 lines (133 loc) · 3.6 KB
/
mmxbfly29.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/* Intel SIMD MMX implementation of Viterbi ACS butterflies
for 256-state (k=9) convolutional code
Copyright 2004 Phil Karn, KA9Q
This code may be used under the terms of the GNU Lesser General Public License (LGPL)
void update_viterbi29_blk_mmx(struct v29 *vp,unsigned char *syms,int nbits);
*/
# Byte offsets of the fields of struct v29 that this routine touches.
# The structure itself is defined in viterbi29.h; keep these in step with it.
        .equ    DP,512                  # pointer to the decision buffer
        .equ    OLDMETRICS,516          # pointer to the old path metrics
        .equ    NEWMETRICS,520          # pointer to the new path metrics

        .text
        .globl  update_viterbi29_blk_mmx
        .globl  Mettab29_1
        .globl  Mettab29_2
        .type   update_viterbi29_blk_mmx,@function
        .p2align 4
# MMX (64-bit SIMD) version
# requires Pentium-MMX, Pentium-II or better
#-----------------------------------------------------------------------
# void update_viterbi29_blk_mmx(struct v29 *vp,unsigned char *syms,int nbits)
#
# ABI:   32-bit x86, arguments on the stack (read through the ebp frame)
# In:    vp    at  8(%ebp)  decoder state; the DP, OLDMETRICS and
#                           NEWMETRICS fields are read on entry and
#                           written back on exit
#        syms  at 12(%ebp)  received symbols, two bytes consumed per bit
#        nbits at 16(%ebp)  number of bits to process; zero or negative
#                           means nothing is processed
# Out:   nothing is returned in a register
# Per bit: reads 256 bytes of old path metrics, writes 256 bytes of new
#        path metrics and 256 decision bytes (each 0x00 or 0xff), then
#        swaps the two metric pointers and advances the decision pointer.
# Clobb: eax, flags, mm0-mm7 (emms is executed before returning).
#        The syms and nbits stack slots are used as loop state and are
#        modified in place.
# Needs: Mettab29_1 and Mettab29_2 (defined elsewhere), each indexed as
#        128 bytes per symbol value; fifteens and ones (8-byte constants).
#-----------------------------------------------------------------------
update_viterbi29_blk_mmx:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        pushl   %edx
        pushl   %ebx

        movl    8(%ebp),%edx                    # edx = vp
        movl    OLDMETRICS(%edx),%esi           # esi -> old metrics
        movl    NEWMETRICS(%edx),%edi           # edi -> new metrics
        movl    DP(%edx),%edx                   # edx -> decisions

.Lv29_next_bit:
        movl    16(%ebp),%eax                   # eax = nbits
        decl    %eax
        jl      .Lv29_done                      # passed zero, nothing left to do
        movl    %eax,16(%ebp)                   # nbits = nbits - 1

        # Fetch the two symbols for this bit with zero-extending byte
        # loads so that no partial register is written and then read
        # back at full width.
        movl    12(%ebp),%ebx                   # ebx = syms
        movzbl  (%ebx),%eax                     # eax = first symbol
        addl    $2,%ebx
        movl    %ebx,12(%ebp)                   # syms advances by two bytes
        movzbl  -1(%ebx),%ebx                   # ebx = second symbol

        # shift into first array index dimension slot (128 bytes per symbol)
        shll    $7,%eax
        shll    $7,%ebx

        # Each invocation of this macro does 8 butterflies in parallel.
        # GROUP selects which 8 of the 128 butterflies are processed.
        # Uses eax and ebx (table row offsets), esi (old metrics),
        # edi (new metrics), edx (decisions); clobbers mm0-mm7.
        # The instruction order below is the original hand schedule.
        .macro  butterfly GROUP
        # Compute branch metrics: (tab1 + tab2 + 1) >> 1, kept to 4 bits
        movq    (Mettab29_1+8*\GROUP)(%eax),%mm3
        movq    fifteens,%mm0
        paddb   (Mettab29_2+8*\GROUP)(%ebx),%mm3
        paddb   ones,%mm3                       # emulate pavgb - this may not be necessary
        psrlq   $1,%mm3
        pand    %mm0,%mm3                       # drop bits shifted in from the neighbouring byte

        movq    (8*\GROUP)(%esi),%mm6           # Incoming path metric, high bit = 0
        movq    ((8*\GROUP)+128)(%esi),%mm2     # Incoming path metric, high bit = 1
        movq    %mm6,%mm1
        movq    %mm2,%mm7

        paddb   %mm3,%mm6
        paddb   %mm3,%mm2
        pxor    %mm0,%mm3                       # invert branch metric (xor with 15)
        paddb   %mm3,%mm7                       # path metric for inverted symbols
        paddb   %mm3,%mm1

        # live registers 1 2 6 7
        # Compare mm6 and mm7; mm1 and mm2
        pxor    %mm3,%mm3                       # mm3 = 0 for the signed compare
        movq    %mm6,%mm4
        movq    %mm1,%mm5
        psubb   %mm7,%mm4                       # mm4 = mm6 - mm7
        psubb   %mm2,%mm5                       # mm5 = mm1 - mm2
        pcmpgtb %mm3,%mm4                       # mm4 = first set of decisions (ff = mm7 better)
        pcmpgtb %mm3,%mm5                       # mm5 = second set of decisions (ff = mm2 better)

        # live registers 1 2 4 5 6 7
        # select survivors: take mm7 or mm2 where the decision byte is
        # ff, otherwise mm6 or mm1
        movq    %mm4,%mm0
        pand    %mm4,%mm7
        movq    %mm5,%mm3
        pand    %mm5,%mm2
        pandn   %mm6,%mm0
        pandn   %mm1,%mm3
        por     %mm0,%mm7                       # mm7 = first set of survivors
        por     %mm3,%mm2                       # mm2 = second set of survivors

        # live registers 2 4 5 7
        # interleave & store decisions in mm4, mm5
        # interleave & store new path metrics in mm2, mm7
        movq    %mm4,%mm3
        movq    %mm7,%mm0
        punpckhbw %mm5,%mm4                     # decisions, upper 8 outputs
        punpcklbw %mm5,%mm3                     # decisions, lower 8 outputs
        punpcklbw %mm2,%mm7                     # new metrics, lower 8 outputs
        punpckhbw %mm2,%mm0                     # new metrics, upper 8 outputs
        movq    %mm4,(16*\GROUP+8)(%edx)
        movq    %mm3,(16*\GROUP)(%edx)
        movq    %mm7,(16*\GROUP)(%edi)
        movq    %mm0,(16*\GROUP+8)(%edi)
        .endm

        # invoke macro 16 times for a total of 128 butterflies
        butterfly GROUP=0
        butterfly GROUP=1
        butterfly GROUP=2
        butterfly GROUP=3
        butterfly GROUP=4
        butterfly GROUP=5
        butterfly GROUP=6
        butterfly GROUP=7
        butterfly GROUP=8
        butterfly GROUP=9
        butterfly GROUP=10
        butterfly GROUP=11
        butterfly GROUP=12
        butterfly GROUP=13
        butterfly GROUP=14
        butterfly GROUP=15

        addl    $256,%edx                       # bump decision pointer

        # swap metrics: the metrics just written become the old metrics
        # for the next bit
        movl    %esi,%eax
        movl    %edi,%esi
        movl    %eax,%edi
        jmp     .Lv29_next_bit

.Lv29_done:
        emms                                    # leave MMX state before returning
        movl    8(%ebp),%ebx                    # ebx = vp
        # stash metric pointers
        movl    %esi,OLDMETRICS(%ebx)
        movl    %edi,NEWMETRICS(%ebx)
        movl    %edx,DP(%ebx)                   # stash incremented value of vp->dp
        popl    %ebx
        popl    %edx
        popl    %edi
        popl    %esi
        popl    %ebp
        ret
        .size   update_viterbi29_blk_mmx, .-update_viterbi29_blk_mmx
# Constants used by the butterfly macro. They are only ever read, so
# they live in the read-only data section.
        .section .rodata
        .align 8
# Eight bytes of 15: masks the branch metric to 4 bits and, through
# xor, produces the inverted branch metric.
fifteens:
        .byte   15,15,15,15,15,15,15,15
        .align 8
# Eight bytes of 1: rounding term added before the shift that halves
# the summed table entries.
ones:
        .byte   1,1,1,1,1,1,1,1

# Mark this object as not needing an executable stack. Without this
# note GNU ld assumes an executable stack is required for the whole
# link that includes this file.
        .section .note.GNU-stack,"",@progbits