; tensorFlopKernel.asm
bits 64
%include "utility.asm"
section .data
;===- TensorFlop Constants -===;
;==============================;
;# Limitations #
MAX_LAYERS_COUNT equ 10
MAX_NETWORKS_COUNT equ 2
MAX_INDEX_LOSS_HISTORY equ 8000
MAX_SIZE_LOSS_HISTORY equ 64000
MAX_IDEALS_COUNT equ 10
;# Initial constants #
INITIAL_LEAKY_RELU_COEFFICIENT dq 0.1
INITIAL_LEARNING_RATE dq -0.001
INITIAL_CLASSIFICATION_RADIUS_VALUE dq 0.8
INITIAL_VERIFICATION_SQUARE_DXY dq 0.1
;# Constants #
TF_ZERO equ 0
TF_FALSE equ 0
TF_TRUE equ 1
TF_NO_BIASES equ 1
TF_USE_BIASES equ 2
TF_RANDOM equ 3
TF_LOAD_DATA equ 4
TF_LINEAR equ 5
TF_RELU equ 6
TF_LEAKY_RELU equ 7
TF_SIGMOID equ 8
TF_TANH equ 9
TF_SOFTMAX equ 10
TF_NORMALIZE equ 11
TF_CLASSIFICATION_RADIUS equ 12
TF_CLASSIFICATION_CHESSBOARD equ 13
TF_CLASSIFICATION_HALF equ 14
section .bss
;===- TENSORFLOP VARIABLES -===;
;==============================;
; TENSORFLOP HANDLING
tensorflopHandles: resq MAX_NETWORKS_COUNT ; Pointers to the CNN handles
tensorflopHandleCreated : resb 1 ; Number of CNN handles created so far
; CNN-global
CnnLayersCount: resq 1 ; The number of layers in the current CNN
CnnWeightsCount: resq 1 ; The number of weights of the current CNN
CnnNeuronsCount: resq 1 ; The number of neurons of the current CNN
CnnDataInputDim: resq 1 ; Dimension (vector size) of the data entering the CNN
CnnDataOutputDim: resq 1 ; Dimension (vector size) of the data coming out of the CNN
CnnDataInputSize: resq 1
CnnDataOutputSize: resq 1
;Has to be allocated;
CnnActivatedMatrixPointer: resq 1 ; A pseudo matrix of all the layers' activated neurons | Size DOUBLE_SIZE * CnnNeuronsCount
CnnUnactivatedMatrixPointer: resq 1 ; A pseudo matrix of all the layers' unactivated neurons | Size DOUBLE_SIZE * CnnNeuronsCount
CnnWeightsMatrixPointer: resq 1 ; A pseudo matrix of all the weights
CnnBiasesMatrixPointer: resq 1 ; A pseudo matrix of all the biases
CnnCostMatrixPointer: resq 1 ;
CnnBackpropagationWeightsMatrixPointer: resq 1;
CnnBackpropagationBiasesMatrixPointer: resq 1;
CnnActivationFunctions : resq MAX_LAYERS_COUNT ; Array of activation functions of layers created
CnnLayersSizes : resq MAX_LAYERS_COUNT ; Array of sizes of the layers (in neurons)
CnnFirstLayerOffset : resq 1 ; Offset of Neurons needed to propagate (skipping input neurons)
CnnLastLayerOffset : resq 1 ; Offset of Neurons needed to backpropagate (skipping output neurons)
CnnLearningRate: resq 1
CnnEpochs : resq 1
CnnBatchSize : resq 1
CnnStaticInput: resq 1
; CNN-Computation
computeInputLayerSize: resq 1 ; Size of the input layer
computeOutputLayerSize: resq 1 ; Size of the output layer
computeUnchangedOutputLayerSize: resq 1
computeActivationFunction: resq 1 ; Activation function of the layer
computeTempUnactivatedPointer: resq 1
computeOutputNeuronPointer: resq 1 ; Pointer to the neuron, used in dot product to store the result
computeTempOutputPointer: resq 1 ; Points to the result of a single multiplication in a dot product (temporary)
computedFloatPointer: resq 1 ; Commonly used by activation functions, holding both their argument and their result
computedSoftMaxSum: resq 1 ; Summation of e^Xi of an array X, used in softMax
computedTanhExp: resq 1
computedTanhDenominator: resq 1
; CNN-Backpropagation
backproptemp: resq 1
backpropTempFloatPointer: resq 1
backpropTempFloatPointerTwo: resq 1
backpropLayerIndex: resq 1
backpropFormerLayerIndex: resq 1
backpropLayerSize: resq 1
backpropLayerBytesSize: resq 1
backpropFormerLayerSize: resq 1
backpropFormerLayerBytesSize: resq 1
backpropNeuronsLayerByteOffset: resq 1
backpropReverseNeuronsLayerByteOffset: resq 1
backpropReverseNeuronsFormerLayerByteOffset: resq 1
backpropInputLayerNeuronsPointer: resq 1
backpropInputLayerWeightsPointer: resq 1
; CNN-Loss
lossHistory: resq MAX_INDEX_LOSS_HISTORY
lossHistoryIndex: resq 1
; CNN-Ideals / Inputs / Samples
SampleIndex: resq 1
CnnDataInputSample: resq 1
inputSampleOffset: resq 1
inputSampleSize: resq 1
CnnDataOutputSample: resq 1
outputSampleOffset: resq 1
outputSampleSize: resq 1
trainSample: resq 1
validationSample: resq 1
idealsArray: resq MAX_IDEALS_COUNT
idealsIndex: resq 1
section .text
;============================;
;/\- Activation functions -/\;
;============================;
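; Dispatches on [computeActivationFunction] and calls the matching activation routine;
; every dispatched activation operates in place on the qword at [computedFloatPointer].
; An unknown value falls through to selectActivationDone and leaves the value unchanged.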
selectActivationFunction:
; Proceed to check if it's linear
cmp qword [computeActivationFunction], TF_LINEAR
jne selectActivationFunctionNotLinear
call Linear ; call the right function
jmp selectActivationDone
selectActivationFunctionNotLinear:
;============================================;
; Proceed to check if it's ReLU
cmp qword [computeActivationFunction], TF_RELU
jne selectActivationFunctionNotReLU
call ReLU ; call the right function
jmp selectActivationDone
selectActivationFunctionNotReLU:
;============================================;
; Proceed to check if it's LeakyReLU
cmp qword [computeActivationFunction], TF_LEAKY_RELU
jne selectActivationFunctionNotLeakyReLU
call LeakyReLU
jmp selectActivationDone
selectActivationFunctionNotLeakyReLU:
;============================================;
; Proceed to check if it's Sigmoid
cmp qword [computeActivationFunction], TF_SIGMOID
jne selectActivationFunctionNotSigmoid
call Sigmoid
jmp selectActivationDone
selectActivationFunctionNotSigmoid:
;============================================;
; Proceed to check if it's Tanh
cmp qword [computeActivationFunction], TF_TANH
jne selectActivationFunctionNotTanh
call Tanh
jmp selectActivationDone
selectActivationFunctionNotTanh:
selectActivationDone:
ret
Linear:
ret
ReLU:
push rax ; save registers
xor rax, rax ; prepare it
mov byte al, [computedFloatPointer+7] ; get the byte containing the sign bit
test al, 0d128 ; test the sign
jns ReLUDone
mov qword [computedFloatPointer], 0 ; if negative, set the value to 0.0
ReLUDone:
pop rax ; get back registers
ret
LeakyReLU:
push rax ; save registers
xor rax, rax ; prepare it
mov byte al, [computedFloatPointer+7] ; get the byte containing the sign bit
test al, 0d128 ; test the sign
jns LeakyReLUDone
; If the float is negative :
fld qword [computedFloatPointer] ; Load the float
fld qword [INITIAL_LEAKY_RELU_COEFFICIENT] ; Load the coefficient of the leaky ReLU
fmulp ; Multiply both and pop the FPU stack
fstp qword [computedFloatPointer] ; Store it back in float pointer
LeakyReLUDone:
pop rax ; get back registers
ret
Sigmoid:
ret
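; Sigmoid above is left as a stub. Below is a minimal, unreferenced sketch (the label
; SigmoidSketch is hypothetical and not part of the original) showing how
; sigmoid(z) = 1 / (1 + e^(-z)) could be computed in place on [computedFloatPointer],
; following the same conventions as Tanh and reusing the exponential helper below.
SigmoidSketch:
; -z
fld qword [computedFloatPointer]
fchs
fstp qword [computedFloatPointer]
; e^-z
call exponential
; 1 + e^-z
fld1
fld qword [computedFloatPointer]
faddp
; 1 / (1 + e^-z)
fld1
fxch
fdivp
fstp qword [computedFloatPointer]
ret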
Tanh:
; -2z
fld qword [floatMinusTwo]
fld qword [computedFloatPointer]
fmulp
fstp qword [computedFloatPointer]
; e^-2z
call exponential
; 1+e^-2z
fld1
fld qword [computedFloatPointer]
faddp
fstp qword [computedTanhDenominator]
; 1-e^-2z
fld1
fld qword [computedFloatPointer]
fsubp
; 1-e^-2z / 1+e^-2z
fld qword [computedTanhDenominator]
fdivp
fstp qword [computedFloatPointer]
ret
selectDerivativeActivationFunction:
; Proceed to check if it's linear
cmp qword [computeActivationFunction], TF_LINEAR
jne selectDerivativeActivationFunctionNotLinear
call ConstantFunction ; call the right function
jmp selectDerivativeActivationDone
selectDerivativeActivationFunctionNotLinear:
;============================================;
; Proceed to check if it's ReLU
cmp qword [computeActivationFunction], TF_RELU
jne selectDerivativeActivationFunctionNotReLU
call Heaviside ; call the right function
jmp selectDerivativeActivationDone
selectDerivativeActivationFunctionNotReLU:
;============================================;
; Proceed to check if it's LeakyReLU
cmp qword [computeActivationFunction], TF_LEAKY_RELU
jne selectDerivativeActivationFunctionNotLeakyReLU
call LeakyHeaviside
jmp selectDerivativeActivationDone
selectDerivativeActivationFunctionNotLeakyReLU:
;============================================;
; Proceed to check if it's Sigmoid
cmp qword [computeActivationFunction], TF_SIGMOID
jne selectDerivativeActivationFunctionNotSigmoid
call DerivativeSigmoid
jmp selectDerivativeActivationDone
selectDerivativeActivationFunctionNotSigmoid:
;============================================;
; Proceed to check if it's Tanh
cmp qword [computeActivationFunction], TF_TANH
jne selectDerivativeActivationFunctionNotTanh
call DerivativeTanh
jmp selectDerivativeActivationDone
selectDerivativeActivationFunctionNotTanh:
selectDerivativeActivationDone:
ret
ConstantFunction:
ret
Heaviside:
push rax ; save registers
xor rax, rax ; prepare it
mov byte al, [computedFloatPointer+7] ; get the byte containing the sign bit
fld1 ; load one
fstp qword [computedFloatPointer] ; store one in the pointer
test al, 0d128 ; test the sign
jns HeavySideDone
mov qword [computedFloatPointer], 0 ; if negative, set the value to 0
HeavySideDone:
pop rax ; get back registers
ret
LeakyHeaviside:
push rax ; save registers
xor rax, rax ; prepare it
mov byte al, [computedFloatPointer+7] ; get the byte containing the sign bit
fld1 ; load one
fstp qword [computedFloatPointer] ; store +1.0 in float pointer
test al, 0d128 ; test the sign
jns LeakyHeavySideDone
fld qword [INITIAL_LEAKY_RELU_COEFFICIENT] ; if negative, set the value to the leaky ReLU coefficient
fstp qword [computedFloatPointer] ; store back in float pointer
LeakyHeavySideDone:
pop rax ; get back registers
ret
DerivativeSigmoid:
ret
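; DerivativeSigmoid is also a stub; the corresponding derivative would be
; sigmoid(z) * (1 - sigmoid(z)), analogous to DerivativeTanh below.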
DerivativeTanh:
fcomp st0 ; compare st0 with itself and pop it, discarding any leftover FPU value
call Tanh
fld qword [computedFloatPointer]
fld st0
fmulp
fld1
fxch
fsubp
fstp qword [computedFloatPointer]
ret
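; softMax register contract (taken from the pushes below):
; IN rbx = element count, rcx -> source list, rdx -> destination list
; OUT each destination element receives e^Xi / Sum(e^Xk) over the source list.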
softMax:
push rax ; scratch register used to copy values
push rbx ; src/dst list length
push rcx ; src list pointer
push rdx ; dst list pointer
xor rax, rax
fldz ; Load 0.0
fstp qword [computedSoftMaxSum] ; Set the summation as 0.0
push rbx ; save the initial count
push rcx ; save the initial src-list-pointer
push rdx ; save the initial dst-list-pointer
softMaxSummationLOOP:
; Compute the exponential of the element
mov qword rax, [rcx]
mov [computedFloatPointer], rax
call exponential
; Store it in the dst-list, in order to not calculate it again
mov qword rax, [computedFloatPointer]
mov qword [rdx], rax
; Add it to the summation
fld qword [computedFloatPointer]
fld qword [computedSoftMaxSum]
faddp
fstp qword [computedSoftMaxSum]
; Looping stuff
add rcx, DOUBLE_SIZE
add rdx, DOUBLE_SIZE
dec rbx
cmp rbx, 0
jnz softMaxSummationLOOP
pop rdx ; get back initial dst-list-pointer
pop rcx ; get back initial src-list-pointer
pop rbx ; get back the initial count
softMaxDivisionLOOP:
fld qword [rdx] ; Load e^Xi stored
fld qword [computedSoftMaxSum] ; Load Sum(e^Xk)
fdivp ; Divide as e^Xi / Sum(e^Xk) and pop the FPU stack
fstp qword [rdx] ; store in the dst-list-pointer
; Looping stuff
add rdx, DOUBLE_SIZE
dec rbx
cmp rbx, 0
jnz softMaxDivisionLOOP
pop rdx
pop rcx
pop rbx
pop rax
ret
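; Element-wise "tweaked" softmax derivative: computes
; (tweakedSoftMaxRoot + Ai) * (1 + tweakedSoftMaxRoot - Ai) in place on
; [computedFloatPointer]; judging by the comments below, tweakedSoftMaxRoot
; (defined outside this file) is presumably 0.1.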
DerivativeSoftmax:
; 1.1-Ai
fld1
fld qword [tweakedSoftMaxRoot]
faddp
fld qword [computedFloatPointer]
fsubp
fstp qword [backpropTempFloatPointerTwo]
; 0.1 + Ai
fld qword [tweakedSoftMaxRoot]
fld qword [computedFloatPointer]
faddp
; Ai*(1-Ai)
fld qword [backpropTempFloatPointerTwo]
fmulp
fstp qword [computedFloatPointer]
ret
exponential:
fcomp st0 ; compare st0 with itself and pop it, discarding any leftover FPU value
; Let x be the double precision float in [computedFloatPointer] : returns e^x there
; Computes e^x = 2^round(y) * (2^(y - round(y)) - 1 + 1), with y = x*log2(e)
fld qword [computedFloatPointer] ; load in ST0 the float
fldl2e ; load in ST1 log2(e)
fmulp
fld st0 ; load ST0 in ST1
frndint ; round ST0
fxch ; swap
fsub st0, st1 ; get the part -1< <1 in ST0 and the round part in ST1
f2xm1 ; compute 2^(ST0) - 1 and store in ST0
fld1
faddp ; add 1 to get back 2^(x*log2(e)), pop loaded-1
fscale ; multiply by 2^ST1
fstp qword [computedFloatPointer] ; returns e^x
ret
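; getDistance: IN rbx -> pointer to a vector of rdi doubles (presumably a difference vector)
; OUT [computedFloatPointer] = its Euclidean norm (square root of the sum of squares)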
getDistance:
push rbx
push rdi
fldz
fstp qword [computedFloatPointer]
getDistanceDimLOOP:
fld qword [rbx]
fld st0
fmulp
fld qword [computedFloatPointer]
faddp
fstp qword [computedFloatPointer]
add rbx, DOUBLE_SIZE
dec rdi
cmp rdi, 0
jnz getDistanceDimLOOP
fld qword [computedFloatPointer]
fsqrt
fstp qword [computedFloatPointer]
pop rdi
pop rbx
ret
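; integerPower: IN rbx = base, rcx = exponent (expected >= 1)
; OUT rax = rbx^rcx. Note that rdx is clobbered by mul and is not preserved.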
integerPower:
push rbx
push rcx
mov qword rax, 1
integerPowerLOOP:
mul rbx
dec rcx
cmp rcx, 0
jnz integerPowerLOOP
pop rcx
pop rbx
ret
;==========================;
;/\- Utility procedures -/\;
;==========================;
getOffsetFromIndex:
; Converts the index in rax into a byte offset (index * DOUBLE_SIZE), assuming 8-byte double elements
push rdx
xor rdx, rdx
mov rdx, DOUBLE_SIZE
mul rdx
pop rdx
ret
getWorkingCnn:
mov rax, [tensorflopHandles] ; return a pointer to a cnnHandle
ret
getLocalWeightsCount:
; IN RAX -> a layer index (offset * 8)
push rbx
push rcx
push rdx
mov rcx, CnnLayersSizes
add rcx, rax
mov rbx, [rcx] ; store the nth-layer neurons-count in rbx
mov rax, [rcx+DOUBLE_SIZE] ; store the (n+1)th-layer neurons-count in rax
mul rbx ; get the number of weights by multiplying rbx (layer L) and rax (layer L+1) -> rax (rdx unused)
pop rdx
pop rcx
pop rbx
ret
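; getWeightsCount: sums size(L) * size(L+1) over all consecutive layer pairs and
; stores the total number of weights in [CnnWeightsCount].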
getWeightsCount:
xor rcx, rcx ; index of loop
mov rdi, [CnnLayersCount] ; loop bound
dec rdi ; no weights for final layer
xor rax, rax ; mul result and first argument of mul
xor rdx, rdx ; second argument of mul
xor rsi, rsi ; summation stored there
mov rbx, CnnLayersSizes ; pointer to my neurons count
getWeightsCountLoop:
xor rax, rax
xor rdx, rdx
mov rax, [rbx] ; load the neuron count of the nth layer into rax
mov rdx, [rbx+8] ; load the neuron count of the (n+1)th layer into rdx
mul rdx ; multiply and store in rax
add rsi, rax ; add result of the multiplication (weights) in rsi
; Looping stuff
add rbx, DOUBLE_SIZE
inc rcx
cmp rcx, rdi
jne getWeightsCountLoop
mov qword [CnnWeightsCount], rsi ; store the amount of weights in the right variable
ret
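; getNeuronsCount: sums the sizes of all layers and stores the total number of
; neurons in [CnnNeuronsCount].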
getNeuronsCount:
xor rcx, rcx ; index of loop
xor rax, rax
mov rbx, CnnLayersSizes
getNeuronsCountLoop:
xor rdx, rdx
mov rdx, [rbx] ; Load the layer size (in neurons) into rdx
add rax, rdx ; Add it to rax
; Looping stuff
add rbx, DOUBLE_SIZE
inc rcx
cmp rcx, [CnnLayersCount]
jne getNeuronsCountLoop
mov qword [CnnNeuronsCount], rax ; Store rax in the right variable
ret
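; getLayerSize: IN rax = layer index (a negative index counts from the last layer)
; OUT rax = size of that layer in neurons. rdx is clobbered by mul.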
getLayerSize:
push rbx
cmp rax, 0
jge getLayerSizePositive
add rax, [CnnLayersCount]
getLayerSizePositive:
mov rbx, DOUBLE_SIZE
mul rbx
mov rbx, rax
add rbx, CnnLayersSizes
mov rax, [rbx]
pop rbx
ret
getFirstLayerOffset:
mov rax, [CnnLayersSizes] ; store the first layer size in rax
mov qword rbx, DOUBLE_SIZE ; store the data size (db float) in the second operand
mul rbx ; multiply and store in rax
mov qword [CnnFirstLayerOffset], rax ; store the first layer offset in the right variable
ret
getLastLayerOffset:
mov rax, [CnnNeuronsCount] ; Amount of Neurons
sub rax, [CnnDataOutputDim] ; Get the index of last layer
call getOffsetFromIndex ; get the offset in bytes
mov qword [CnnLastLayerOffset], rax
ret
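; getDataInputAndOutputSizes: copies the first and last entries of CnnLayersSizes
; into [CnnDataInputDim] and [CnnDataOutputDim]. rax, rbx and rdx are clobbered.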
getDataInputAndOutputSizes:
mov rax, [CnnLayersSizes]
mov [CnnDataInputDim], rax
mov rax, [CnnLayersCount]
dec rax
mov rdx, DOUBLE_SIZE
mul rdx
add rax, CnnLayersSizes
mov rbx, [rax]
mov [CnnDataOutputDim], rbx
ret
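; getNeuronsLayerByteOffset: IN rax = layer index (negative values count from the end)
; OUT rax = byte offset of that layer's first neuron in the neuron matrices,
; i.e. the sum of the preceding layer sizes times DOUBLE_SIZE.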
getNeuronsLayerByteOffset:
push rdx
; Case layer -n is asked
cmp rax, 0
jge getNeuronsLayerByteOffsetPositive
; If the layer is negative, just add the layer count
add rax, [CnnLayersCount]
getNeuronsLayerByteOffsetPositive:
dec rax
; Get the real layer offset in bytes (x8)
mov rdx, DOUBLE_SIZE
mul rdx
xor rdx, rdx ; store here the neurons count
getNeuronsLayerByteOffsetLOOP:
add rdx, [CnnLayersSizes+rax] ; add the layer's neurons
sub rax, DOUBLE_SIZE
cmp rax, 0
jge getNeuronsLayerByteOffsetLOOP
mov rax, DOUBLE_SIZE
mul rdx
pop rdx
ret
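; getReverseNeuronsLayerByteOffset: IN rax = layer index (negative values count from the end)
; OUT rax = DOUBLE_SIZE times the total neuron count of the layers after layer rax,
; i.e. the neuron byte offset measured from the end of the matrix.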
getReverseNeuronsLayerByteOffset:
push rdx
push rdi
; Case layer -n is asked
cmp rax, 0
jge getReverseNeuronsLayerByteOffsetPositive
; If the layer is negative, just add the layer count
add rax, [CnnLayersCount]
getReverseNeuronsLayerByteOffsetPositive:
inc rax
; Get the real layer offset in bytes (x8)
push rax
mov rax, DOUBLE_SIZE
mov rdi, [CnnLayersCount]
mul rdi
mov rdi, rax
pop rax
mov rdx, DOUBLE_SIZE
mul rdx
xor rdx, rdx ; store here the neurons count
getReverseNeuronsLayerByteOffsetLOOP:
add rdx, [CnnLayersSizes+rax] ; add the layer's neurons
add rax, DOUBLE_SIZE
cmp rax, rdi
jl getReverseNeuronsLayerByteOffsetLOOP
mov rax, DOUBLE_SIZE
mul rdx
pop rdi
pop rdx
ret
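; getWeightsLayerByteOffset: IN rax = layer index (negative values count from the end)
; OUT rax = DOUBLE_SIZE times the number of weights in the layer pairs before layer rax,
; i.e. the byte offset of the weight block leaving layer rax.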
getWeightsLayerByteOffset:
push rbx
push rcx
push rdx
; Case layer -n is asked
cmp rax, 0
jge getWeightsLayerByteOffsetPositive
; If the layer is negative, just add the layer count
add rax, [CnnLayersCount]
dec rax
getWeightsLayerByteOffsetPositive:
; Get the real layer offset in bytes (x8)
mov rdx, DOUBLE_SIZE
mul rdx
mov rcx, rax
xor rbx, rbx
getWeightsLayerByteOffsetLOOP:
xor rdx, rdx
xor rax, rax
mov rax, [CnnLayersSizes+rcx] ; move the layer's neurons
sub rcx, DOUBLE_SIZE
mov rdx, [CnnLayersSizes+rcx] ; load the previous layer's neuron count
mul rdx
add rbx, rax
cmp rcx, 0
jg getWeightsLayerByteOffsetLOOP
mov rdx, rbx
mov rax, DOUBLE_SIZE
mul rdx
pop rdx
pop rcx
pop rbx
ret
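; getReverseWeightsLayerByteOffset: IN rax = layer index
; OUT rax = total weight byte size (DOUBLE_SIZE * [CnnWeightsCount]) minus
; getWeightsLayerByteOffset for that layer, i.e. the weight byte offset measured
; from the end of the matrix.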
getReverseWeightsLayerByteOffset:
push rdx
push rax
mov qword rdx, [CnnWeightsCount]
mov rax, DOUBLE_SIZE
mul rdx
mov rdx, rax
pop rax
call getWeightsLayerByteOffset
sub rdx, rax
mov rax, rdx
pop rdx
ret