From 0b50e7202fd73106fcfe8ad50b64e2eb017600b4 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Tue, 10 May 2022 09:45:12 +0200 Subject: [PATCH] Use different FP registers, fix iNTT range (#234) * different fp registers, fix iNTT range * Remove redundant packing of signature component 'z' * benchmarks Co-authored-by: Alexandre Adomnicai Co-authored-by: Matthias J. Kannwischer --- benchmarks.csv | 66 +++++++++---------- benchmarks.md | 66 +++++++++---------- .../kyber1024-90s/m4fstack/fastinvntt.S | 2 +- crypto_kem/kyber1024/m4fstack/fastinvntt.S | 2 +- crypto_kem/kyber512-90s/m4fstack/fastinvntt.S | 2 +- crypto_kem/kyber512/m4fstack/fastinvntt.S | 2 +- crypto_kem/kyber768-90s/m4fspeed/matacc_asm.S | 62 ++++++++--------- crypto_kem/kyber768-90s/m4fstack/fastinvntt.S | 2 +- crypto_kem/kyber768-90s/m4fstack/matacc_asm.S | 14 ++-- crypto_kem/kyber768/m4fspeed/fastinvntt.S | 49 +++++++------- crypto_kem/kyber768/m4fspeed/fastntt.S | 40 +++++------ crypto_kem/kyber768/m4fspeed/matacc.i | 32 ++++----- crypto_kem/kyber768/m4fspeed/matacc_asm.S | 40 +++++------ crypto_kem/kyber768/m4fstack/matacc.i | 2 +- crypto_kem/kyber768/m4fstack/matacc_asm.S | 4 +- crypto_sign/dilithium2/m4f/ntt.S | 32 ++++----- crypto_sign/dilithium2/m4f/sign.c | 1 - crypto_sign/dilithium3/m4f/smallntt.S | 42 ++++++------ 18 files changed, 232 insertions(+), 228 deletions(-) diff --git a/benchmarks.csv b/benchmarks.csv index e12398b5..45942ed1 100644 --- a/benchmarks.csv +++ b/benchmarks.csv @@ -11,23 +11,23 @@ frodokem640shake (100 executions),m4,77984424,77835411,77989050,78893964,7889393 frodokem640shake (100 executions),opt,90654791,90654775,90654814,104138444,104138419,104138534,103834288,103834251,103834338 hqc-rmrs-128 (100 executions),clean,2967262,2924920,3153007,5415022,5325331,5590378,7845462,7713583,8205411 kyber1024 (100 executions),clean,1637602,1635186,1649476,2010657,2008239,2022536,2148861,2146442,2160739 -kyber1024 (100 executions),m4fspeed,1146506,1144173,1170735,1334375,1332046,1358608,1237498,1235169,1261731 -kyber1024 (100 executions),m4fstack,1154566,1152611,1166824,1346343,1344384,1358597,1251026,1249067,1263280 +kyber1024 (100 executions),m4fspeed,1147546,1144075,1171003,1335403,1331934,1358861,1238542,1235073,1262000 +kyber1024 (100 executions),m4fstack,1155599,1152703,1178851,1347359,1344461,1370611,1252058,1249160,1275310 kyber1024-90s (100 executions),clean,3002358,3000594,3017386,3278118,3276347,3293172,3511420,3509649,3526441 kyber1024-90s (100 executions),m4fspeed,995987,988645,1000850,1088634,1081292,1093497,1086861,1079552,1091724 kyber1024-90s (100 executions),m4fstack,1006908,999573,1009607,1103436,1096101,1106135,1103215,1095880,1105914 kyber512 (100 executions),clean,631058,630445,643862,842478,841864,855281,936458,935844,949261 -kyber512 (100 executions),m4fspeed,441489,440447,454135,540354,539311,553000,491034,489992,503681 -kyber512 (100 executions),m4fstack,441712,441068,454552,542251,541606,555090,493434,492790,506274 +kyber512 (100 executions),m4fspeed,440960,440407,441438,539817,539265,540330,490506,489954,490985 +kyber512 (100 executions),m4fstack,441893,441124,454585,542424,541653,555082,493616,492845,506274 kyber512-90s (100 executions),clean,944546,944075,944961,1114167,1113662,1114581,1259241,1258736,1259655 -kyber512-90s (100 executions),m4fspeed,376946,372781,381569,434013,429848,438636,435784,431618,440406 -kyber512-90s (100 executions),m4fstack,377943,370186,378865,436682,428925,437604,438958,431201,439880 +kyber512-90s (100 executions),m4fspeed,376872,372758,381423,433931,429818,438482,435709,431596,440260 +kyber512-90s (100 executions),m4fstack,378094,373957,382653,436828,432691,441386,439111,434974,443669 kyber768 (100 executions),clean,1051021,1049570,1063275,1349400,1347982,1361620,1463791,1462338,1476010 -kyber768 (100 executions),m4fspeed,723575,722237,736565,877501,876197,890491,803740,802401,816729 -kyber768 (100 executions),m4fstack,728304,725897,740275,884925,882518,896896,812180,809773,824151 +kyber768 (100 executions),m4fspeed,724775,722419,748971,878689,876333,902884,804940,802583,829134 +kyber768 (100 executions),m4fstack,727958,725958,752786,884567,882566,909395,811833,809833,836662 kyber768-90s (100 executions),clean,1811726,1810670,1827045,2034922,2033865,2050240,2222406,2221348,2237723 -kyber768-90s (100 executions),m4fspeed,632398,621252,638272,711096,699950,717003,710430,699284,716304 -kyber768-90s (100 executions),m4fstack,638129,630786,643556,719513,712171,724940,719862,712520,725289 +kyber768-90s (100 executions),m4fspeed,632999,628813,638215,711686,707500,716901,711031,706845,716246 +kyber768-90s (100 executions),m4fstack,638316,630857,643504,719690,712230,724877,720051,712592,725239 lightsaber (100 executions),clean,1023162,1023162,1023163,1494462,1494456,1494495,1805694,1805691,1805725 lightsaber (100 executions),m4fspeed,352196,352196,352229,481006,481005,481038,452654,452653,452687 lightsaber (100 executions),m4fstack,422434,422434,422467,591556,591555,591588,581073,581072,581106 @@ -77,12 +77,12 @@ sntrup953 (100 executions),m4f,12761557,12761535,12761571,943350,943350,943350,7 Signature Schemes,,,,,,,,,, Scheme,Implementation,Key Generation [cycles] (mean),Key Generation [cycles] (min),Key Generation [cycles] (max),Sign [cycles] (mean),Sign [cycles] (min),Sign [cycles] (max),Verify [cycles] (mean),Verify [cycles] (min),Verify [cycles] (max) dilithium2 (100 executions),clean,1976311,1934124,2022613,7465108,3241343,29601126,2109292,2108823,2109692 -dilithium2 (10000 executions),m4f,1597282,1543011,1644501,4119336,1983077,34081046,1572328,1571561,1572863 +dilithium2 (10000 executions),m4f,1597200,1542979,1644602,4095865,1973993,23877766,1572329,1571611,1572899 dilithium2aes (100 executions),clean,5153665,5109045,5227715,12016668,6375642,28738015,4824282,4779372,4898600 dilithium3 (100 executions),clean,3414513,3413395,3416061,11722059,5037981,36169675,3499388,3498955,3499805 -dilithium3 (10000 executions),m4f,2829260,2827405,2842880,6652990,3235358,42043815,2691471,2690861,2691949 +dilithium3 (10000 executions),m4f,2829250,2827244,2831235,6610160,3222476,33943453,2691469,2690961,2691943 dilithium3aes (100 executions),clean,9258325,9166749,9369734,19417325,10745071,60023085,8581938,8491758,8694807 -dilithium5 (10000 executions),m4f,4826132,4737167,4901952,8817385,5433369,40315104,4705982,4705308,4706614 +dilithium5 (10000 executions),m4f,4826293,4737193,4901769,8767067,5413471,45368141,4705981,4705302,4706551 falcon-1024 (100 executions),clean,582455197,338850289,1754663445,133655078,133335905,133985773,1526901,1526233,1527648 falcon-1024 (100 executions),m4-ct,458300837,273960881,1558842038,85160712,84941964,85410952,977811,966969,985555 falcon-1024 (100 executions),opt-ct,445577914,273960881,1180316927,85152483,84871257,85396462,978443,966990,985220 @@ -214,11 +214,11 @@ Signature Schemes,,,,,,,,,, Scheme,Implementation,Key Generation [bytes],Sign [bytes],Verify [bytes],,,,,, dilithium2,clean,38284,51908,36196,,,,,, dilithium2aes,clean,39764,53388,37676,,,,,, -dilithium2,m4f,38276,49356,36296,,,,,, +dilithium2,m4f,38276,49356,36188,,,,,, dilithium3,clean,60812,79664,57700,,,,,, dilithium3aes,clean,62292,81036,59180,,,,,, dilithium3,m4f,60804,68804,57692,,,,,, -dilithium5,m4f,97776,116016,92872,,,,,, +dilithium5,m4f,97668,115908,92764,,,,,, falcon-1024,clean,36264,82428,8796,,,,,, falcon-1024,m4-ct,1488,2568,496,,,,,, falcon-1024,opt-ct,1448,2568,388,,,,,, @@ -292,13 +292,13 @@ kyber512,clean,54.7,51.7,37.1,,,,,, kyber512,m4fspeed,78.1,80.5,70.6,,,,,, kyber512,m4fstack,78.0,80.2,70.2,,,,,, kyber512-90s,clean,68.8,62.8,52.7,,,,,, -kyber512-90s,m4fspeed,71.0,72.8,64.2,,,,,, +kyber512-90s,m4fspeed,71.1,72.8,64.2,,,,,, kyber512-90s,m4fstack,70.8,72.3,63.7,,,,,, kyber768,clean,52.8,52.4,39.7,,,,,, kyber768,m4fspeed,76.5,80.3,72.0,,,,,, kyber768,m4fstack,76.1,79.8,71.4,,,,,, kyber768-90s,clean,71.5,67.4,59.5,,,,,, -kyber768-90s,m4fspeed,68.8,71.8,64.8,,,,,, +kyber768-90s,m4fspeed,68.9,71.8,64.8,,,,,, kyber768-90s,m4fstack,68.3,71.1,64.0,,,,,, lightsaber,clean,24.9,23.8,15.5,,,,,, lightsaber,m4fspeed,72.3,74.0,61.9,,,,,, @@ -350,11 +350,11 @@ Signature Schemes,,,,,,,,,, Scheme,Implementation,Key Generation [%],Sign [%],Verify [%],,,,,, dilithium2,clean,67.1,37.3,60.1,,,,,, dilithium2aes,clean,2.7,3.3,5.0,,,,,, -dilithium2,m4f,82.8,65.2,80.4,,,,,, +dilithium2,m4f,82.8,66.6,80.4,,,,,, dilithium3,clean,70.5,37.0,63.6,,,,,, dilithium3aes,clean,2.2,2.5,3.5,,,,,, -dilithium3,m4f,85.0,64.4,82.6,,,,,, -dilithium5,m4f,86.0,69.1,84.5,,,,,, +dilithium3,m4f,85.0,64.7,82.6,,,,,, +dilithium5,m4f,86.0,69.7,84.5,,,,,, falcon-1024,clean,8.4,0.3,26.9,,,,,, falcon-1024,m4-ct,10.6,0.5,34.2,,,,,, falcon-1024,opt-ct,11.1,0.5,34.3,,,,,, @@ -419,23 +419,23 @@ frodokem640shake,m4,8644,0,0,8644,,,,, frodokem640shake,opt,6796,0,0,6796,,,,, hqc-rmrs-128,clean,18436,0,0,18436,,,,, kyber1024,clean,6084,0,0,6084,,,,, -kyber1024,m4fspeed,18324,0,0,18324,,,,, -kyber1024,m4fstack,15452,0,0,15452,,,,, +kyber1024,m4fspeed,18332,0,0,18332,,,,, +kyber1024,m4fstack,15460,0,0,15460,,,,, kyber1024-90s,clean,6468,0,0,6468,,,,, kyber1024-90s,m4fspeed,18832,0,0,18832,,,,, kyber1024-90s,m4fstack,15744,0,0,15744,,,,, kyber512,clean,4900,0,0,4900,,,,, -kyber512,m4fspeed,17220,0,0,17220,,,,, -kyber512,m4fstack,14624,0,0,14624,,,,, +kyber512,m4fspeed,17228,0,0,17228,,,,, +kyber512,m4fstack,14632,0,0,14632,,,,, kyber512-90s,clean,5144,0,0,5144,,,,, -kyber512-90s,m4fspeed,17728,0,0,17728,,,,, -kyber512-90s,m4fstack,14900,0,0,14900,,,,, +kyber512-90s,m4fspeed,17736,0,0,17736,,,,, +kyber512-90s,m4fstack,14908,0,0,14908,,,,, kyber768,clean,4912,0,0,4912,,,,, -kyber768,m4fspeed,17532,0,0,17532,,,,, -kyber768,m4fstack,14528,0,0,14528,,,,, +kyber768,m4fspeed,17540,0,0,17540,,,,, +kyber768,m4fstack,14536,0,0,14536,,,,, kyber768-90s,clean,5176,0,0,5176,,,,, -kyber768-90s,m4fspeed,18064,0,0,18064,,,,, -kyber768-90s,m4fstack,14784,0,0,14784,,,,, +kyber768-90s,m4fspeed,18072,0,0,18072,,,,, +kyber768-90s,m4fstack,14792,0,0,14792,,,,, lightsaber,clean,10368,0,0,10368,,,,, lightsaber,m4fspeed,18900,0,0,18900,,,,, lightsaber,m4fstack,19712,0,0,19712,,,,, @@ -486,11 +486,11 @@ Signature Schemes,,,,,,,,,, Scheme,Implementation,.text [bytes],.data [bytes],.bss [bytes],Total [bytes],,,,, dilithium2,clean,7948,0,0,7948,,,,, dilithium2aes,clean,14982,0,0,14982,,,,, -dilithium2,m4f,18440,0,0,18440,,,,, +dilithium2,m4f,18424,0,0,18424,,,,, dilithium3,clean,7444,0,0,7444,,,,, dilithium3aes,clean,14470,0,0,14470,,,,, -dilithium3,m4f,19912,0,0,19912,,,,, -dilithium5,m4f,18236,0,0,18236,,,,, +dilithium3,m4f,19904,0,0,19904,,,,, +dilithium5,m4f,18220,0,0,18220,,,,, falcon-1024,clean,82285,0,0,82285,,,,, falcon-1024,m4-ct,81265,0,79872,161137,,,,, falcon-1024,opt-ct,81265,0,79872,161137,,,,, diff --git a/benchmarks.md b/benchmarks.md index 940ff5b4..78378f41 100644 --- a/benchmarks.md +++ b/benchmarks.md @@ -12,23 +12,23 @@ | frodokem640shake (100 executions) | opt | AVG: 90,654,791
MIN: 90,654,775
MAX: 90,654,814 | AVG: 104,138,444
MIN: 104,138,419
MAX: 104,138,534 | AVG: 103,834,288
MIN: 103,834,251
MAX: 103,834,338 | | hqc-rmrs-128 (100 executions) | clean | AVG: 2,967,262
MIN: 2,924,920
MAX: 3,153,007 | AVG: 5,415,022
MIN: 5,325,331
MAX: 5,590,378 | AVG: 7,845,462
MIN: 7,713,583
MAX: 8,205,411 | | kyber1024 (100 executions) | clean | AVG: 1,637,602
MIN: 1,635,186
MAX: 1,649,476 | AVG: 2,010,657
MIN: 2,008,239
MAX: 2,022,536 | AVG: 2,148,861
MIN: 2,146,442
MAX: 2,160,739 | -| kyber1024 (100 executions) | m4fspeed | AVG: 1,146,506
MIN: 1,144,173
MAX: 1,170,735 | AVG: 1,334,375
MIN: 1,332,046
MAX: 1,358,608 | AVG: 1,237,498
MIN: 1,235,169
MAX: 1,261,731 | -| kyber1024 (100 executions) | m4fstack | AVG: 1,154,566
MIN: 1,152,611
MAX: 1,166,824 | AVG: 1,346,343
MIN: 1,344,384
MAX: 1,358,597 | AVG: 1,251,026
MIN: 1,249,067
MAX: 1,263,280 | +| kyber1024 (100 executions) | m4fspeed | AVG: 1,147,546
MIN: 1,144,075
MAX: 1,171,003 | AVG: 1,335,403
MIN: 1,331,934
MAX: 1,358,861 | AVG: 1,238,542
MIN: 1,235,073
MAX: 1,262,000 | +| kyber1024 (100 executions) | m4fstack | AVG: 1,155,599
MIN: 1,152,703
MAX: 1,178,851 | AVG: 1,347,359
MIN: 1,344,461
MAX: 1,370,611 | AVG: 1,252,058
MIN: 1,249,160
MAX: 1,275,310 | | kyber1024-90s (100 executions) | clean | AVG: 3,002,358
MIN: 3,000,594
MAX: 3,017,386 | AVG: 3,278,118
MIN: 3,276,347
MAX: 3,293,172 | AVG: 3,511,420
MIN: 3,509,649
MAX: 3,526,441 | | kyber1024-90s (100 executions) | m4fspeed | AVG: 995,987
MIN: 988,645
MAX: 1,000,850 | AVG: 1,088,634
MIN: 1,081,292
MAX: 1,093,497 | AVG: 1,086,861
MIN: 1,079,552
MAX: 1,091,724 | | kyber1024-90s (100 executions) | m4fstack | AVG: 1,006,908
MIN: 999,573
MAX: 1,009,607 | AVG: 1,103,436
MIN: 1,096,101
MAX: 1,106,135 | AVG: 1,103,215
MIN: 1,095,880
MAX: 1,105,914 | | kyber512 (100 executions) | clean | AVG: 631,058
MIN: 630,445
MAX: 643,862 | AVG: 842,478
MIN: 841,864
MAX: 855,281 | AVG: 936,458
MIN: 935,844
MAX: 949,261 | -| kyber512 (100 executions) | m4fspeed | AVG: 441,489
MIN: 440,447
MAX: 454,135 | AVG: 540,354
MIN: 539,311
MAX: 553,000 | AVG: 491,034
MIN: 489,992
MAX: 503,681 | -| kyber512 (100 executions) | m4fstack | AVG: 441,712
MIN: 441,068
MAX: 454,552 | AVG: 542,251
MIN: 541,606
MAX: 555,090 | AVG: 493,434
MIN: 492,790
MAX: 506,274 | +| kyber512 (100 executions) | m4fspeed | AVG: 440,960
MIN: 440,407
MAX: 441,438 | AVG: 539,817
MIN: 539,265
MAX: 540,330 | AVG: 490,506
MIN: 489,954
MAX: 490,985 | +| kyber512 (100 executions) | m4fstack | AVG: 441,893
MIN: 441,124
MAX: 454,585 | AVG: 542,424
MIN: 541,653
MAX: 555,082 | AVG: 493,616
MIN: 492,845
MAX: 506,274 | | kyber512-90s (100 executions) | clean | AVG: 944,546
MIN: 944,075
MAX: 944,961 | AVG: 1,114,167
MIN: 1,113,662
MAX: 1,114,581 | AVG: 1,259,241
MIN: 1,258,736
MAX: 1,259,655 | -| kyber512-90s (100 executions) | m4fspeed | AVG: 376,946
MIN: 372,781
MAX: 381,569 | AVG: 434,013
MIN: 429,848
MAX: 438,636 | AVG: 435,784
MIN: 431,618
MAX: 440,406 | -| kyber512-90s (100 executions) | m4fstack | AVG: 377,943
MIN: 370,186
MAX: 378,865 | AVG: 436,682
MIN: 428,925
MAX: 437,604 | AVG: 438,958
MIN: 431,201
MAX: 439,880 | +| kyber512-90s (100 executions) | m4fspeed | AVG: 376,872
MIN: 372,758
MAX: 381,423 | AVG: 433,931
MIN: 429,818
MAX: 438,482 | AVG: 435,709
MIN: 431,596
MAX: 440,260 | +| kyber512-90s (100 executions) | m4fstack | AVG: 378,094
MIN: 373,957
MAX: 382,653 | AVG: 436,828
MIN: 432,691
MAX: 441,386 | AVG: 439,111
MIN: 434,974
MAX: 443,669 | | kyber768 (100 executions) | clean | AVG: 1,051,021
MIN: 1,049,570
MAX: 1,063,275 | AVG: 1,349,400
MIN: 1,347,982
MAX: 1,361,620 | AVG: 1,463,791
MIN: 1,462,338
MAX: 1,476,010 | -| kyber768 (100 executions) | m4fspeed | AVG: 723,575
MIN: 722,237
MAX: 736,565 | AVG: 877,501
MIN: 876,197
MAX: 890,491 | AVG: 803,740
MIN: 802,401
MAX: 816,729 | -| kyber768 (100 executions) | m4fstack | AVG: 728,304
MIN: 725,897
MAX: 740,275 | AVG: 884,925
MIN: 882,518
MAX: 896,896 | AVG: 812,180
MIN: 809,773
MAX: 824,151 | +| kyber768 (100 executions) | m4fspeed | AVG: 724,775
MIN: 722,419
MAX: 748,971 | AVG: 878,689
MIN: 876,333
MAX: 902,884 | AVG: 804,940
MIN: 802,583
MAX: 829,134 | +| kyber768 (100 executions) | m4fstack | AVG: 727,958
MIN: 725,958
MAX: 752,786 | AVG: 884,567
MIN: 882,566
MAX: 909,395 | AVG: 811,833
MIN: 809,833
MAX: 836,662 | | kyber768-90s (100 executions) | clean | AVG: 1,811,726
MIN: 1,810,670
MAX: 1,827,045 | AVG: 2,034,922
MIN: 2,033,865
MAX: 2,050,240 | AVG: 2,222,406
MIN: 2,221,348
MAX: 2,237,723 | -| kyber768-90s (100 executions) | m4fspeed | AVG: 632,398
MIN: 621,252
MAX: 638,272 | AVG: 711,096
MIN: 699,950
MAX: 717,003 | AVG: 710,430
MIN: 699,284
MAX: 716,304 | -| kyber768-90s (100 executions) | m4fstack | AVG: 638,129
MIN: 630,786
MAX: 643,556 | AVG: 719,513
MIN: 712,171
MAX: 724,940 | AVG: 719,862
MIN: 712,520
MAX: 725,289 | +| kyber768-90s (100 executions) | m4fspeed | AVG: 632,999
MIN: 628,813
MAX: 638,215 | AVG: 711,686
MIN: 707,500
MAX: 716,901 | AVG: 711,031
MIN: 706,845
MAX: 716,246 | +| kyber768-90s (100 executions) | m4fstack | AVG: 638,316
MIN: 630,857
MAX: 643,504 | AVG: 719,690
MIN: 712,230
MAX: 724,877 | AVG: 720,051
MIN: 712,592
MAX: 725,239 | | lightsaber (100 executions) | clean | AVG: 1,023,162
MIN: 1,023,162
MAX: 1,023,163 | AVG: 1,494,462
MIN: 1,494,456
MAX: 1,494,495 | AVG: 1,805,694
MIN: 1,805,691
MAX: 1,805,725 | | lightsaber (100 executions) | m4fspeed | AVG: 352,196
MIN: 352,196
MAX: 352,229 | AVG: 481,006
MIN: 481,005
MAX: 481,038 | AVG: 452,654
MIN: 452,653
MAX: 452,687 | | lightsaber (100 executions) | m4fstack | AVG: 422,434
MIN: 422,434
MAX: 422,467 | AVG: 591,556
MIN: 591,555
MAX: 591,588 | AVG: 581,073
MIN: 581,072
MAX: 581,106 | @@ -79,12 +79,12 @@ | scheme | implementation | key generation [cycles] | sign [cycles] | verify [cycles] | | ------ | -------------- | ----------------------- | ------------- | --------------- | | dilithium2 (100 executions) | clean | AVG: 1,976,311
MIN: 1,934,124
MAX: 2,022,613 | AVG: 7,465,108
MIN: 3,241,343
MAX: 29,601,126 | AVG: 2,109,292
MIN: 2,108,823
MAX: 2,109,692 | -| dilithium2 (10000 executions) | m4f | AVG: 1,597,282
MIN: 1,543,011
MAX: 1,644,501 | AVG: 4,119,336
MIN: 1,983,077
MAX: 34,081,046 | AVG: 1,572,328
MIN: 1,571,561
MAX: 1,572,863 | +| dilithium2 (10000 executions) | m4f | AVG: 1,597,200
MIN: 1,542,979
MAX: 1,644,602 | AVG: 4,095,865
MIN: 1,973,993
MAX: 23,877,766 | AVG: 1,572,329
MIN: 1,571,611
MAX: 1,572,899 | | dilithium2aes (100 executions) | clean | AVG: 5,153,665
MIN: 5,109,045
MAX: 5,227,715 | AVG: 12,016,668
MIN: 6,375,642
MAX: 28,738,015 | AVG: 4,824,282
MIN: 4,779,372
MAX: 4,898,600 | | dilithium3 (100 executions) | clean | AVG: 3,414,513
MIN: 3,413,395
MAX: 3,416,061 | AVG: 11,722,059
MIN: 5,037,981
MAX: 36,169,675 | AVG: 3,499,388
MIN: 3,498,955
MAX: 3,499,805 | -| dilithium3 (10000 executions) | m4f | AVG: 2,829,260
MIN: 2,827,405
MAX: 2,842,880 | AVG: 6,652,990
MIN: 3,235,358
MAX: 42,043,815 | AVG: 2,691,471
MIN: 2,690,861
MAX: 2,691,949 | +| dilithium3 (10000 executions) | m4f | AVG: 2,829,250
MIN: 2,827,244
MAX: 2,831,235 | AVG: 6,610,160
MIN: 3,222,476
MAX: 33,943,453 | AVG: 2,691,469
MIN: 2,690,961
MAX: 2,691,943 | | dilithium3aes (100 executions) | clean | AVG: 9,258,325
MIN: 9,166,749
MAX: 9,369,734 | AVG: 19,417,325
MIN: 10,745,071
MAX: 60,023,085 | AVG: 8,581,938
MIN: 8,491,758
MAX: 8,694,807 | -| dilithium5 (10000 executions) | m4f | AVG: 4,826,132
MIN: 4,737,167
MAX: 4,901,952 | AVG: 8,817,385
MIN: 5,433,369
MAX: 40,315,104 | AVG: 4,705,982
MIN: 4,705,308
MAX: 4,706,614 | +| dilithium5 (10000 executions) | m4f | AVG: 4,826,293
MIN: 4,737,193
MAX: 4,901,769 | AVG: 8,767,067
MIN: 5,413,471
MAX: 45,368,141 | AVG: 4,705,981
MIN: 4,705,302
MAX: 4,706,551 | | falcon-1024 (100 executions) | clean | AVG: 582,455,197
MIN: 338,850,289
MAX: 1,754,663,445 | AVG: 133,655,078
MIN: 133,335,905
MAX: 133,985,773 | AVG: 1,526,901
MIN: 1,526,233
MAX: 1,527,648 | | falcon-1024 (100 executions) | m4-ct | AVG: 458,300,837
MIN: 273,960,881
MAX: 1,558,842,038 | AVG: 85,160,712
MIN: 84,941,964
MAX: 85,410,952 | AVG: 977,811
MIN: 966,969
MAX: 985,555 | | falcon-1024 (100 executions) | opt-ct | AVG: 445,577,914
MIN: 273,960,881
MAX: 1,180,316,927 | AVG: 85,152,483
MIN: 84,871,257
MAX: 85,396,462 | AVG: 978,443
MIN: 966,990
MAX: 985,220 | @@ -217,12 +217,12 @@ | Scheme | Implementation | Key Generation [bytes] | Sign [bytes] | Verify [bytes] | | ------ | -------------- | ---------------------- | ------------ | -------------- | | dilithium2 | clean | 38,284 | 51,908 | 36,196 | -| dilithium2 | m4f | 38,276 | 49,356 | 36,296 | +| dilithium2 | m4f | 38,276 | 49,356 | 36,188 | | dilithium2aes | clean | 39,764 | 53,388 | 37,676 | | dilithium3 | clean | 60,812 | 79,664 | 57,700 | | dilithium3 | m4f | 60,804 | 68,804 | 57,692 | | dilithium3aes | clean | 62,292 | 81,036 | 59,180 | -| dilithium5 | m4f | 97,776 | 116,016 | 92,872 | +| dilithium5 | m4f | 97,668 | 115,908 | 92,764 | | falcon-1024 | clean | 36,264 | 82,428 | 8,796 | | falcon-1024 | m4-ct | 1,488 | 2,568 | 496 | | falcon-1024 | opt-ct | 1,448 | 2,568 | 388 | @@ -297,13 +297,13 @@ | kyber512 | m4fspeed | 78.1% | 80.5% | 70.6% | | kyber512 | m4fstack | 78.0% | 80.2% | 70.2% | | kyber512-90s | clean | 68.8% | 62.8% | 52.7% | -| kyber512-90s | m4fspeed | 71.0% | 72.8% | 64.2% | +| kyber512-90s | m4fspeed | 71.1% | 72.8% | 64.2% | | kyber512-90s | m4fstack | 70.8% | 72.3% | 63.7% | | kyber768 | clean | 52.8% | 52.4% | 39.7% | | kyber768 | m4fspeed | 76.5% | 80.3% | 72.0% | | kyber768 | m4fstack | 76.1% | 79.8% | 71.4% | | kyber768-90s | clean | 71.5% | 67.4% | 59.5% | -| kyber768-90s | m4fspeed | 68.8% | 71.8% | 64.8% | +| kyber768-90s | m4fspeed | 68.9% | 71.8% | 64.8% | | kyber768-90s | m4fstack | 68.3% | 71.1% | 64.0% | | lightsaber | clean | 24.9% | 23.8% | 15.5% | | lightsaber | m4fspeed | 72.3% | 74.0% | 61.9% | @@ -355,12 +355,12 @@ | Scheme | Implementation | Key Generation [%] | Sign [%] | Verify [%] | | ------ | -------------- | ------------------ | -------- | ---------- | | dilithium2 | clean | 67.1% | 37.3% | 60.1% | -| dilithium2 | m4f | 82.8% | 65.2% | 80.4% | +| dilithium2 | m4f | 82.8% | 66.6% | 80.4% | | dilithium2aes | clean | 2.7% | 3.3% | 5.0% | | dilithium3 | clean | 70.5% | 37.0% | 63.6% | -| dilithium3 | m4f | 85.0% | 64.4% | 82.6% | +| dilithium3 | m4f | 85.0% | 64.7% | 82.6% | | dilithium3aes | clean | 2.2% | 2.5% | 3.5% | -| dilithium5 | m4f | 86.0% | 69.1% | 84.5% | +| dilithium5 | m4f | 86.0% | 69.7% | 84.5% | | falcon-1024 | clean | 8.4% | 0.3% | 26.9% | | falcon-1024 | m4-ct | 10.6% | 0.5% | 34.2% | | falcon-1024 | opt-ct | 11.1% | 0.5% | 34.3% | @@ -426,23 +426,23 @@ | frodokem640shake | opt | 6,796 | 0 | 0 | 6,796 | | hqc-rmrs-128 | clean | 18,436 | 0 | 0 | 18,436 | | kyber1024 | clean | 6,084 | 0 | 0 | 6,084 | -| kyber1024 | m4fspeed | 18,324 | 0 | 0 | 18,324 | -| kyber1024 | m4fstack | 15,452 | 0 | 0 | 15,452 | +| kyber1024 | m4fspeed | 18,332 | 0 | 0 | 18,332 | +| kyber1024 | m4fstack | 15,460 | 0 | 0 | 15,460 | | kyber1024-90s | clean | 6,468 | 0 | 0 | 6,468 | | kyber1024-90s | m4fspeed | 18,832 | 0 | 0 | 18,832 | | kyber1024-90s | m4fstack | 15,744 | 0 | 0 | 15,744 | | kyber512 | clean | 4,900 | 0 | 0 | 4,900 | -| kyber512 | m4fspeed | 17,220 | 0 | 0 | 17,220 | -| kyber512 | m4fstack | 14,624 | 0 | 0 | 14,624 | +| kyber512 | m4fspeed | 17,228 | 0 | 0 | 17,228 | +| kyber512 | m4fstack | 14,632 | 0 | 0 | 14,632 | | kyber512-90s | clean | 5,144 | 0 | 0 | 5,144 | -| kyber512-90s | m4fspeed | 17,728 | 0 | 0 | 17,728 | -| kyber512-90s | m4fstack | 14,900 | 0 | 0 | 14,900 | +| kyber512-90s | m4fspeed | 17,736 | 0 | 0 | 17,736 | +| kyber512-90s | m4fstack | 14,908 | 0 | 0 | 14,908 | | kyber768 | clean | 4,912 | 0 | 0 | 4,912 | -| kyber768 | m4fspeed | 17,532 | 0 | 0 | 17,532 | -| kyber768 | m4fstack | 14,528 | 0 | 0 | 14,528 | +| kyber768 | m4fspeed | 17,540 | 0 | 0 | 17,540 | +| kyber768 | m4fstack | 14,536 | 0 | 0 | 14,536 | | kyber768-90s | clean | 5,176 | 0 | 0 | 5,176 | -| kyber768-90s | m4fspeed | 18,064 | 0 | 0 | 18,064 | -| kyber768-90s | m4fstack | 14,784 | 0 | 0 | 14,784 | +| kyber768-90s | m4fspeed | 18,072 | 0 | 0 | 18,072 | +| kyber768-90s | m4fstack | 14,792 | 0 | 0 | 14,792 | | lightsaber | clean | 10,368 | 0 | 0 | 10,368 | | lightsaber | m4fspeed | 18,900 | 0 | 0 | 18,900 | | lightsaber | m4fstack | 19,712 | 0 | 0 | 19,712 | @@ -493,12 +493,12 @@ | Scheme | Implementation | .text [bytes] | .data [bytes] | .bss [bytes] | Total [bytes] | | ------ | -------------- | ------------- | ------------- | ------------ | ------------- | | dilithium2 | clean | 7,948 | 0 | 0 | 7,948 | -| dilithium2 | m4f | 18,440 | 0 | 0 | 18,440 | +| dilithium2 | m4f | 18,424 | 0 | 0 | 18,424 | | dilithium2aes | clean | 14,982 | 0 | 0 | 14,982 | | dilithium3 | clean | 7,444 | 0 | 0 | 7,444 | -| dilithium3 | m4f | 19,912 | 0 | 0 | 19,912 | +| dilithium3 | m4f | 19,904 | 0 | 0 | 19,904 | | dilithium3aes | clean | 14,470 | 0 | 0 | 14,470 | -| dilithium5 | m4f | 18,236 | 0 | 0 | 18,236 | +| dilithium5 | m4f | 18,220 | 0 | 0 | 18,220 | | falcon-1024 | clean | 82,285 | 0 | 0 | 82,285 | | falcon-1024 | m4-ct | 81,265 | 0 | 79,872 | 161,137 | | falcon-1024 | opt-ct | 81,265 | 0 | 79,872 | 161,137 | diff --git a/crypto_kem/kyber1024-90s/m4fstack/fastinvntt.S b/crypto_kem/kyber1024-90s/m4fstack/fastinvntt.S index 38ea2e36..8b242d6c 120000 --- a/crypto_kem/kyber1024-90s/m4fstack/fastinvntt.S +++ b/crypto_kem/kyber1024-90s/m4fstack/fastinvntt.S @@ -1 +1 @@ -../../kyber768/m4fspeed/fastinvntt.S \ No newline at end of file +../../kyber768/m4fstack/fastinvntt.S \ No newline at end of file diff --git a/crypto_kem/kyber1024/m4fstack/fastinvntt.S b/crypto_kem/kyber1024/m4fstack/fastinvntt.S index 1ad2d319..8b242d6c 120000 --- a/crypto_kem/kyber1024/m4fstack/fastinvntt.S +++ b/crypto_kem/kyber1024/m4fstack/fastinvntt.S @@ -1 +1 @@ -../m4fspeed/fastinvntt.S \ No newline at end of file +../../kyber768/m4fstack/fastinvntt.S \ No newline at end of file diff --git a/crypto_kem/kyber512-90s/m4fstack/fastinvntt.S b/crypto_kem/kyber512-90s/m4fstack/fastinvntt.S index 38ea2e36..8b242d6c 120000 --- a/crypto_kem/kyber512-90s/m4fstack/fastinvntt.S +++ b/crypto_kem/kyber512-90s/m4fstack/fastinvntt.S @@ -1 +1 @@ -../../kyber768/m4fspeed/fastinvntt.S \ No newline at end of file +../../kyber768/m4fstack/fastinvntt.S \ No newline at end of file diff --git a/crypto_kem/kyber512/m4fstack/fastinvntt.S b/crypto_kem/kyber512/m4fstack/fastinvntt.S index 1ad2d319..8b242d6c 120000 --- a/crypto_kem/kyber512/m4fstack/fastinvntt.S +++ b/crypto_kem/kyber512/m4fstack/fastinvntt.S @@ -1 +1 @@ -../m4fspeed/fastinvntt.S \ No newline at end of file +../../kyber768/m4fstack/fastinvntt.S \ No newline at end of file diff --git a/crypto_kem/kyber768-90s/m4fspeed/matacc_asm.S b/crypto_kem/kyber768-90s/m4fspeed/matacc_asm.S index 92bf6b9a..3237dd36 100644 --- a/crypto_kem/kyber768-90s/m4fspeed/matacc_asm.S +++ b/crypto_kem/kyber768-90s/m4fspeed/matacc_asm.S @@ -41,7 +41,7 @@ mov.w r0, \tmp // buf + off implicitly after copying loop mov r1, #1 - vmov r2, s20 // get state ptr + vmov r2, s10 // get state ptr bl aes256xof_squeezeblocks vmov r0, s2 @@ -83,13 +83,13 @@ matacc_asm_cache_16_32: ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack ldr.w tmp, [sp, #14*4] // load state from stack - vmov s20, tmp + vmov s10, tmp ldr.w tmp, [sp, #15*4] // load aprimeptr from stack - vmov s21, tmp + vmov s11, tmp movw tmp2, #64 // XOF_BLOCKBYTES - vmov s23, tmp2 + vmov s13, tmp2 movw q, #3329 movw k, #0 @@ -105,7 +105,7 @@ matacc_asm_cache_16_32: second_if doublebasemul_asm_cache_16_32, tmp, tmp2, tmp3, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qqinv, ctr - update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s23 + update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s13 pop {r0-r11, pc} .size matacc_asm_cache_16_32, . - matacc_asm_cache_16_32 @@ -136,13 +136,13 @@ matacc_asm_cache_32_32: ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack ldr.w tmp, [sp, #14*4] // load state from stack - vmov s20, tmp + vmov s10, tmp ldr.w tmp, [sp, #15*4] // load aprimeptr from stack - vmov s21, tmp + vmov s11, tmp movw tmp2, #64 // XOF_BLOCKBYTES - vmov s23, tmp2 + vmov s13, tmp2 movw q, #3329 movw k, #0 @@ -158,7 +158,7 @@ matacc_asm_cache_32_32: second_if doublebasemul_asm_acc_cache_32_32, tmp, tmp2, tmp3, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qqinv, ctr - update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s23 + update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s13 pop {r0-r11, pc} .size matacc_asm_cache_32_32, . - matacc_asm_cache_32_32 @@ -190,17 +190,17 @@ matacc_asm_cache_32_16: ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack ldr.w tmp, [sp, #14*4] // load state from stack - vmov s20, tmp + vmov s10, tmp ldr.w tmp, [sp, #15*4] // load aprimeptr from stack - vmov s21, tmp + vmov s11, tmp - vmov s22, rptr // store "real" destinaton in FP - vmov s23, rptr // backup + vmov s12, rptr // store "real" destinaton in FP + vmov s13, rptr // backup ldr.w rptr, [sp, #16*4] movw tmp2, #64 // XOF_BLOCKBYTES - vmov s25, tmp2 + vmov s15, tmp2 movw q, #3329 movw k, #0 @@ -216,9 +216,9 @@ matacc_asm_cache_32_16: second_if doublebasemul_asm_acc_cache_32_16, tmp, tmp2, tmp3, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qqinv, ctr - update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s25 + update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s15 - vmov rptr, s23 + vmov rptr, s13 pop {r0-r11, pc} .size matacc_asm_cache_32_16, . - matacc_asm_cache_32_16 @@ -253,13 +253,13 @@ matacc_asm_opt_16_32: movt qqinv, #3327 ldr.w tmp, [sp, #13*4] // load state from stack - vmov s20, tmp + vmov s10, tmp ldr.w tmp, [sp, #14*4] // load aprimeptr from stack - vmov s21, tmp + vmov s11, tmp movw tmp2, #64 // XOF_BLOCKBYTES - vmov s23, tmp2 + vmov s13, tmp2 movw q, #3329 movw k, #0 @@ -275,7 +275,7 @@ matacc_asm_opt_16_32: second_if doublebasemul_asm_opt_16_32, tmp, tmp2, tmp3, val0, val1, rptr, bptr, cptr, bufptr, tmp4, k, q, qqinv, ctr - update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s23 + update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s13 pop {r0-r11, pc} .size matacc_asm_opt_16_32, . - matacc_asm_opt_16_32 @@ -309,13 +309,13 @@ matacc_asm_opt_32_32: movt qqinv, #3327 ldr.w tmp, [sp, #13*4] // load state from stack - vmov s20, tmp + vmov s10, tmp ldr.w tmp, [sp, #14*4] // load aprimeptr from stack - vmov s21, tmp + vmov s11, tmp movw tmp2, #64 // XOF_BLOCKBYTES - vmov s23, tmp2 + vmov s13, tmp2 movw q, #3329 movw k, #0 @@ -331,7 +331,7 @@ matacc_asm_opt_32_32: second_if doublebasemul_asm_acc_opt_32_32, tmp, tmp2, tmp3, val0, val1, rptr, bptr, cptr, bufptr, tmp4, k, q, qqinv, ctr - update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s23 + update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s13 pop {r0-r11, pc} .size matacc_asm_opt_32_32, . - matacc_asm_opt_32_32 @@ -365,17 +365,17 @@ matacc_asm_opt_32_16: movt qqinv, #3327 ldr.w tmp, [sp, #13*4] // load state from stack - vmov s20, tmp + vmov s10, tmp ldr.w tmp, [sp, #14*4] // load aprimeptr from stack - vmov s21, tmp + vmov s11, tmp - vmov s22, rptr // store "real" destinaton in FP - vmov s23, rptr // backup + vmov s12, rptr // store "real" destinaton in FP + vmov s13, rptr // backup ldr.w rptr, [sp, #15*4] movw tmp2, #64 // XOF_BLOCKBYTES - vmov s25, tmp2 + vmov s15, tmp2 movw q, #3329 movw k, #0 @@ -391,9 +391,9 @@ matacc_asm_opt_32_16: second_if doublebasemul_asm_acc_opt_32_16, tmp, tmp2, tmp3, val0, val1, rptr, bptr, cptr, bufptr, tmp4, k, q, qqinv, ctr - update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s25 + update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s15 - vmov rptr, s23 + vmov rptr, s13 pop {r0-r11, pc} .size matacc_asm_opt_32_16, . - matacc_asm_opt_32_16 \ No newline at end of file diff --git a/crypto_kem/kyber768-90s/m4fstack/fastinvntt.S b/crypto_kem/kyber768-90s/m4fstack/fastinvntt.S index 38ea2e36..8b242d6c 120000 --- a/crypto_kem/kyber768-90s/m4fstack/fastinvntt.S +++ b/crypto_kem/kyber768-90s/m4fstack/fastinvntt.S @@ -1 +1 @@ -../../kyber768/m4fspeed/fastinvntt.S \ No newline at end of file +../../kyber768/m4fstack/fastinvntt.S \ No newline at end of file diff --git a/crypto_kem/kyber768-90s/m4fstack/matacc_asm.S b/crypto_kem/kyber768-90s/m4fstack/matacc_asm.S index 6be35156..80ce479e 100644 --- a/crypto_kem/kyber768-90s/m4fstack/matacc_asm.S +++ b/crypto_kem/kyber768-90s/m4fstack/matacc_asm.S @@ -40,7 +40,7 @@ mov r0, \tmp // buf + off implicitly after copying loop mov r1, #1 - vmov r2, s20 // get state ptr + vmov r2, s10 // get state ptr bl aes256xof_squeezeblocks vmov r0, s2 @@ -82,10 +82,10 @@ matacc_asm: ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack ldr.w tmp, [sp, #14*4] // load state from stack - vmov s20, tmp + vmov s10, tmp movw tmp2, #64 // XOF_BLOCKBYTES - vmov s21, tmp2 + vmov s11, tmp2 movw q, #3329 movw k, #0 @@ -101,7 +101,7 @@ matacc_asm: second_if doublebasemul_asm, tmp, tmp2, tmp3, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qqinv, ctr - update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s21 + update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s11 pop {r0-r11, pc} .size matacc_asm, . - matacc_asm @@ -131,10 +131,10 @@ matacc_asm_acc: ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack ldr.w tmp, [sp, #14*4] // load state from stack - vmov s20, tmp + vmov s10, tmp movw tmp2, #64 // XOF_BLOCKBYTES - vmov s21, tmp2 + vmov s11, tmp2 movw q, #3329 movw k, #0 @@ -150,7 +150,7 @@ matacc_asm_acc: second_if doublebasemul_asm_acc, tmp, tmp2, tmp3, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qqinv, ctr - update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s21 + update_buf_loop_finish tmp, tmp2, tmp3, val0, val1, bufptr, ctr, s11 pop {r0-r11, pc} .size matacc_asm_acc, . - matacc_asm_acc diff --git a/crypto_kem/kyber768/m4fspeed/fastinvntt.S b/crypto_kem/kyber768/m4fspeed/fastinvntt.S index d9a5ee83..984d067e 100644 --- a/crypto_kem/kyber768/m4fspeed/fastinvntt.S +++ b/crypto_kem/kyber768/m4fspeed/fastinvntt.S @@ -150,15 +150,6 @@ invntt_fast: movw q, #3329 movt qinv, #3327 - - // barrettconst = -(2^(32)/KYBER_Q) - movw barrettconst, #0x5049 - movt barrettconst, #0xffec - vmov s10, barrettconst - - // barrettconst2 = 2^(15) - movw barrettconst, #32768 - vmov s11, barrettconst ### LAYER 7+6+5+4 .equ distance, 16 @@ -166,32 +157,32 @@ invntt_fast: .equ strincr, 64 // pre-load twiddle factors to FPU registers - vldm twiddle_ptr!, {s20-s27} + vldm twiddle_ptr!, {s8-s15} add.w tmp, poly, #8*strincr - vmov s12, tmp + vmov s8, tmp 1: // load a1, a3, ..., a15 load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset // NTT on a1, a3, ..., a15 - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2 + _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2 // multiply coeffs by layer 4 twiddles for later use - vmov twiddle, s24 + vmov twiddle, s12 mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv // could be omitted but kept for reduction only mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s25 + vmov twiddle, s13 mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s26 + vmov twiddle, s14 mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s27 + vmov twiddle, s15 mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv @@ -211,7 +202,7 @@ invntt_fast: load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 // NTT on a0, a2, ..., a14 - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2 + _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2 // layer 4 - 1 // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) @@ -265,13 +256,23 @@ invntt_fast: str.w tmp2, [poly, #offset] str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) - vmov tmp, s12 + vmov tmp, s8 cmp.w poly, tmp bne.w 1b sub.w poly, #8*strincr ### LAYER 3+2+1 + + // barrettconst = -(2^(32)/KYBER_Q) + movw barrettconst, #0x5049 + movt barrettconst, #0xffec + vmov s10, barrettconst + + // barrettconst2 = 2^(15) + movw barrettconst, #32768 + vmov s11, barrettconst + .equ distance, distance*16 .equ strincr, 4 @@ -280,14 +281,14 @@ invntt_fast: load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 vmov barrettconst, s10 - vmov s2, poly + vmov s15, poly vmov poly, s11 half_barrett poly0, poly1, poly2, poly3, barrettconst, poly, tmp, tmp2, q half_barrett poly4, poly5, poly6, poly7, barrettconst, poly, tmp, tmp2, q - vmov poly, s2 - vldm twiddle_ptr!, {s21-s23} + vmov poly, s15 + vldm twiddle_ptr!, {s0-s2} - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2 + _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s0, s0, s1, s2, twiddle, q, qinv, tmp, tmp2 // twisting _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 @@ -330,10 +331,10 @@ invntt_fast: load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 vmov barrettconst, s10 - vmov s2, poly + vmov s15, poly vmov poly, s11 half_barrett poly0, poly2, poly4, poly6, barrettconst, poly, tmp, tmp2, q - vmov poly, s2 + vmov poly, s15 _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 diff --git a/crypto_kem/kyber768/m4fspeed/fastntt.S b/crypto_kem/kyber768/m4fspeed/fastntt.S index a3841c98..3971e2f2 100644 --- a/crypto_kem/kyber768/m4fspeed/fastntt.S +++ b/crypto_kem/kyber768/m4fspeed/fastntt.S @@ -79,7 +79,7 @@ .align 2 ntt_fast: push {r4-r11, r14} - + vpush.w {s16} poly .req r0 twiddle_ptr .req r1 poly0 .req r2 @@ -100,46 +100,37 @@ ntt_fast: movw q, #3329 movt qinv, #3327 - // barrettconst = -(2^(32)/KYBER_Q) - movw barrettconst, #0x5049 - movt barrettconst, #0xffec - vmov s10, barrettconst - - // barrettconst2 = 2^(15) - movw barrettconst, #32768 - vmov s11, barrettconst - ### LAYER 7+6+5+4 .equ distance, 256 .equ offset, 32 .equ strincr, 4 // pre-load twiddle factors to FPU registers - vldm twiddle_ptr!, {s20-s27} + vldm twiddle_ptr!, {s8-s15} add tmp, poly, #strincr*8 - vmov s12, tmp + vmov s16, tmp 1: // load a1, a3, ..., a15 load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset // 8-NTT on a1, a3, ..., a15 - _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, qinv, q, tmp, tmp2 + _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2 // multiply coeffs by layer 4 twiddles for later use - vmov twiddle, s24 + vmov twiddle, s12 mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s25 + vmov twiddle, s13 mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s26 + vmov twiddle, s14 mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s27 + vmov twiddle, s15 mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv @@ -157,7 +148,7 @@ ntt_fast: load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 // 8-NTT on a0, a2, ..., a14 - _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, qinv, q, tmp, tmp2 + _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2 // layer 4 - 1 // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) @@ -211,7 +202,7 @@ ntt_fast: str.w tmp2, [poly, #offset] str.w tmp, [poly], #4 - vmov tmp, s12 + vmov tmp, s16 cmp.w poly, tmp bne.w 1b @@ -219,6 +210,15 @@ ntt_fast: ### LAYER 3+2+1 + // barrettconst = -(2^(32)/KYBER_Q) + movw barrettconst, #0x5049 + movt barrettconst, #0xffec + vmov s10, barrettconst + + // barrettconst2 = 2^(15) + movw barrettconst, #32768 + vmov s11, barrettconst + .equ distance, distance/16 .equ strincr, 32 @@ -250,5 +250,5 @@ ntt_fast: vmov tmp, s13 cmp.w poly, tmp bne.w 2b - + vpop.w {s16} pop {r4-r11, pc} diff --git a/crypto_kem/kyber768/m4fspeed/matacc.i b/crypto_kem/kyber768/m4fspeed/matacc.i index 97278559..7f712e8a 100644 --- a/crypto_kem/kyber768/m4fspeed/matacc.i +++ b/crypto_kem/kyber768/m4fspeed/matacc.i @@ -57,7 +57,7 @@ .endm .macro doublebasemul_asm_cache_16_32 rptr_tmp, aptr, bptr, zetaptr, poly0, poly1, poly2, poly3, q, qinv, tmp, tmp2, res, aprimeptr, zeta - vmov \aprimeptr, s21 + vmov \aprimeptr, s11 ldr \poly0, [\aptr], #4 ldr \poly1, [\bptr] ldr \poly2, [\aptr], #4 @@ -89,11 +89,11 @@ smuadx \tmp, \poly2, \poly3 str.w \tmp, [\rptr_tmp, #4] str \tmp2, [\rptr_tmp], #8 - vmov s21, \aprimeptr + vmov s11, \aprimeptr .endm .macro doublebasemul_asm_acc_cache_32_32 rptr_tmp, aptr, bptr, zetaptr, poly0, poly1, poly2, poly3, q, qinv, tmp, tmp2, res, aprimeptr, zeta - vmov \aprimeptr, s21 + vmov \aprimeptr, s11 ldr \poly0, [\aptr], #4 ldr \poly1, [\bptr] ldr \poly2, [\aptr], #4 @@ -129,11 +129,11 @@ smladx \res, \poly2, \poly3, \res str \res, [\rptr_tmp], #4 - vmov s21, \aprimeptr + vmov s11, \aprimeptr .endm .macro doublebasemul_asm_acc_cache_32_16 rptr_tmp, aptr, bptr, zetaptr, poly0, poly1, poly2, poly3, q, qinv, tmp, tmp2, res, aprimeptr, zeta - vmov \aprimeptr, s21 + vmov \aprimeptr, s11 ldr \poly0, [\aptr], #4 ldr \poly1, [\bptr] ldr \poly2, [\aptr], #4 @@ -155,7 +155,7 @@ montgomery \q, \qinv, \tmp, \poly0 pkhtb \res, \poly0, \tmp2, asr#16 - vmov \poly0, s22 + vmov \poly0, s12 str \res, [\poly0], #4 neg \zeta, \zeta @@ -177,8 +177,8 @@ pkhtb \res, \tmp, \tmp2, asr#16 str \res, [\poly0], #4 - vmov s22, \poly0 - vmov s21, \aprimeptr + vmov s12, \poly0 + vmov s11, \aprimeptr .endm .macro load_vals val0, val1, bufptr, tmp @@ -191,7 +191,7 @@ .endm .macro doublebasemul_asm_opt_16_32 rptr_tmp, aptr, bptr, tmp3, poly0, poly1, poly2, poly3, q, qinv, tmp, tmp2, res, aprimeptr, tmp4 - vmov \aprimeptr, s21 + vmov \aprimeptr, s11 ldr \poly0, [\aptr], #4 ldr \poly1, [\bptr] ldr \poly2, [\aptr], #4 @@ -215,11 +215,11 @@ str.w \tmp2, [\rptr_tmp], #4 str.w \tmp3, [\rptr_tmp], #4 - vmov s21, \aprimeptr + vmov s11, \aprimeptr .endm .macro doublebasemul_asm_acc_opt_32_32 rptr_tmp, aptr, bptr, tmp3, poly0, poly1, poly2, poly3, q, qinv, tmp, tmp2, res, aprimeptr, tmp4 - vmov \aprimeptr, s21 + vmov \aprimeptr, s11 ldr.w \poly0, [\aptr], #4 ldr.w \poly1, [\bptr] ldr.w \poly2, [\aptr], #4 @@ -249,11 +249,11 @@ str.w \tmp, [\rptr_tmp, #4] str \tmp4, [\rptr_tmp], #8 - vmov s21, \aprimeptr + vmov s11, \aprimeptr .endm .macro doublebasemul_asm_acc_opt_32_16 rptr_tmp, aptr, bptr, tmp3, poly0, poly1, poly2, poly3, q, qinv, tmp, tmp2, res, aprimeptr, tmp4 - vmov \aprimeptr, s21 + vmov \aprimeptr, s11 ldr \poly0, [\aptr], #4 ldr \poly1, [\bptr] @@ -274,7 +274,7 @@ montgomery \q, \qinv, \tmp, \tmp3 pkhtb \res, \tmp3, \res, asr#16 - vmov \poly0, s22 + vmov \poly0, s12 str \res, [\poly0], #4 ldr \tmp2, [\aprimeptr], #4 // load cached value @@ -291,6 +291,6 @@ pkhtb \res, \tmp3, \res, asr#16 str \res, [\poly0], #4 - vmov s22, \poly0 - vmov s21, \aprimeptr + vmov s12, \poly0 + vmov s11, \aprimeptr .endm diff --git a/crypto_kem/kyber768/m4fspeed/matacc_asm.S b/crypto_kem/kyber768/m4fspeed/matacc_asm.S index 446836e5..79c09808 100644 --- a/crypto_kem/kyber768/m4fspeed/matacc_asm.S +++ b/crypto_kem/kyber768/m4fspeed/matacc_asm.S @@ -24,8 +24,10 @@ mov \rptr, \bufptr movw \bptr, #1 - vmov \cptr, s20 // load state + vmov \cptr, s10 // load state + bl kyber_shake128_squeezeblocks + vmov \rptr, s2 vmov \bptr, s3 vmov \cptr, s4 @@ -60,10 +62,10 @@ matacc_asm_cache_16_32: ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack ldr.w tmp, [sp, #14*4] // load state from stack - vmov s20, tmp + vmov s10, tmp ldr.w tmp, [sp, #15*4] // load aprimeptr from stack - vmov s21, tmp + vmov s11, tmp movw q, #3329 movw k, #0 @@ -113,10 +115,10 @@ matacc_asm_cache_32_32: ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack ldr.w tmp, [sp, #14*4] // load state from stack - vmov s20, tmp + vmov s10, tmp ldr.w tmp, [sp, #15*4] // load aprimeptr from stack - vmov s21, tmp + vmov s11, tmp movw q, #3329 movw k, #0 @@ -167,13 +169,13 @@ matacc_asm_cache_32_16: ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack ldr.w tmp, [sp, #14*4] // load state from stack - vmov s20, tmp + vmov s10, tmp ldr.w tmp, [sp, #15*4] // load aprimeptr from stack - vmov s21, tmp + vmov s11, tmp - vmov s22, rptr // store "real" destinaton in FP - vmov s23, rptr // backup + vmov s12, rptr // store "real" destinaton in FP + vmov s13, rptr // backup ldr.w rptr, [sp, #16*4] movw q, #3329 @@ -194,7 +196,7 @@ matacc_asm_cache_32_16: cmp ctr, #256/4 blt.w 1b - vmov rptr, s23 + vmov rptr, s13 pop {r0-r11, pc} .size matacc_asm_cache_32_16, . - matacc_asm_cache_32_16 @@ -229,10 +231,10 @@ matacc_asm_opt_16_32: movt qqinv, #3327 ldr.w tmp, [sp, #13*4] // load state from stack - vmov s20, tmp + vmov s10, tmp ldr.w tmp, [sp, #14*4] // load aprimeptr from stack - vmov s21, tmp + vmov s11, tmp movw q, #3329 movw k, #0 @@ -287,10 +289,10 @@ matacc_asm_opt_32_32: movt qqinv, #3327 ldr.w tmp, [sp, #13*4] // load state from stack - vmov s20, tmp + vmov s10, tmp ldr.w tmp, [sp, #14*4] // load aprimeptr from stack - vmov s21, tmp + vmov s11, tmp movw q, #3329 movw k, #0 @@ -343,13 +345,13 @@ matacc_asm_opt_32_16: movt qqinv, #3327 ldr.w tmp, [sp, #13*4] // load state from stack - vmov s20, tmp + vmov s10, tmp ldr.w tmp, [sp, #14*4] // load aprimeptr from stack - vmov s21, tmp + vmov s11, tmp - vmov s22, rptr // store "real" destinaton in FP - vmov s23, rptr // backup + vmov s12, rptr // store "real" destinaton in FP + vmov s13, rptr // backup ldr.w rptr, [sp, #15*4] movw q, #3329 @@ -371,7 +373,7 @@ matacc_asm_opt_32_16: cmp ctr, #256/4 blt.w 1b - vmov rptr, s23 + vmov rptr, s13 pop {r0-r11, pc} .size matacc_asm_opt_32_16, . - matacc_asm_opt_32_16 \ No newline at end of file diff --git a/crypto_kem/kyber768/m4fstack/matacc.i b/crypto_kem/kyber768/m4fstack/matacc.i index c86e5284..7d2e6456 100644 --- a/crypto_kem/kyber768/m4fstack/matacc.i +++ b/crypto_kem/kyber768/m4fstack/matacc.i @@ -82,7 +82,7 @@ mov \rptr, \bufptr movw \bptr, #1 - vmov \cptr, s20 // load state + vmov \cptr, s10 // load state bl kyber_shake128_squeezeblocks vmov \rptr, s2 diff --git a/crypto_kem/kyber768/m4fstack/matacc_asm.S b/crypto_kem/kyber768/m4fstack/matacc_asm.S index 4fc85b52..bd7b509e 100644 --- a/crypto_kem/kyber768/m4fstack/matacc_asm.S +++ b/crypto_kem/kyber768/m4fstack/matacc_asm.S @@ -31,7 +31,7 @@ matacc_asm: ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack ldr.w tmp, [sp, #14*4] // load state from stack - vmov s20, tmp + vmov s10, tmp movw q, #3329 movw k, #0 @@ -86,7 +86,7 @@ matacc_asm_acc: ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack ldr.w tmp, [sp, #14*4] // load state from stack - vmov s20, tmp + vmov s10, tmp movw q, #3329 movw k, #0 diff --git a/crypto_sign/dilithium2/m4f/ntt.S b/crypto_sign/dilithium2/m4f/ntt.S index 9293eab7..bfd5f7a4 100644 --- a/crypto_sign/dilithium2/m4f/ntt.S +++ b/crypto_sign/dilithium2/m4f/ntt.S @@ -54,7 +54,7 @@ pqcrystals_dilithium_ntt: vmov s0, ptr_zeta add.w temp_l, ptr_p, #32*strincr // 32 iterations - vmov s25, temp_l + vmov s9, temp_l 1: .rept 2 ldr.w pol0, [ptr_p] @@ -77,7 +77,7 @@ pqcrystals_dilithium_ntt: str.w pol7, [ptr_p, #7*distance/4] str.w pol0, [ptr_p], #strincr .endr - vmov temp_l, s25 + vmov temp_l, s9 cmp.w ptr_p, temp_l bne 1b @@ -86,10 +86,10 @@ pqcrystals_dilithium_ntt: // stage 4 - 6 .equ distance, 64 add.w temp_l, ptr_p, #8*112+8*4*4 // 8 iterations - vmov s25, temp_l + vmov s9, temp_l 1: add.w temp_l, ptr_p, #4*strincr // 4 iterations - vmov s26, temp_l + vmov s10, temp_l vmov ptr_zeta, s0 vldm ptr_zeta!, {s2-s8} vmov s0, ptr_zeta @@ -115,12 +115,12 @@ pqcrystals_dilithium_ntt: str.w pol7, [ptr_p, #7*distance/4] str.w pol0, [ptr_p], #4 .endr - vmov temp_l, s26 + vmov temp_l, s10 cmp.w ptr_p, temp_l bne 2b add.w ptr_p, #112 - vmov temp_l, s25 + vmov temp_l, s9 cmp.w ptr_p, temp_l bne 1b @@ -213,7 +213,7 @@ pqcrystals_dilithium_invntt_tomont: vmov s0, ptr_zeta add.w temp_l, ptr_p, #32*strincr // 32 iterations - vmov s25, temp_l + vmov s9, temp_l 1: ldr.w pol4, [ptr_p, #4*distance/4] ldr.w pol1, [ptr_p, #5*distance/4] @@ -235,7 +235,7 @@ pqcrystals_dilithium_invntt_tomont: str.w pol6, [ptr_p, #6*distance/4] str.w pol7, [ptr_p, #7*distance/4] str.w pol0, [ptr_p], #strincr - vmov temp_l, s25 + vmov temp_l, s9 cmp.w ptr_p, temp_l bne.w 1b @@ -248,7 +248,7 @@ pqcrystals_dilithium_invntt_tomont: // iteration 0 movw temp_l, #4 add.w temp_l, ptr_p, #4*256 // 4 iterations - vmov s26, temp_l + vmov s10, temp_l vmov ptr_zeta, s0 vldm ptr_zeta!, {s2-s8} @@ -277,7 +277,7 @@ pqcrystals_dilithium_invntt_tomont: str.w pol0, [ptr_p] add.w ptr_p, #strincr - vmov temp_l, s26 + vmov temp_l, s10 cmp.w temp_l, ptr_p bne.w 2b @@ -285,10 +285,10 @@ pqcrystals_dilithium_invntt_tomont: // iteration 1-7 add.w temp_l, ptr_p, #7*4 // 7 iterations - vmov s25, temp_l + vmov s9, temp_l 1: add.w temp_l, ptr_p, #4*strincr // 4 iterations - vmov s26, temp_l + vmov s10, temp_l vmov ptr_zeta, s0 vldm ptr_zeta!, {s2-s8} @@ -315,12 +315,12 @@ pqcrystals_dilithium_invntt_tomont: str.w pol0, [ptr_p] add.w ptr_p, #strincr - vmov temp_l, s26 + vmov temp_l, s10 cmp.w ptr_p, temp_l bne 2b sub.w ptr_p, #4*strincr-4 - vmov temp_l, s25 + vmov temp_l, s9 cmp.w temp_l, ptr_p bne 1b @@ -331,7 +331,7 @@ pqcrystals_dilithium_invntt_tomont: .equ strincr, 4 add.w cntr, ptr_p, #64*strincr // 64 iterations - vmov s25, cntr + vmov s9, cntr 1: ldr.w zeta1, [ptr_zeta, #4] ldr.w zeta2, [ptr_zeta, #8] @@ -357,7 +357,7 @@ pqcrystals_dilithium_invntt_tomont: str.w pol3, [ptr_p, #768] str pol0, [ptr_p], #strincr - vmov cntr, s25 + vmov cntr, s9 cmp.w cntr, ptr_p bne.w 1b diff --git a/crypto_sign/dilithium2/m4f/sign.c b/crypto_sign/dilithium2/m4f/sign.c index a1d4d7e4..0573f5d3 100644 --- a/crypto_sign/dilithium2/m4f/sign.c +++ b/crypto_sign/dilithium2/m4f/sign.c @@ -165,7 +165,6 @@ int crypto_sign_signature(uint8_t *sig, unsigned int hints_written = 0; /* Check that subtracting cs2 does not change high bits of w and low bits * do not reveal secret information */ - pack_sig_z(sig, &z); for(unsigned int i = 0; i < K; ++i) { poly *tmp = &z.vec[0]; poly_small_basemul_invntt(tmp, &cp_small, &cp_small_prime, &s2_prime[i]); diff --git a/crypto_sign/dilithium3/m4f/smallntt.S b/crypto_sign/dilithium3/m4f/smallntt.S index 5adf8a31..747c111c 100644 --- a/crypto_sign/dilithium3/m4f/smallntt.S +++ b/crypto_sign/dilithium3/m4f/smallntt.S @@ -111,6 +111,7 @@ .align 2 small_ntt_asm: push {r4-r11, r14} + vpush.w {s16} poly .req r0 twiddle_ptr .req r1 @@ -136,33 +137,33 @@ small_ntt_asm: .equ offset, 32 .equ strincr, 4 // pre-load twiddle factors to FPU registers - vldm twiddle_ptr!, {s20-s27} + vldm twiddle_ptr!, {s8-s15} add tmp, poly, #strincr*8 - vmov s12, tmp + vmov s16, tmp 1: // load a1, a3, ..., a15 load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset // 8-NTT on a1, a3, ..., a15 - _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, qinv, q, tmp, tmp2 + _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2 // multiply coeffs by layer 4 twiddles for later use - vmov twiddle, s24 + vmov twiddle, s12 mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s25 + vmov twiddle, s13 mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s26 + vmov twiddle, s14 mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s27 + vmov twiddle, s15 mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv @@ -182,7 +183,7 @@ small_ntt_asm: load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 // 8-NTT on a0, a2, ..., a14 - _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, qinv, q, tmp, tmp2 + _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2 // layer 4 - 1 // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) @@ -246,7 +247,7 @@ small_ntt_asm: str.w poly7, [poly, #6*distance/4+offset] str.w poly0, [poly], #4 - vmov tmp, s12 + vmov tmp, s16 cmp.w poly, tmp bne.w 1b @@ -276,6 +277,7 @@ small_ntt_asm: cmp.w poly, tmp bne.w 2b + vpop.w {s16} pop {r4-r11, pc} @@ -493,32 +495,32 @@ small_invntt_tomont_asm: .equ strincr, 64 // pre-load twiddle factors to FPU registers - vldm twiddle_ptr!, {s20-s27} + vldm twiddle_ptr!, {s8-s15} add.w tmp, poly, #8*strincr - vmov s12, tmp + vmov s8, tmp 1: // load a1, a3, ..., a15 load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset // NTT on a1, a3, ..., a15 - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2 + _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2 // multiply coeffs by layer 4 twiddles for later use - vmov twiddle, s24 + vmov twiddle, s12 mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv // could be omitted but kept for reduction only mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s25 + vmov twiddle, s13 mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s26 + vmov twiddle, s14 mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s27 + vmov twiddle, s15 mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv @@ -538,7 +540,7 @@ small_invntt_tomont_asm: load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 // NTT on a0, a2, ..., a14 - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2 + _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2 // layer 4 - 1 // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) @@ -602,7 +604,7 @@ small_invntt_tomont_asm: str.w poly7, [poly, #6*distance/4+offset] str.w poly0, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) - vmov tmp, s12 + vmov tmp, s8 cmp.w poly, tmp bne.w 1b @@ -616,9 +618,9 @@ small_invntt_tomont_asm: load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - vldm twiddle_ptr!, {s21-s23} + vldm twiddle_ptr!, {s5-s7} - _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2 + _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s5, s5, s6, s7, twiddle, q, qinv, tmp, tmp2 vmov.w s2, poly movw poly, #:lower16:5585133