From ffd97e97cbc9d30fe9159955d04b04c24213ee84 Mon Sep 17 00:00:00 2001 From: Martin Othamar Date: Sun, 9 Jul 2023 20:47:59 +0200 Subject: [PATCH] More vectorization of Shishua, .NET 8, use standard intrinsics API (#1) --- .github/workflows/build.yml | 2 +- .vscode/settings.json | 3 + Fast.PRNGs.sln | 50 +- README.md | 8 +- .../Fast.PRNGs.Benchmarks.csproj | 2 +- .../Internals/ToDoubleBenchmarks.cs | 30 + .../Fast.PRNGs.Benchmarks/PRNGsScaling.cs | 56 +- .../SimpleBenchConfig.cs | 13 + .../Fast.PRNGs.Benchmarks/ThroughputColumn.cs | 41 + global.json | 7 +- img/perf-scaling-2.png | Bin 0 -> 65195 bytes src/External/Directory.Build.props | 8 - .../RawIntrinsics/AVX.ManuallyAdded.cs | 25 - src/External/RawIntrinsics/AVX.cs | 1336 ------------- src/External/RawIntrinsics/AVX2.cs | 1726 ----------------- src/External/RawIntrinsics/FMA.cs | 326 ---- .../RawIntrinsics/MMX.ManuallyAdded.cs | 11 - src/External/RawIntrinsics/MMX.cs | 65 - src/External/RawIntrinsics/Other.cs | 101 - .../RawIntrinsics/RawIntrinsics.csproj | 8 - .../RawIntrinsics/SSE.ManuallyAdded.cs | 11 - src/External/RawIntrinsics/SSE.cs | 766 -------- .../RawIntrinsics/SSE2.ManuallyAdded.cs | 17 - src/External/RawIntrinsics/SSE2.cs | 1714 ---------------- src/External/RawIntrinsics/SSE3.cs | 100 - src/External/RawIntrinsics/SSE41.cs | 525 ----- src/External/RawIntrinsics/SSE42.cs | 15 - src/External/RawIntrinsics/SSSE3.cs | 148 -- src/External/RawIntrinsics/Types.cs | 178 -- .../RawIntrinsics/Utils.ManuallyAdded.cs | 7 - .../RawIntrinsicsGenerator/Generator.cs | 556 ------ .../RawIntrinsicsGenerator/Program.cs | 16 - .../RawIntrinsicsGenerator.csproj | 16 - src/Fast.PRNGs/Common.cs | 36 +- src/Fast.PRNGs/Fast.PRNGs.csproj | 6 +- src/Fast.PRNGs/MWC256.cs | 10 +- src/Fast.PRNGs/Shishua.cs | 130 +- src/Fast.PRNGs/Splitmix64.cs | 10 +- src/Fast.PRNGs/Xoroshiro128Plus.cs | 11 +- src/Fast.PRNGs/Xoshiro256Plus.cs | 10 +- test/Fast.PRNGs.Tests/Fast.PRNGs.Tests.csproj | 2 +- test/Fast.PRNGs.Tests/ShishuaTests.cs | 21 
+- 42 files changed, 298 insertions(+), 7825 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 benchmark/Fast.PRNGs.Benchmarks/Internals/ToDoubleBenchmarks.cs create mode 100644 benchmark/Fast.PRNGs.Benchmarks/SimpleBenchConfig.cs create mode 100644 benchmark/Fast.PRNGs.Benchmarks/ThroughputColumn.cs create mode 100644 img/perf-scaling-2.png delete mode 100644 src/External/Directory.Build.props delete mode 100644 src/External/RawIntrinsics/AVX.ManuallyAdded.cs delete mode 100644 src/External/RawIntrinsics/AVX.cs delete mode 100644 src/External/RawIntrinsics/AVX2.cs delete mode 100644 src/External/RawIntrinsics/FMA.cs delete mode 100644 src/External/RawIntrinsics/MMX.ManuallyAdded.cs delete mode 100644 src/External/RawIntrinsics/MMX.cs delete mode 100644 src/External/RawIntrinsics/Other.cs delete mode 100644 src/External/RawIntrinsics/RawIntrinsics.csproj delete mode 100644 src/External/RawIntrinsics/SSE.ManuallyAdded.cs delete mode 100644 src/External/RawIntrinsics/SSE.cs delete mode 100644 src/External/RawIntrinsics/SSE2.ManuallyAdded.cs delete mode 100644 src/External/RawIntrinsics/SSE2.cs delete mode 100644 src/External/RawIntrinsics/SSE3.cs delete mode 100644 src/External/RawIntrinsics/SSE41.cs delete mode 100644 src/External/RawIntrinsics/SSE42.cs delete mode 100644 src/External/RawIntrinsics/SSSE3.cs delete mode 100644 src/External/RawIntrinsics/Types.cs delete mode 100644 src/External/RawIntrinsics/Utils.ManuallyAdded.cs delete mode 100644 src/External/RawIntrinsicsGenerator/Generator.cs delete mode 100644 src/External/RawIntrinsicsGenerator/Program.cs delete mode 100644 src/External/RawIntrinsicsGenerator/RawIntrinsicsGenerator.csproj diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ae0c2b8..ba390cd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -18,7 +18,7 @@ jobs: - name: Setup .NET Core uses: actions/setup-dotnet@v1 with: - dotnet-version: '7.0.x' + dotnet-version: 
'8.0.100-preview.5.23303.2' - name: Install dependencies run: dotnet restore diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..2ed6265 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "dotnet.defaultSolution": "Fast.PRNGs.sln" +} diff --git a/Fast.PRNGs.sln b/Fast.PRNGs.sln index c78b6ae..9077a5f 100644 --- a/Fast.PRNGs.sln +++ b/Fast.PRNGs.sln @@ -7,58 +7,30 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{B56AF188-D99 EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{82A9760F-251B-4220-9263-153755FA2EC3}" EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "External", "External", "{12A7C294-6EF5-4FDF-A2BA-A01E320B9C36}" - ProjectSection(SolutionItems) = preProject - src\External\Directory.Build.props = src\External\Directory.Build.props - EndProjectSection -EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RawIntrinsics", "src\External\RawIntrinsics\RawIntrinsics.csproj", "{BA5145CD-6180-4BA3-817F-197158280327}" -EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RawIntrinsicsGenerator", "src\External\RawIntrinsicsGenerator\RawIntrinsicsGenerator.csproj", "{A161A378-55BF-48D2-84FF-DA3F09EA5258}" -EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "_files", "_files", "{3D9E2A5B-D3F0-49AB-BEC3-647C5063537C}" ProjectSection(SolutionItems) = preProject - Directory.Build.props = Directory.Build.props - global.json = global.json - Fast.PRNGs.sln = Fast.PRNGs.sln .editorconfig = .editorconfig - .gitignore = .gitignore .gitattributes = .gitattributes + .gitignore = .gitignore + Directory.Build.props = Directory.Build.props + Fast.PRNGs.sln = Fast.PRNGs.sln + global.json = global.json EndProjectSection EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Fast.PRNGs", "src\Fast.PRNGs\Fast.PRNGs.csproj", "{AE271FFA-B5D2-40D8-92E4-71D970142F6D}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = 
"Fast.PRNGs", "src\Fast.PRNGs\Fast.PRNGs.csproj", "{AE271FFA-B5D2-40D8-92E4-71D970142F6D}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Fast.PRNGs.Tests", "test\Fast.PRNGs.Tests\Fast.PRNGs.Tests.csproj", "{732E59B8-C209-495B-8608-77E746A68F22}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Fast.PRNGs.Tests", "test\Fast.PRNGs.Tests\Fast.PRNGs.Tests.csproj", "{732E59B8-C209-495B-8608-77E746A68F22}" EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "benchmark", "benchmark", "{089CE6DA-C860-48D3-95D2-353C7A71C9CD}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Fast.PRNGs.Benchmarks", "benchmark\Fast.PRNGs.Benchmarks\Fast.PRNGs.Benchmarks.csproj", "{2A875B02-B84C-43A3-BF16-593F5E6276BC}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Fast.PRNGs.Benchmarks", "benchmark\Fast.PRNGs.Benchmarks\Fast.PRNGs.Benchmarks.csproj", "{2A875B02-B84C-43A3-BF16-593F5E6276BC}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU Release|Any CPU = Release|Any CPU EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(NestedProjects) = preSolution - {12A7C294-6EF5-4FDF-A2BA-A01E320B9C36} = {B56AF188-D999-4444-AE68-4971A573FAA4} - {BA5145CD-6180-4BA3-817F-197158280327} = {12A7C294-6EF5-4FDF-A2BA-A01E320B9C36} - {A161A378-55BF-48D2-84FF-DA3F09EA5258} = {12A7C294-6EF5-4FDF-A2BA-A01E320B9C36} - {AE271FFA-B5D2-40D8-92E4-71D970142F6D} = {B56AF188-D999-4444-AE68-4971A573FAA4} - {732E59B8-C209-495B-8608-77E746A68F22} = {82A9760F-251B-4220-9263-153755FA2EC3} - {2A875B02-B84C-43A3-BF16-593F5E6276BC} = {089CE6DA-C860-48D3-95D2-353C7A71C9CD} - EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution - {BA5145CD-6180-4BA3-817F-197158280327}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {BA5145CD-6180-4BA3-817F-197158280327}.Debug|Any CPU.Build.0 = Debug|Any CPU - 
{BA5145CD-6180-4BA3-817F-197158280327}.Release|Any CPU.ActiveCfg = Release|Any CPU - {BA5145CD-6180-4BA3-817F-197158280327}.Release|Any CPU.Build.0 = Release|Any CPU - {A161A378-55BF-48D2-84FF-DA3F09EA5258}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {A161A378-55BF-48D2-84FF-DA3F09EA5258}.Debug|Any CPU.Build.0 = Debug|Any CPU - {A161A378-55BF-48D2-84FF-DA3F09EA5258}.Release|Any CPU.ActiveCfg = Release|Any CPU - {A161A378-55BF-48D2-84FF-DA3F09EA5258}.Release|Any CPU.Build.0 = Release|Any CPU {AE271FFA-B5D2-40D8-92E4-71D970142F6D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {AE271FFA-B5D2-40D8-92E4-71D970142F6D}.Debug|Any CPU.Build.0 = Debug|Any CPU {AE271FFA-B5D2-40D8-92E4-71D970142F6D}.Release|Any CPU.ActiveCfg = Release|Any CPU @@ -72,4 +44,12 @@ Global {2A875B02-B84C-43A3-BF16-593F5E6276BC}.Release|Any CPU.ActiveCfg = Release|Any CPU {2A875B02-B84C-43A3-BF16-593F5E6276BC}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {AE271FFA-B5D2-40D8-92E4-71D970142F6D} = {B56AF188-D999-4444-AE68-4971A573FAA4} + {732E59B8-C209-495B-8608-77E746A68F22} = {82A9760F-251B-4220-9263-153755FA2EC3} + {2A875B02-B84C-43A3-BF16-593F5E6276BC} = {089CE6DA-C860-48D3-95D2-353C7A71C9CD} + EndGlobalSection EndGlobal diff --git a/README.md b/README.md index 0cab574..638bf73 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,5 @@ NOTE - MWC256 is likely poorly implemented (it is supposed to be faster). As see This is clear from the generated assembly atm but I'm not sure why those branching instructions are generated. `UInt128` support is pretty new so maybe there are some inefficiencies there. 
-![Scaling iterations](/img/perf-scaling.png "Scaling iterations") - -#### With hardware counters - -Instrumented with more diagnostics, including hardware counters - -![With hardware counters](/img/perf-hardwarecounters.png "With hardware counters") +![Scaling iterations](/img/perf-scaling-2.png "Scaling iterations") diff --git a/benchmark/Fast.PRNGs.Benchmarks/Fast.PRNGs.Benchmarks.csproj b/benchmark/Fast.PRNGs.Benchmarks/Fast.PRNGs.Benchmarks.csproj index 03ae3ed..deed779 100644 --- a/benchmark/Fast.PRNGs.Benchmarks/Fast.PRNGs.Benchmarks.csproj +++ b/benchmark/Fast.PRNGs.Benchmarks/Fast.PRNGs.Benchmarks.csproj @@ -2,7 +2,7 @@ Exe latest - net7.0 + net8.0 disable pdbonly true diff --git a/benchmark/Fast.PRNGs.Benchmarks/Internals/ToDoubleBenchmarks.cs b/benchmark/Fast.PRNGs.Benchmarks/Internals/ToDoubleBenchmarks.cs new file mode 100644 index 0000000..fbf1e22 --- /dev/null +++ b/benchmark/Fast.PRNGs.Benchmarks/Internals/ToDoubleBenchmarks.cs @@ -0,0 +1,30 @@ +namespace Fast.PRNGs.Benchmarks.Internals; + +[ConfigSource] +public class ToDoublesBenchmark +{ + internal const ulong DoubleMask = (1L << 53) - 1; + internal const double Norm53 = 1.0d / (1L << 53); + + [Params(31512512431231UL)] + public ulong Value { get; set; } + + [Benchmark] + public double Original() + { + return (Value & DoubleMask) * Norm53; + } + + [Benchmark] + public double New() + { + return (Value >> 11) * (1.0 / (1ul << 53)); + } + + private class ConfigSourceAttribute : Attribute, IConfigSource + { + public IConfig Config { get; } + + public ConfigSourceAttribute() => Config = new SimpleBenchConfig(8); + } +} diff --git a/benchmark/Fast.PRNGs.Benchmarks/PRNGsScaling.cs b/benchmark/Fast.PRNGs.Benchmarks/PRNGsScaling.cs index 1ce48d5..2e2d785 100644 --- a/benchmark/Fast.PRNGs.Benchmarks/PRNGsScaling.cs +++ b/benchmark/Fast.PRNGs.Benchmarks/PRNGsScaling.cs @@ -1,22 +1,31 @@ +using BenchmarkDotNet.Environments; +using System.Runtime.Intrinsics; + namespace Fast.PRNGs.Benchmarks; 
-[Config(typeof(Config))] +[ConfigSource] public class PRNGsScaling { + private const int _iterations = 1 << 17; + private Random _random; - private Shishua _shishua; + private Shishua _shishuaSeq; + private Shishua _shishuaVec256; + private Shishua _shishuaVec512; private Xoroshiro128Plus _xoroshiro128plus; private Xoshiro256Plus _xoshiro256plus; private MWC256 _mwc256; - [Params(100_000, 1_000_000)] + [Params(_iterations)] public int Iterations { get; set; } [GlobalSetup] public void Setup() { _random = new Random(); - _shishua = Shishua.Create(); + _shishuaSeq = Shishua.Create(); + _shishuaVec256 = Shishua.Create(); + _shishuaVec512 = Shishua.Create(); _xoroshiro128plus = Xoroshiro128Plus.Create(); _xoshiro256plus = Xoshiro256Plus.Create(); _mwc256 = MWC256.Create(); @@ -25,7 +34,9 @@ public void Setup() [GlobalCleanup] public void Cleanup() { - _shishua.Dispose(); + _shishuaSeq.Dispose(); + _shishuaVec256.Dispose(); + _shishuaVec512.Dispose(); } [Benchmark(Baseline = true)] @@ -38,10 +49,30 @@ public double SystemRandomGen() } [Benchmark] - public double ShishuaGen() + public double ShishuaSeqGen() { for (int i = 0; i < Iterations; i++) - _ = _shishua.NextDouble(); + _ = _shishuaSeq.NextDouble(); + + return default; + } + + [Benchmark] + public double ShishuaVec256Gen() + { + Vector256 result = default; + for (int i = 0; i < Iterations; i += 4) + _shishuaVec256.NextDoubles256(ref result); + + return default; + } + + [Benchmark] + public double ShishuaVec512Gen() + { + Vector512 result = default; + for (int i = 0; i < Iterations; i += 8) + _shishuaVec512.NextDoubles512(ref result); return default; } @@ -73,13 +104,10 @@ public double MWC256Gen() return default; } - private sealed class Config : ManualConfig + private class ConfigSourceAttribute : Attribute, IConfigSource { - public Config() - { - this.SummaryStyle = SummaryStyle.Default.WithRatioStyle(RatioStyle.Trend); - this.AddColumn(RankColumn.Arabic); - this.Orderer = new 
DefaultOrderer(SummaryOrderPolicy.SlowestToFastest, MethodOrderPolicy.Declared); - } + public IConfig Config { get; } + + public ConfigSourceAttribute() => Config = new SimpleBenchConfig(_iterations * sizeof(double)); } } diff --git a/benchmark/Fast.PRNGs.Benchmarks/SimpleBenchConfig.cs b/benchmark/Fast.PRNGs.Benchmarks/SimpleBenchConfig.cs new file mode 100644 index 0000000..c996b37 --- /dev/null +++ b/benchmark/Fast.PRNGs.Benchmarks/SimpleBenchConfig.cs @@ -0,0 +1,13 @@ +namespace Fast.PRNGs.Benchmarks; + +internal sealed class SimpleBenchConfig : ManualConfig +{ + public SimpleBenchConfig(ulong? byteSizePerIteration = null) + { + this.SummaryStyle = SummaryStyle.Default.WithRatioStyle(RatioStyle.Trend); + this.AddColumn(RankColumn.Arabic); + this.Orderer = new DefaultOrderer(SummaryOrderPolicy.SlowestToFastest, MethodOrderPolicy.Declared); + if (byteSizePerIteration != null) + this.AddColumn(new ThroughputColumn(byteSizePerIteration.Value)); + } +} diff --git a/benchmark/Fast.PRNGs.Benchmarks/ThroughputColumn.cs b/benchmark/Fast.PRNGs.Benchmarks/ThroughputColumn.cs new file mode 100644 index 0000000..be54912 --- /dev/null +++ b/benchmark/Fast.PRNGs.Benchmarks/ThroughputColumn.cs @@ -0,0 +1,41 @@ +using BenchmarkDotNet.Running; + +namespace Fast.PRNGs.Benchmarks; + +public class ThroughputColumn : IColumn +{ + public string Id { get; } + + public string ColumnName { get; } + + private readonly ulong _byteSizePerIteration; + + public ThroughputColumn(ulong byteSizePerIteration) + { + ColumnName = "Throughput"; + Id = nameof(TagColumn) + "." 
+ ColumnName; + + _byteSizePerIteration = byteSizePerIteration; + } + + public bool IsDefault(Summary summary, BenchmarkCase benchmarkCase) => false; + public string GetValue(Summary summary, BenchmarkCase benchmarkCase) + { + var stats = summary[benchmarkCase].ResultStatistics; + if (stats is null || stats.Mean == default || double.IsNaN(stats.Mean)) + return "?"; + + var gbs = (_byteSizePerIteration / 1e9d) / (stats.Mean / 1e9d); + return $"{gbs:0.00} GB/s"; + } + + public bool IsAvailable(Summary summary) => true; + public bool AlwaysShow => true; + public ColumnCategory Category => ColumnCategory.Metric; + public int PriorityInCategory => 0; + public bool IsNumeric => true; + public UnitType UnitType => UnitType.Size; + public string Legend => $"Throughput in GB/s"; + public string GetValue(Summary summary, BenchmarkCase benchmarkCase, SummaryStyle style) => GetValue(summary, benchmarkCase); + public override string ToString() => ColumnName; +} diff --git a/global.json b/global.json index 3672f82..717bcdd 100644 --- a/global.json +++ b/global.json @@ -1,6 +1,7 @@ { "sdk": { - "version": "7.0.201", - "rollForward": "latestFeature" + "version": "8.0.100-preview.5.23303.2", + "rollForward": "latestFeature", + "allowPrerelease": true } -} \ No newline at end of file +} diff --git a/img/perf-scaling-2.png b/img/perf-scaling-2.png new file mode 100644 index 0000000000000000000000000000000000000000..89561cbbaf642ba129775737e12b4a421b850571 GIT binary patch literal 65195 zcmeFYWmuG58}}=s14^p5Gz^G{bV`Q|N=gVwNDBvdJs?5<&M^GD9|;}c?tqOj(ZKFrSq%pgwB5E>ehKNLu}R4 z$=7CY<<)##^Yuo+k9hUuE77We*ebmkCn28%6~JiF@F0nnkenEo`0jJ|3`o`74KfeB zw79gCny@pQx|pztFirFF$jz&+|G8B2bIGIOXMKKFR#wjgIzM6%?w`LR)9c(w{(g-) zkO_34`p>69YU=`z|MOvful@6X|B+55a!uy1Hhwph{j4YQpEl|IWZs6+|JOObQAz)& zm1j={!~WCC8|ME#;ZFxD>Zz(06!MJp!ch>^@n08nmBrjMT9!oQI9{JuYcFaPtTFb0 zfbl;~?S$usg@vWAZqmk!o_-DW-cPmmYD?k2E$Ax1en7ylmRx&jR^vT=m*0DU3Jryh z1CXx=vJR%LH5xv^N-iz`bA_4VgFIpJZ|}9(Hch6E^wee}J8I+pe%X+r7y5 
zOU(7de5j4=fQsvzX{>|m(6qyf%>P`-8*gryfW^eD)Lw<3d z7EzZks`{l}bo4kRslkNQb>SC1qytNlA8gq*Zn@19R3dsMoNcL(_Kq1n&LwTi!XL6l z=Z3>yZoD1<(H#HhDvgL6)d)5LgxFLR8_66QE6frirHz*i2;(-oFXJ6lELyGZ*uYt* zTw&KWttOsOKu7z_FmZnI{c#tm&963c!_N_R?GpY*y|@no9ZmBN_Mrl?qmfGqi^5l@ zt3S9?&Apb;x0(adx2RFwKo99LkD9hPev?iwg-#;JdeciD<K;_nPk7LpLZQ$TUYTqq>#){^cbgh21?#giNlD{rgk)2BHy18=0 z9Zj7_5cd2wu7mH=t8zoNB*U)`@i!blMB~@ifjH}AYRy4xr$Iw z`-e{{+6d}r!~7+774@bUmV>>9FNS#D=E;)C42SpV{fe_GYYd(h6wr+?>KH4qn3<`} zzsUbAXdUGjz!PuS>Ej6KZm_>i23)pA#;m8tqE9ycb@8sr2kk4fH_W|&AZ%+v3Npi|usngXB~%2Pd$=t2*Nz(P$llPKf(F;%1NP{1Pj6vMbJ)07sxq ztmcURV62K11&tI~9axk*vN+l8Qo3mqG#Qu-oArzjPxEwWN;I*kiVFO9NbDqi3(LA- z&D8E$&~6lp7_rOvJ;Bfqq2S5uOzTMa$!5XsFw3own_c>hp2WPf(%I(9DuS;GK1}$@ zeGX0O?IT>V_QWe-*r0U7q9W5l*la-T_QZg%qPR-r-?{YOTg{28%kgxB4Y7LcZM-t6|0M%eviWXsaz>GlD5d7jn z%icFF>1xm5npms~Z`=;b8O6B;a<2HRtwVL%(tS%%)Z`~j6$|m&1WoN0) zd$aWsN#HTPUhxT-zgeT=9jsUWzXmN;zff3!W%c{iWmZFR`s9}ahBL%Tr^OZ?lN>l} zY^DIix^5aVzm>^L^s964Z!J?{X_Ys*(|wXx-De?ZNR#ck`c*GbsXS8g)-@@^ctsF= zYK182!rZ7pFQDR`!oT;I75#TK^jeNtcHR~E!cOICFB_}-*U*XjgF+FXpw~zd$%>4H;nbOnsxC|x@n6;S{U;_HXB4i< z@V;pLf-Vt)Z^TxzIwPw@o&~1<+u~2q|0DvruIVGZe-lH{=-2<}iQ)hAO!WVsaFp}9 zYcy#wP-&ah>&aQ`Ba~BJT5`+Ke${;J>QlOaR9C9<*Q6*j7D?l20L<+f$ zv}X2t0>X{-4{)KnS%mP_SvFTq>0Z=Oe$&_q_E1VA4!uYCt;K!%1@gd1Xb}=VFzxPF zrlfn5#!||spyp`GN3eH9V#5c zf#&B{X0%UeixmGPeLDd2#LCN08 z0|gtjXti5H^3mmkF8zmxsC;Q_7aSM|S;*(Yg71{wq`rr|7Da1=B!b($u%?Zg`~vX&FPd+V{%vjOC0=EXkO z87ufs2Zt}AF?b^9?}D|R9QEfTUx~GEvcct!-Z%HWy(|?`9^zk^-#McTMPDp!yoTorHW3=$Z28F!%;=wNTxxm)anSWt?Cs*OX-rE_J=lU(kv&g9@4S0 zypC=@Eh7+1RZfNGdyey%6nvo%D}Tj#m#bggX~oYAJ7)A(*JTtHww~V~m%cbMf-LMfE8HRrk2cug5f?Ie z?(zHuC8Fuc2~T^`jC8~JQ!epSDMIN36#7(P$Q{KOKfW(8bIn}*gasNJ%5UG;eU}$i zmT!2@#NS)dg1=Fghj+$Yv^12WRSPu7b^dt7c_i-5GX+#?QNHp9b!;fS@{n6E zHRXb6$rt()KTA$0@tO_xS6oSo3)zPr_pstca;Oi9)EVX8H{hLaZ{^$)F+?mvwB)2`_885CC6dhvS9%#7z^%N)sGl*auS0CoMIub~;1(PQliSs7g z_eaLaN0-QFhaU>OwVpDKy*Q4wHS?^rucQ?4DH7$xS&c*yvTk;qvsVILMUoFrVH9aT zs($2zz4PljEOj3~aUK!801{^+J3g@Iq*I$ajjaRR&*L&UD+aCVv!gwy&hO*&tquP% 
z7a#e$%0o_g0Gr4l7n47OIJuEJB~1xp5gnGpiD1e*<+29HOms>HR(8Fnu?2@I+q>p^ z-V{);3V}Hev^#_aR@6ozD?MinFU!y@rubSO%+{Gf zZ6S1brMr@eJ9YDu-OIae$#`06TicCvy4#d{W+fQ4*+uD&VGeM)<-$_5!cM^ViX7MQ z)tdwTF@D-IESvzRV>C+}`q=Lf4{nWseu)D}$uB7Tk?OPg!E5vFsc7CE1werNJTJKl zHHENqy$3;G)@cZ1o=TGxCet2}rKsr$S_&Csn~&cY9y?d+9cxY(o~lk)VkMW$Qh`>$ z<+k~ewA52C?BH&m*HTH*AEfF2?5>qoQM_&FMLrh;&g%>$4AWiIKxdKWWS{Hi^7@Fh zz9%SkkHCN|@r%&&5Iov~AI;8Ps2DT6ShEK{BTd7L`? zkyz-pV`w=S2*nRB?ARl}0Sc^c_2i%Sj{CzU2TDlPm^WF^^{YJkV=0w~^hiOtqiLXB z3f#~-=j9r)iD_AflpJ=acTPd?Nbd`6RT6B~4oUrVyS5KAm`Ha1gdR!=i=5@#Gi_H@ zA{0BB;1(OXcc9HA2Ix@|fv*ZH9t^9HQN0jQv`QfHRpT3?r*fy|=zJ?w82u$Ljx@J( zt~D_)V{lNjrK}I0_UhDpL?Y#;xIG$t9-dzb2S6lR3#sugbFdW@H*k(6-N}jGT(0tm zIMJqgZgOVPg%PZ31*b{<@RUTzYeg+pc7%`!c_%eFE?+l0y$9ge*9LwE4u~#)HATfW z^qJPAT;YuW3eOGnNdGkD+35z8g}ddtJx&0So+y zEA8NO+6Mzl%l_Xts5{M9s^@d5gWPwa(SbumqG4Mc%DUYWotK1IJA&A~a`;j?(0UJ9^x?@Uy68bSWy#CsbWsWO{JmV z9?gjb&RJ%-$f^sI^!Kq+2vWArx1&f#TNrcuFi>7o2yavGs+7bE6-J zg;E;+a)BD+hTFHHd2-^Irm6ORW^U-RYjE}psB`zuPo1>IC z3*3bg?}xDN;2fJkMY-E(f0Fc5tc4z@ACyhY6yeuVKX&FJC;(cSS)TbESLJ>G^@a1=sAX10Sd8!&&hR6^OwAZV}CBb@pUW#MQaVsJ$^y&JQB| z=U{8NtGMIm{a1PE4O1kKfTn!RY*h;ddS!XYf^PvxgWiVQsd~}Y{5H{%3J2dfm6aJ0 zk`X$9%_%kh!M3)&Q0%kQX36_EWb)2FO*&@SOq<4NDN#YE5_bI_Scamu`Ax<55&&Kf@nRDK6mx0$HS;mgWT3X>^@>cvbS{b7jhk zHW@c2>Mzj_H%YLqjt9o(sp1WGisxx`n4Ztrpt3+S^kwGdKB+d=!=-!UeuaF-5G0<_ zCK=vF>3x^SH4kLe>|q@ZTz23L{w%@U8TYZRpdhJV3|4uFUmAsd$lD{Gb3rSH>o2#$ z1{5L3c3jGqznRX)8){)<|%Y zX*)8ant3)0*RSam_+;+$JdU2F}^jzEntMiM{gQNUNK+pj3LA5 zWgTYeREMm^EP~Q3D!QY|XTS`PG}Ai2sLHOvG8=tkq6>?*gHsgwo*RR zr~D&Rc(B!=dX&0_Dt=e=}qGHedTHEJ23pl-#uXwFZjBl zvx6AW8&grGF&NLAQf(4n%~W0WL>9EhV^uBfIqON`-Ul-v${6-)XhObxi%GT{^aV?s zy6oU&4ycB^$#be)Ny|h6pXC@dp=iRNmn~y2N zrI^Dho@A5Pf=fr3a$(#-9F}-KSkH}}W$)XZ^!H@_Sa1=fXl|0|TOV}ND zf6)Z@?e8G>PJ#|zz#45^I<1$`uj~$X+sgp<57_g_yV$*{Zx&B9(~hvu`(0AYGtI5`U6Iz;vZLx3|0rv(#zNRe^9m2Bv?J_VWG`NZO!ukxjX ze*f1STs|khv!TH<)C=TJFWvzq$PBtB%N<5L-vFM)8%*BW)}mJ z(;iXWJt~kKKX?+~7wvje^cK>Z?l&#QQ=8%a@|7j;^JG|d#YU*?>OLmIRzyjoOwquS 
zyqK!(05+B~EIMc!fr^h%yK69J+?SrJN|!Wcd3zHxD%F|L@ivQ`M7~7?7U=XF?wgyT zt`m~?t5{tWe=J00pT$+Rf_Duh=V&DQZrD!PTa?RVDg;|KudL7Y=eVmZ&L+-5Kff1Y zI0=2|U=agBCBe6%EK^5OnIUyLt5=#hflqu1g%qpo16prB&+?P$ak4WCf6W6TMMYb+KaTJd3K z;Mt|OR%uOX%os~Sc(!1X*hh!~VU+UaNAir#!DfSe8coltFS7~;5JHK>N5GG8JNzt5 zvaL>QLeAKNM`7463*0Wbf=Z!N7Ac#JW&HYGNQa-Xjk~r>M|3q8>3*}E=6(Sc=az`Y zt2x43el0j?Z9p#PI^YxbYpSOJ^Xj(vS)5bIQAdAT<1zHC9#lJmiM{h3S3A!!dE8ad z^ee}(r{4~AY2#A`NUkcnPs07$$KWG`VovPDJ0h+rwrl7?q_&xNLbpMRTy|e>>mW+qdnnZa0@-4*n{B;W`V%YntkxPF$>h?>jW;D~X(21zXLp4SS1;pqIA`1F&M2ou2)D zeuHVr4NJ9pTXmP7mp%wN>}QB(u1t!@HC4Se!)z73f7Bht^qD= z3=q}nLZmH!p>D?oDVroLOx}6s0kt(t!mavDlU;%1d1+helRkx|(|5Q*C9E9Y&`){c z$e%y1y}V5erZ17rS>v0-?djNv;9@PvVOk3T&rAqCa{t%^pVg?2|0fV3-fX9Td~Msp z8p1i^cq}&?SK!!p1u);oJ8}#y?ZsMr&ZZL{h*xAAbKkzK^3+E>+`i^bH#z{ zO6ECi6^}qV=^J5$*w|N_(UMFJZ;#SRw)&iOjf^7+lFMJCw~zE49ivNt%|#qjM8|Cg zA?KgT?|;_o34gHW;jz2!j$S%Iso{o(?}}(<=rjY*?&mo1+t$6esT373qq%m}pB)GL zoW*4vZKQ-NHw`!Fj1a8L&L>+jCQy6!Ip4ooMUM#gk6a1h^$bqyqcezyR_U$k&J%?G z_*6pveJ@H`nDLE`*42P@CSpBKBxa!GyME3sY{n*u20@kCC|htQDK&LWNvfL%t8r5C zI?3|v+P$mxFs>^#+p4I@Snz_<67Y*&O)1(yRXKI*<#{4zq1=B@FigPAKv2DDX)i5^ zyWle+zS+XIarfJs<;H}?$hPt>=St)Szrcb6I{ru2OWp-7q2>mX@FOL0PIayqTyhxh z(zSTg8^46*B;$F$*dCFm5=MXgSv64cn;Tqne$NZlqh14sRwR{wsBv8$iF}MRVDpl+ z^|0*NomY;sn?5@yyb@TRKH%!)^mCm@YY6|__8!@59ea8G1Y!>R&~SW&WTw*-68FRB zkKCWH=*7TK`|cYNCKSSF*5S_$FPBrnhYqli2c^3i$FG~c+sQC+c55I`BD&3$&^Tnp)TCU)I9>Gx86R0| zZz$uoEc0R<0`~rBaMnDS|3}d9aU}0UAJNZ)qk$dIigZA)U#$C!LgKBZXzWLHp_T=x zOTsWR5AH#>WgHM1Ftl{t%0OI zSlT(wQ^l({?pVH{g-&fu_4p1(m!uQ1zK{KO$Af7RugP}ljDPq-%*(@2T~TLq&mf$0 z0AePP-u!+VnXLOmK34;MyDKKz`&j?4!3=~>1SlxzAm($!m$_ zlwj?qB63E{%Zz#ZI)j$tuVWTr{8=ism`c;`^U7hCxWa&R?3G~zJ5AW714>f6<@`Nn zNnoBRB$P=bk2@MsDvev=c)KCJ);KN~lyP@0$)twWf92JVxgt5;v7ZSO`mF10^mBEe z?Hw3x{UdUw%ig;cGj7Y>W7X?Xr&5Nz{noV^gDn`0xWp9`nDRt_u6y8J`N1fYA8}Sf zIS)n~Kf@beai1eg`^C|z+%|3oP#nBxK*CV*R*nsi_!&`r(Q8P}PAIsgvg-Zs#zAu- z2~tMCTU~DJoG09J@ID(-tT}tL39s%QTyGccRPjLli9c0l_Ar2WUsgY=u`Xrp 
zRPJ#2<2=>yQg&{>->;Vva6oV9GB497O%hLyx=QTk6PC_e2*5rU`{d{fVX>#-@wNNN zu0NS<{=^znnLy3*6Nw2P+Bz8}5ZiA4YZ9>V`Cmnc?E^*$_($;hzw-~g_h}LEs6ZVu zrF7YX)_qUq{%n``t_O7sai)!X4_(qCz zAq)eFi)W*@H8=41)k(X-!|)0W;YW6!upQ)SRMnB{lWRTsITqrbl_>Fg>H^o>jdy0& zJNyei5R%))R$te$SDiMuFXrZu2mAv|$bKX3W7GAT+Cx&H8uXHJ%>&VfhN2Ms7sF_X zs-*&s_QEb=si|+1KOD=x{5t5Qjq%Fqm0DWK<w_K*LjAgUD`Zkb zmx39BJYtx2uWFvewX^v?bwZO-%`jY<$gOXfRJ?IC6ga)+RcU7R!TQ6?vk16FX|*xu z+4F2@U-XI#m2AnM`s|x5V%~4_^xWL!#H~vWPeXogDpv?EaH1lN= zf5H4_=rKJ}Oj|C3wkET9v^jJ^%bRR~*sYPGIG`&S` z+7AW3*t@I{qeuLoSP0bydh{afi07I^o@Zrhr zZaLIC3m9bnT*req1k7!puX5nDyoYm_s><%LTNHa*L!ERrNu;btm{N1DqwS8^kkXL; zW=W4}hvv+>{R;d%jZH#Frvml}T!o6K9W!<2AAOqrD zrzL#;z2XKeT}~Y&DVkxUttr++ZL;;Mf~9Ct7EcuqghvdLS!iNQO`Bbn@u{&~O5xLq?RAIO z>9-m40AE>clD8?wQ}kz#7q8-gp(pOVh18sjtoM(2ETe8L;zPUIehVpsG1%bY!T70t zWZ^T#u^_z@Z~INjCF3UBobzwzDaS_cO!i00KHMd-Bj*e(d-}i8EY4G^RDqP?u;XMu zmqsTeTc#wV63AV1|(?FZMl`kdgUk)L*!F@UJnzD2*_G`_ScJ>o#N%2b<3 z54O_|3-CvUk%s8S?h=O)q_-t&PqsF3P5plEvyNa+Y(Kf9vIOQs;p10pG#x1f2?PKY zfN%5q6R#xe#}q@g$5YL-EQ8H`0~tJO%K8Q$czXvQ);o<{gJ$Dt_$gR#K))t@MZhsJ ztJ(B;JY(^*CTGW#aepMg-YTAyPonl^e-x0Q##%xj3KT7BG4g>6_V43KHg+oAm_{4L zezY$gJT#p?g%c-lXvFdjRnnDm{*(YY4%#hcm}O&CIFLObo=d$6O@{2l+F!e4Gy=}5JJ?UKV~juz zC#V7gS_*z~WzGZgATmDOmsRTltgA3hH+>(oO8Kp1wBxm| zRou0mG3RjQH-DDbkUOUip9DK5`)eD~`Se2f0 zy;vK-Up3I-mj7}QzaParM&65E;TZqWHW0>YjyTcUcz8+ML1RX+&t}Cs#b4YsM$dX2?b_^C-HtmxqMkuCQ#{9HJ1$)=3$EKB9~> z?gxZTydV_3apTX)%azC^{J2U3VH7d@n#7k}z^JtQE9GdTAlo01FBfAh+X0_&$&h@U zUA0P|1_^S=!V*_(=N!Kqyzf`d-ci zzORSBN2lebxv-`RcYMyfj85t(^LWW9m;Up&E7q>8k7KM`YP>*#w}s>@liM#eT?QCx=c zw)aWQpDwyivs{C+yw)f3_^f~jvK@n9<#;Rt@b%g~1F($4+L{U{#1CbFPw2u#=&PFa zJ!&C)Znc{noMe~BrfP(~2WaS2wIJ?$ksYxh{%l)jfO6Foph$&Mykv;OQ+vkS+@(Zv zY#H`u+1s8b6BsU{d$9=6IC5zF8YrSEWpLylV8n-ud=0l-=E8l+pufz-trR`W!<106 zvObl0!1aXVGCRo$484La;}9-XAkaY_>yd>JNnLi?yho^1{QK%0^55}Lm9wvsmj}(t z_+YUzAAGmPk+S{VOD_(EOO0^?nP$W+s-7cI<#E)V@M6JuEs5VzXq7OOSF{%-gO*5K zJI-q=pS7OcGuJ@;6?n+y%+N_Sf0_TJc>bns$hA&-{)ReXS5yvVji?%Db;zQr3yMI2 
zU)sgXvfyNetPni?X6eC!&Pos(Ga`6N^2!v17@;9`8fWDKcQdJ=>GXUG9WiMHn#Cf5g^# zYgzb3dxY?Y0oX$Ge2Vj3$`~PJ9**{6ruIX3dXbhVVFILkbNfr~GvG9h*lVpz{{R#U zX?u#ifQ3S*SC5;vo4NFgyBorp)^ph8hT>n=es_OW%K8KY*A`7T%_1a{2ZNU9@K};i zix}Anh%E?^o51NOS_Gj-1!FBDrg)*UzLlEfnmP!4FUQxRTU{HsMe^ zvDM@ie&N-#mn>_U5b8SpxOu~m;qxwX;lfTFfV9Qt#iKV00~n4-W&7n_?7-Rkls9>X z>}RjO1)iHr#nAKzRKmXQfEx*q&=@c8`>9IZ7$9;pal>`P5+)|0ncLHkr?ASPGRnq$tVSi0{~(;Fok$bWtcmSO{9{9+|xLWQHZoK>~fi!tP7O^mnpT zrKHTzNqB9BreyvzaO1vDhziRhN}1S3(p-&W8vz8KYqtB@hrUO4I4m(Rd z(MKswQneVCbpgbe1VrNF7uPb(8(@#&qjLtd8}0kkV{??7x-C0h=1*&zHvhoi5hksv zZ+INW40zp%b1j~>ulZ=P1787OGvvtLh-MeIEgJM&u4e$`spgyN^lrnd8f43gNDsLke5c2p5~`>M9YD=JT`Uw<1WcMPUX^z zDEV7v{|*U>#CYpJfa&k6b)*AblDl^&P^%^tMpQh9t!X{@76ymEnUqN&4OuQ;xYP!i zuur)c@f*&Vo`(b8clzFnUv!eWcEK=A>A+kW@EbOv(Gjb6me%dJ&#arUS&e=SY^>#x zf_=F-E8%-V(^-YYu1ZeYdhns;Xo=lK0)?h9Yp=Y$nPnj@_M_Q1erw+5z!6b-N^9&e zBTBOqsXn6Qk!^2DIBjZPEzL;mGF9i-ZrX%4L1+3zK|*YIW3doiVm`%0R& z+M5VcD$@-8p>ySU-tf>?qy4>Kw2!Z7Mf`&=jzx1hPM?f3|7pTS%6~0Llz2uyU>a5K zb8gpk_HH6}mZ$z^mhbh``)cgaQseHWV!y23C6d0}9)T&Z${%cJ3g0rzTCpy!5Bai9u??|ATP8zb{k!kE%CS^Nz|X=7T~Fa9~-1V3g62-h2`GNm|FlB=}?LM(T@2`L~L{_hvWF}*-7 z+BN*)+h=D9*-usERMQ5A5xHZ$sYzNu&e557Wj(-_E zYe-xC+g?LgXtX`X>ue;WA%|Z{kmCh&3&ACST$>(w-ZA{!_Bog2b6lLjIB#jEP!((d^rtJe*#w#Vk# zRznxpeybc%zoi&5H76bZ)Y{{v%k}$2FyOtsm?qahMrhI_^3nXIeRuw zL$Q_$spI_Yfj;t2Ao*>Vq`@#H(IXwW{iTAnO;VA^!cYE0NC|AQ4)XdUFSe$ZVc_9V zKC#_tu*u^yh`KokFx5n)CUa@Sg!MPRNkmb1#6u zdT;6%0u%%yA0kEUZb{PE0sa`zBUyw)ct=(nGxlV#x9C(4=|f)~z=3Mbfjv@0T?U?7 z(Ey$6u6k>J>*%Hz`a84ii<)BcrO|MReZ%nH`K8-g=!+U&=9a?J#vOD@Ssx}9(uTuy z6)@Rc>Z?4l2fT578aMq3CP(~MN{TR6E$jx^mJ5!wM_9+&Rf9HUSMQZG0pr6VF5_VA z1doZ)rL~r<;q}Ff1suNmtaHTi(Gwe3{R>3Q8zM}$ud4|Gbw~v1nS3pItaZ%t_m~Tn zu?fpMT1&RHboLrC_RVn!%hK0}|Dt}26y?iB-T}BCS(>j@;2W>3A!gTKR0@j^tmSEh zpV==T8b(|g@Q!rQ-H%=R9r1y;2+g#ERGUp4AG}R#d;UcYJ5G$pUchU}OCt~H=13IG z5ra${*~UTOj}4>s8kF=kT%%hyCuA2BaMA{gJqFspJlU-v4M&DGa=nl0?WtDvXgqm_ z*Xas&YAnm`i-YY^H>greV_bH^P;=U74~L-%o?il>+|3(^1bVOcqpu)_ia5;Hq?M3@g`mOKS^FaXtl&-&t7%NKwIA0#3 
zZCE|^?E`exZRc|5ZH2Ub-Qtb>)fh<30AuR4DOtRVUGr~6N0o4R)+~yiJ8e7pFYzNz zx#oizB9eb2s{`-m#v$mfB<0O)EUS-yg|C*H4yx~R_ZsP@IcB#4x7i|)kH5&B>Rt4b z&z)Wkp2o(r#IetNzukOJsTjq3yXLryvkv)@L34f4|3w4~XP2b_&x6N36ZVc(M~S`5 z^Jo%O8=7E%DFpg6M}+~l|ORXA9Fb0(fF5Huz6ipR%bD$L*H zqy5YYjwyNAQ_5@#0YbZX+0>1?0|u8=RFIKhdV#I7(l+d@)IUEBr=?W0WuIV-c)`

M)F{%! zJb662%cp;#`XH%vG_GX?t5ArOyr)~P<&4jW zO7>AH&?3%27Jay~BVJx^H|!wSUDqSwpVmq}bsClEv=uNFyRAC9erHh#;8*A2ly?D;0CPl0dGjTpJG#I?$_o5c8Y0Q3C4?k$)Il@RwB1jzViX&@K9lD z`CG0Sl~LnwXZ3|0Fjd8L8@wiU%^pli5Tm3evcvWKtpkMGwltNMo`(2PBTcHMszlHZ zjtFi(z>O@~McPGv*aZD;E;H5L9vaPfL%;D>K#4%)Do^xJtW6R^AM-MaGi6L1lUpI1 z$)i5n(C!Wh?ILm}dI2beM8@IF~dFt}7)+b;X69^}uy_mw;Qe4uTtOzT@Ix zgNzQn8U)+^n>L>w(FCrp!h-cAG7-`f#0-}f%s8_SC_vuPMfyIGC+&R$^$?`pJgWG6{@a?>>Tn11LAua(5jdxAZ^ zdth}&$lf1-9gh{|%sCK6=}XSi`4m3BarSDK3NOQNiC_Ne_kFvwVtF__drZ^YJHB^`&93=)gH^(`~V<^ZPq#_m| zT^oY&GUn|oy|SH(T}qI_B*5C(?;|K~)o(Ru>l%eNt;#)rFrqMCyx~w3md{6ashG%F z>ZSILi1A@be$P=SYyAjAT&#;z29^2B&rwL$7}WB+{$J?R?}zJ^$Ws%8UxUB>Vn{%_ zg2#aUuB5+2(>|JKAp#8WvM1Y`r7c_7dm8(O>xrL46PtpZLl>;*))jz=iS+TMhheoo z(*Fr2#0E41wkrr+qQk=Fj!TDZ=Sy@%3t9-2!g;~5i zV5~Cuh1DrzYwRc^bl&#$5Ps0#70#^l+qG&Vq8-ZkS!Xv)t)nHVFscDS4(65_=@je| zm}*|QQE3^BAeUU7u1*J&>bhX_IrG$Y2ohxT*4%oL$8}*U`r6ts-SiSu!#njyCN0iq zqK#<1VAe%VDE>(GeqSvD{zdo>1_vCM9ErKLk<^^HEFMhqCoYVwCe47%Y-}8UXLZQ% z>?+tsUz5UV>cRcMr`r`K&Qc8SBIT7`)XlDONl*M|ZINs?ZN-ZRDS_55Nezh$-)pMt z7zVELic~yXkA;q#2ErD_!mT3A+ z{>QN|d&KnYdm3Ef7xgg0@vBX&Saw?%TjA1yeZH&+u~kXaA;GF~>OcA}h+L6r`<2>G z2+?OvnL6r)7B29H#+Js2&4%#VJ%sAPQU>yEU>}G4%irak>j1XVwceXfY4Y274kMb% zAi3k4JV(B(nu}RcxR+g=@XbuSnCO=1@i$eb=09=3dpn_(Ai8iD3<3#{;~0wl%dyj6 zGAVx`FS^k@y)>H0^jFy$BJ!WQN-kn8$cHWIv^tDwuvCUprkh%tMc?w=mm5$X@k`)S z1k8ZdtI)G)7DIGy?TM!B{jhPQ39+5eeY8p~4G(Yk`6}9x0{0 zm!)|vsF*PEq%UKhBg=zX4(PjD3Dp3tqBg^=PPaie0JtDcXnT?sMkHc@e&=*A_1lFx zD;)f|vH!?-$zX(b?wavBHc6m8zE34@?oN9hWvbmMA_Ei}$5fl}AeVs~3^%y{m8b_u zTY-8kKY5N(T%pT;ir#4sNWvzLA>0PIXn4pGAxsrsKP!ASW_bbLRNQhaH32<-(q(l$B_ljLi_ zYyr%_C~CE}wQ>}1E-$)J2#SO~!t6h+fLHZF*P`rgj#SyY3O-dHUJ_Y5QS!g4v(@9S zc6RfxY!^H0SsUhEN_XsHfi-P=7h;WbbQ-jp`(et821h)wEtB5U zEdTvHmj?3j`o%IFYnL=HeQ%ulzi#%EF_#rjm>t!hHUXBK%PU6y+RFwja!%hKF!OI2 ze!&{e-|b!e6h9=cd1Oz|GpF2kAF))?mx9?_2ovZhAIHAH>=;B2S(NSCW9*xsf+cI4 zepwxHGfThXfsaJ55zg+N!?PlWOjr^Z36RzG=%)Nh@8O5o$3&?M+iUk%8;+HV4v?;s z(w?uAe-|;gJeSv#XV}iosRLcBbG8R0{25I>HE)T;MpwN&K4`aGDb}in&+QQ=!Z&i2 
zn0TYNde25h3(UPi2)v99(vR#JE9JeR0xcISp*ENr{FN(sy2}#=V4vJ!~pH8+|BrC399bordr z)MAh@kxU60RW8o`#0A}8Jl}f_G}GAQ1c2xG#A$BDh}!F1z8aXisLdoLtZ6zzgIff< zhhd<7qvvyg#Nb2`WI;4M)SgIua`ilUai36YWTX4-Pu);<&$9&r8-$?t^_`tE)9zXf zY;K@O?mGPDU$Ii#19D48ZJT1HqrU;IuD5Fe#&v}q$293!7hmC9YxhR%aFeT;oL(92z1@TZcl!V(M#@BY%~ zBPG}~T7oe>__m-jeM+cXmj%4o*gU7&Mo?L8{WchM^?K(TQH2tyuTNUQu;WY#9oJuN z50uv7<)s5eBI1xEH^fV$qYycl;hkYzXKQxp zA1iNGYq1{i1Z<&3YL4y>g{i;LazF0^BR;p8e_gXVA=$h=SoCkcXDZpKB!FNAGrp=z zLl?ttR_8!BFVf7ziYnlf+w;+Hy)6p*={{+SmpZvupst4TxSKaex%9fDYj!x+K7L*c zggCq-)3EErOd2=Jcq{aU{*Sl9=GWf9ne`>j@%7)X#$lCUVf{KpzU|`P{8je&2}jt@ zsq;ZML>nq3`CpHi-0Uqs8NM;c_5We-t;3>hzkY8K1(gy-Izx~F6r(Z zQc~##=>`F5kQ_=nq+`f|ksOerh8SjF=zTxG`?sI{Jn!)yd+&eVs;5l zKI{8k3rgTqe6Qz9gdy(rs(mJ+vnt5_8|D-#EP?h0lIwzFzVXl{>qbmd)aDMs$g)+9 zl;)e(cpA76Nkx~R<~{OrDVGuDIq040Y`}C^BCRB*@0~E0TjX}awMpd#9izh0a?mAU z*h4J?Za&PxFIA&goF_#mj)tuAUYZY1-A!B=yyKXU%jV&pjvuGO%1JvaS)JcW?ony^ zWW(=Ke${G7sSd6CmZ+>Zy`bqF+>$$sjxuL2YzM5-k0 zSWUN9L8SQ3b*_%vz!NrmIT`!-VFgqXjjNW>DnWb?Z~uJJc{?4&HUb#~HmlfK2{w$+M=+MVsQshEI*i9B_jO`C zH^&0Twae75CO~Yq3sqyEb|nWuhbv@s07kO}o#Xdmm|4@jI%R>@lk!rpYWC%;HuG)>?Dx3oiYWy%jQm2 zeF^o)`9Uqo_T62E7^*CyFq)yM1dbkOB< zmc#GH)_k_sOym`V1oj3IFWhCNg!lp?s+OG`>#~MWu*Rc>12eK=L995nv4BSjTMfUT z_ACk>@>a?1ZJo>YjD3JMLE7wMQ-Y8d58SI4w3Ml**UJ>3x0unXB7px6f9O?r!A3w8 z8c>a0v>Q0Gv0ipJK9OISy;RW+uy* zdyPh4h_YD699K3@X}cxgg>oJH)Ij+>%IriRrbPhX)mFHQN)7h4@!A)_SsmX_{X1HAZ0rma z^LjF9JQ9e9w(>6 z&9r-hoCZCcK|bw?tV{d$?J-7#`R$St2Ungr#XDnWn>;YWdubFK0f=h;duAOorP+YL z-qY;8!t59N08}Yk#jFOqjnvW)8Pn6pG++i#gTN`L-#PW0&%r*;6m)mG~DJtCd&laytf0Zj8V>C-5!OnBx^{8>@Tpb^eS|M(> z!1jEC#2Ln*=jLQZ8!3``o0yz$rz0ZXpHDsNTmAdveu+sn z1ow1d>PdfDx*AON61b5e9WJubG~#v4b9s@@G_}Xr6xS4av_IVvB)3k{t5qKUnI)TA zWwvE0V7-R580Y0+lzjjtc-s(M_WR(wBrsQbM@CBh}X4q^~W`>ggz z=l3c3u(0l(PlN~T6y{feDquEP4(93i>s`DRxZmr7%{+#vY>&GYz&m(QX4%y~$R$vL zK0JT2$L0tS27e8&k3IJ9Bjs_{a2ibmHr8GzzdEgLH}-n>AV3kPDnFxfokIWCaW@A} zbN*E{BxFj_jk`+tagV#1Q(Nvam!AhG>YFzN1`j*|1-N1{i`wjt#kZIvEkPOuub{*1 
z_tSW5Rt0N&Wty99G~o`w5}PYN!zxSB1Fax2(%#+b**Yj`T9zJrv9KtHKzgGfVlQ2xYOX*OFVQzju7m zYk6&L=*{mVQr+iKd(Kyhxry&<-!nJh6zU^Kft`VinKwfrecPCZM)pa>s^A^W2Qm$-lmz#WIzQf7EKUlj!L;cTcp;0J*` z&!wALWvaZ}ba9K(P$QA$a-fBtpG&xSPTiQAPXdIfM z>)@olC+0Ahotvmwam@SG*8vBrnBi`mY5VAW(;@x0wS_zf!A#WWczZJ7 z$Jw7M<&*Irk&J}lhx;+@zOu&r@3X#owp5S=h{=Tm1!dROobqZ$##k);Zn)}z_WVNB zF_9VfkM}8^fOSNwIk%;!?8-bs9WRi~E#;Lb5-H#!(9n{LG+6eM=f6f(x`GBAT6FqW z{29xAJaF0`;36x`^3A-bzq#KYzS%ttbpdO3MJ2>2K7z-FtcG6L%}lJhSYgcDhFV@@ zd>r4V(lvJ#$LrIltLFHxHJd93Ns)+|aN7>(!GI^tt@sDL2GOr*;%hYfzm>Pj8x-9{C;{}^9-0f{pUIG%uT)4$3!M3^W|EEGujS3FM7Lt$u`h`vxBuSusxmSkBj_z}oy+(U zwA%0F5nS7*zOyQIQckZQfU9DkSsJzPUy)XV}T_CXG3!%Lc}p zxoOgWun#}ezjs*-c8U5&f5tbE|DDE;pc2()DuCIDY69UssQf?YTL4HL=aav>atoFgH-B1+8TPgq_sU_ao4t7NZTtVKKC>7l`wk};1;>+rOKLR&J}muC0DtA2&$@0TP-2U^>w9=96!VC+f`3XCrcEjC|9PVk|8q zyt8phbR3r3E*k1%*yE?se?`dt@v*5X*XR3ALqfaJoL8<55aHi40XINFu=^iMfk^+- zH0KvdP-b32o{OuJchg=a&+ZlF!Nut4dclEeW_47iQGI&j!hCvD3$f=oIv=-!{IHOw zW!7f*6aF+W;KVBK5j%0L2E!Ui>ybyMS5*Ge7bp+ZVa)<$=th3YB9AxgkKcNTM>8(Q zuRYp<=Q~w2_p>~F#6Pf5P!&AOD|TfxlU=y?FDP`eQx}qH=>Q(yjdX4thzNV}F5=Sr z4-vX)w)VG1;b`@D4S>a4vrk0)x2_-s>ejTqXX_%-V7dG{4!D3$u6M~IW#_UZ&6WGy z#_Bf`TN|atqvjw3+c#FGO0AHJw|3939&3$clK%w^YDGWRuZgPl`wfEJrlRPk$Dv=Z zAacFIbVmyY?c057^^5EcIPAXh>-aShGO{fc5PugCL{Svc(jBx;21(GF*)`}V%Su-= zGb%!YIg;Im9?pebn>QDl(EU=iEeV*7K^0n?7QMtMuWdP|NNGk5-DGkRoF@H9xDd5{ zk#}2cce$4oNNuB<)*3J2&Ojr!MTX#TjMeB2y;!fx(r>ZE3zOjQ4uK4qP7KX9?xk3a z?kApl`$bZ>ac`pwgGX#3JCksN`E@-?Eg89@fNmt(-Pk>eir?s4s;Z@=F^Q3Elxxo@ z%A7r!+C1Kg8hv&qAc3lOT)T$T^IKv;7`#YF@Xj@LbYaABc5-3O#bYaPcQD|7MlZy~ zFp19H=NPuX_@=up>8#yg)2-et&SY^PYdDB+v2}JN4;3}gT`)BbT#YdifkreRo7Oyt zlVW{bI^CGNuQcZyHn^`k2upB4_qL4gXHYfwH=9orglN3S1Ak|tZy7;p+^V^avS1*c ztmEzLujUc}`?H<}Z8V>%(xYt~roIbJUbJ-+t5@v;Wl&_w@4llZkrUV;k?W%~LWNmos6v)s`~xbRe(KZ~fhbF|;4QCMSfvAf7MF zKR2g6qCK_(591o3|Dmg$shLhZ-=K?<{pE5ouM(YMp-mIojWFXORo8blH&_Kq!ToELSJ^dLlWu{B?*Kb(&(3FI$Pr0^ znfilCFV|NJyW@3s0SeHel8CB$g52-ll00Kl+R<(pI9j_`cGZ^(N=_hJeb000uVdw8 
z|4`b?8im?GWaI8|4&M$<`tIPW!1?h?!FkW2wvTj^;y);o*}o_fqP%uNuBG;}Z?2*F zJa?t*OlyvSHtX6z$ZAD6zUj!uk_^v4l!@t;E?P|OA14F&oA$eH_idBkI$tx?sWv0V zVGLfLtbY*xf|_f;9%W!qMHe7sO3wZ(-KfdtEL^GJ%gB}Ur?jhgMCtTu3s;A;Tk!E44wFL990!{DY|N!>l)g z!bapj0jKT*A^~g`pM^8CG{21$;v8Iyz==P(G*9$cE`-6@$e|-#Y~NH$KJETaNgGZw z`-9UCHz@iV4&$ilkUhFf$>mU3nY;INIL0e=t_LE}3gR5@BB*ti{-u_AKC3!x!c~Xb zqjq2vB*AfrA4m`DIs6%9fx4_`JnR47rs+Hyg|{MzJ)E}|x2v42>E}Oe7%-S^uph^v zcg%p@$VldTG%URO4?xww9DnYxVCrn_@1NUFpQL$n|Ip0)Rfe0Ff#TfT7{d6R#ev7S z74Pjn_m-oofwB{&)s^VkAze7dtFyIU{k@j+25Y7^^YbZaXqJ{eyv*Cekc%me$H5xs ziOoYI{0%z$*cCxD(xT^Jj}1NDRvR7}dfuPp_<~!d=+m(t0h9ohHxC6goGSG;4!aXJ zwFV=xKv6+jO1re?FClR#fps_8bQxDF(446Ic5+@U?W1C~zZi{0zqVFq7eJ=E*S2aE ze13FSx)~erng^g_>j2}!YBwp4{LfO$8>Kq1Y8v}zvxELp$6I_IE0dE!%=2xt9gi~) zgF9NpomX!7awk*2vQP9M3S_y7t()r7{SRc-Mr<*B181SnBwEw(SD&Agmh}$_a%5yV zb%YuxzK+~Kk=1~8uVs3cqAP`4z{>n$CSx&0L$&TnFTMEo9cuwndFT(& zVGORcT#sHTE&1O|5nsSbEh7;3Ghh9T$eFXXrhnA$3Ii^q58Q&9D0k=6C zZk(y3-p4Ya%*Ni(62kxD-~K%Gew073LlAk_58*1)tqT+Ou1 z;78}TXXm3CiqrRy34^IN;x;B&$FqypH}Tc_=&;+?sG?zfVf{gPL4ruH5=sXz_&k9R zHWXE`p8xGAyW<&qYnPEAj|?rgBeu{Sz3^sMNxVMQ>+YkB=gio%DoP6n;aoSstEAYA zD!`&dC!!2uXP;W!RT}d!u=A>#Wph9_{>}SK9r?`3%sVZ#6AN};#;G^8^S+(qHceTM zH*_Ub8o2iBUOs1b5v*U+T)Z=}Q#bpi zhlcvQQnC$W=Mg}-a?V{(?fEF^_vLoy^z{0V}9xPn-tKs*-JI^qhk)3bY z&gIRJrcZx%`gmaaFvu|nzcI%lu;u3M z9pPpSs)IMEV1o@VNipM}<~dLi8WhzNlkoE=6BiK|G)Ht;ID7e)34LdHgJ1&kymyS$ z4YjCit4I>;sP}g}o83y{(iWHAUXj+j-7j7S4Hqu_i=oAy1!1#%WfOUM6o(>r_F0ii zo#%NY^5UF_>&vvA#J7QMI>aPk>6lt;z5AzX2EZ2lxtmA@sAR*VzLEvhmAS`e#xCc$ z1T>^DJZf6aaIA0=;Q*A3PtU>!d$~@TxYhzN=<$Fu+fk07^Chx;5P;Ft%`vP=2bV1i zC$}}9MaD}t(|s!3YamMnLn+5BEasWE=&ovxmxv@DCEpk3z9e(L%QaFm9O(uK6CDf; zzb;*rXUj?fR0`p9&74IHm^e-3^|-umvhi#-3a!f&eKhd+A4jIa3c*N0ZYGKs2F-@O z`VR6-^mw^MX2aji(|EafImrO3^t+);3DD4myuJo?=EFE-5GADpl&g=Wg;y5;=awxR zJhJq9iJ%>FiY-C2DQWB%9kT`xh3s&3&NB1-2iqdt3zawn8My$N z>?w%YK_6$8{8zI_aC2nTj|-IO)A%t<;=wF8ic zG{3uagEQC6))9AjBk9<3d0z=1AnmDRFo4LYIb)ZO;9Emd-zu{@0sH!^)0<;Lk11ty 
zPBxZRV-$JG>rJ>EfC|Ws{yP-hvM&oM?3s%J){5E;(!7l?zUtXg`S6SW8tWnsqSh*# z;Ju$xOJ2)+=~uC$yLg)YS9P%tlLV?C8R(;r=|RW)L_ExH_;UnKC_r%VA&!Y{CoJIL zqb8Gs%yk=qX$xla)iSh<>(CpO<22pqbolC$FOA998=^%llT#tlQ$#W1TXzum_!@lN z!1^9QOE$R>#?ZRcudsmoj5QK>5EAA}Xgus3e#$Am;};5+T1!GRu1G0nmoqVZ%wbzH z){)^@pm@&=;0{Z0yTE#0rBk)>uj;GC)&@%@F_0SO`BnRY!YcU#r|S!a?7DiHTS-DS z!{NsnMkDn9Ekh%b47Vg-uCl%@bBzkGT0fFp_t8ld>pFjxcLd%)0BTcBJKZ7%cs_ zpvyA{X-@Nlt5I%jwk@^p`8+YS;nt6qx?eb2y?#_5Gh}~s&iJz27!+Zx#(5p51h<_p zvS7EV94cg~T)?iO-2Cu`k;AVbvc~bfh8`Mp03AvB+DIDT$l;0LQ8!hX`WLCAGEbKJ z(-^Fpf#Sk5oT`|=-ru;UpQHTJ=9{uNRRg{mMB?6|rQiUU9i=91Ep;KjNdx|OcPkch z*3(cv4Y408PkGj;eP>NtA36$b6n*RVchKTc3y!4V`?Mqtnp)1M$){}h~B9!+=n3si&%qm&tYfC4o8c?Yz7`gu!pPZ=1UchYW%2P+r@tPV zfg68_dwcFZjUiIf^DJ&Uzc@e}vP1TzF=$NdQ5i5Sf)clu>gZ!F+rAV&hbkH8;X1R= zKAzh)v&5!}{kV)cY$ObFKqJ9j`q(Owlwt|*IY*1+Crq~Ru4A=k!pQSsEz>G;ezG8qWzqDzxeY@WMp5+bi^akE$G z<H5Ux8Sqkm{Pk$zwawSW#|qJ#Dk8x4h%%@O9m`La+%hQmm-9aT!#|6b({cDKf4KQG*s2~xOW#bybrUfKeciQwL{W^ zQA-#oh_F$Mwh;n5dh;$2;qpS zhTB7FU6Z*=IH&|rNK+26(&jyJ$i&7_zPz_P>;eo}&J1 zBG^6;!ZWqy*s&?i@fA;eZ-6K?LK)5XWYb@kxkl_30hyMs)G8Hg)`;IFZ*=8Z%%d7H zI0knd(sDY;cWI5|F56;kAnj?ez8ZO8_Uz|sq0))P&*FM>{OHrJbhqP7bqREh1n3n+ zpNRkcX;7V1zHA-UQJ2A*p>ZuE3q(u;No;q*4p|8fw!%D#sFyt7Y~JRo%J!wA>x-=6 zfkvFa(ccZ-?Zp3vifwvz$}I~Xn5o znm9X>vQBfTr)&V!M$j4vamZ@|KVwY9V+Lve#V!4|Mq%9ksW!crbt_cKyMnBnpc}Vx@2OP&Q)B1aRkYCLQ z7j_PkaB0lTvtBZ@GY~8s`hvl@7$OxxE%RRXqeP_+W=ww3O2lHAy50lriJ|Ex#r4xh zG?qxXweAO>5IR}=b`iJ8tA0uE?=KB<48UpP+IA&)@`a+tRJkx{Tc^4JWwG|AOw`=o zVyEA5f2BA?i<01~1LS5HnojjqN^Adwrm)wgSssC6NSIbV6~mdj9Fa61c`v!N>x@bV zzvLV?%L7g;^i#<_Xi-+GpquUKOwT-@y3Wdh6%O0@Vs)$=hE$6y>GCJ&(pvSfJx#Xc zA*22TsKAMuU0afj;#ofN*zV7lwTN!z@eQEGtqA}88b_3}NKQl9kO;RHv*ICN1m&p3 z4NXKr^EvueaxSmp0R7t3!mQ?nnjdbz@-UMn|5Y3ee(Rng@KuQ&ghgp>@Z_QD9fZM; zTXl)Tb~~;46}3YN9TEIr5^xD@DMcA7j`~T}=k9-QzMWvlBC6@WLACTtRZes2`Dyb} z)6$XBER4_Og*YJfz)!l2c)@+hDYB8ZxZ29)U5pxd1_U>`S3xCpm%)qmqwUEB7am&j zO*aG$$$uCdR-m<^d2*M{n-QQ3p&GsSY2%64&jXc}GymfBtQK=JM2{YjL@HbPXENG 
zof7ou)z#J#fJxJj|B9WNQxr}|Wj!*s=Ow@Tl+tOep}8NM;SRrElAXGAzTVC^Vj?Uk zn)bdsBDDvrAv|+?ll| zj9mnaUetE&QuzkpM+hj%a~G(Kp|T*|dTa*Z`DgD|n`3&lY3oR^Mj#4UHWZ-YOe;BH zGz0@?UooAqFT>QFD9x*{dV!o1eLsM4?@UC+AYgNC`FeW2K(BxkfeAO7YQV)I&1eTY zR?UvaEq9hq#D6&ZC~1Ui(LrWkee_mEO|ZacP0IbGd*SgukSTu4Q-T7TS5}~{pp|G` z#In=0-nK-~o)9I)YqY|4JxV{3t}1?zR*zf6wnC9Hk(@F>PG9V*I{a=%_sjKaC9z4R zo8Xm#R8N9C3{|}A;d~c9k9XoBuqo{px1^_?8xzSzmFX_9kFAD!$R!M-^Ov`Zv8b&t zYQ-mlWn1Iq4Arv`DMN?Aa1Fn!Ha!Jtj>;*_Iw{qULzi|<)16hy)jgySCFWDx*Q;RT z0XC%Uzk=DfRT>H_5&pQ1W$VL75F^16k3BGYVw5arc;%!1)FRetAMF@o)J$A$=Ssmr2= z?&-hVQm|cnfPT5Riuln?Ap)n&$@=qkD0T9xgjIaoQMI*zP5lZv#sHs7S1gW}?JcM6StiILpc^(|0k+!@S7~ z-vy#i;IT0&%K~9XD4RFEAvj`*szKe7@!CJ;r-)2>U6Vv3vb!zJQ~zdEX?1oDK>4v)5Qre5^4gGZRagPhA2Hy2m>4%Wdhl}h+UiI;_~qBs3+ z$>%=nNxlhlU$)Xe7>FqPF`uL^>+%syu6gc6tXGAebvjn` zrScDRxVnOfTJb^=GcsD{sj_WNXM94Y2g!n4o&l1p5PUbs*Dw5AF!sF#qY?JCm_Y8{ zTer+kjA!SHL5nj$Z<7wd=KJ+8@&MG6WXU ze6hQphs#LSSI-kOPkFyjVdDDcTWZxnsp?yHNZ= z^T<>?nchbFOToICy_yV)Qd;xwf6aN7<_G(%Fk6cnHUb@Ao=vRjh&7? z8QuT!lP?%McCvJpM25JEjk%r}2rZvIV7riBXJ;L_9?5LV!xQlF@hl_7zYm7h`tlz= zLT>#Zn>PFZl-Z_U+idTZM%n;bERf9JuYKnER$rEOvqb|1CCpp?H;>xWOjXn^f(ZM~8$C24QdC?8(|jd|YF)^gsO48ZA2Z zrr{R(E3xR`^LKkc>M9C-kH{2XsC9MzM$tp9+WcgTrmqaR#xFRc^(M(;WrUxQVM!I^ z{^40d`@mW1D$zYNg14NImVfe;xyso=Z}9TekAl`>BKu#qDQ`UoByuuJ`%apR$K#oe ztOx5&O(POZVr~mA-%GsHd`858>7X7zKe5ew>z}FYr*=@>0X24*ygafo4j%Y)m{kdU zjVz0@*NmCpV$^x}2oxmGNWsZARx%en&~F|i^DeKDkx%E{lg;Ok9i6CYyjFs)-VagP zGar|vr*%^rdd4idSnb z=E3G>FMblxd0qMAJkY%+^g`MDKD`+EtZXZ=Y*WdywmGD-)Q+wK0@GdUM;=>Pr*=Kp3>d{RCZU8e>epjHuGH#2WO^b$CZ5-K5Lh@`dYBl(QP=5uPK3 zgm2AaJS2jI@kQXp}e|KscKGSbT zzQ=r+(-gLHXUs{N-RZ~?{wcjgx#(eZ(aYxRo|Yz#Xb?^ zeqt4Gd@xtL`^hMh^-{%e>EwLJY(W5wfNO~ednK!rak&!tpto%S8S;1Ko4tEq3)6qD zet0?}zMTQN^gIFJ#0DfFK|bw@F@cge5fM&jSh(M=HZNwX@?co6P;!8W%YM-YIO^EK z22(CYJUrm-xC_7ri^b>>;n%f3y~p6$XqHEJ#ZL&>viRh=AfmO_Prl<8oI`0mi!hOd z>_9fS0PLVGh}36ss(ttBiQLk)Y+TJTGL;dAybDRgA6H4g>cv!MO&!CF)(FXrBhe8+ z=#2f$hc5@LIWtWQ9^8JeZWxAYjr@kNXb7IyBTimY>)o*K(BDa>sIDeOkfwV!d*dxStI=O{L45rwVRbR8xqRu^zveq 
zgqCkdr24tooG;%EyOuVeEW*lfO2S3vdaHq=2$l_#BORoZP><*qRM_wX6xGl6{jT?2 z(%cd;-iK7Pr_szF{Tz8MpYA^)>a(HusF%t>TmWR&PaoPR5;J3>;_L;pbvD}Utrw7^QH>%w?Zmj5eM)oV&Tm8{4@q_-C>MGj z&L(F#7C7KIj|0;v?44OQupF|n2VDbom(aGpI2pUiv{1ued015wxhq+uN1`C+kVJkcCVc5s!ht*Q!jS z);vUVD`rk2=H}t!5g*}4t)BHEltJuh%a(hV^&u)Hmu2@hyb~Q(Kbs7A&B2ZWG}ZIl zHv4zzV0Q658_~j%=4y#c_5m1C%ue;fVkRmOq^#eJ5ZKEO3o!clbgWMAtgED)_E*h) zEo{`?@gXdm=0Uu{CAS><&n3Q>2kt_-MkOoF|> zNvHJOj-gsK%rasu$H|O)kccT!8%`9DQx_7ui+y`?od2}cck{9DVwFXlgp|3LKsx{8 z?2NSxey&Q@sxZwhnzPu<2Z_^?=3V=Z-)JQq0yY>IQPRP-+#g7G#-S$mkDT1+n{< z_3C(M0yCE!f$Xc@j}n(0cP=8`P<;E$#nV3az4|>$0Ux+6%w0vdSTLufMFucsmcC+S z+c+tuXJ^4z2b_%HF_hhGMLT%U+6?&vh0-s;?e44;NUCkD^AREBD`eWM3~}>vkU-Y_ zA&t+d_xTHA+{FEuzdJl8GMS$!zHC0*EJRaSH{3#2MS5M)F?9{68-&aZj=A@9c$kI~ z()0qwc7Ox|sgTGw9cSMXOB?F^WmJ_?)%3}jE4vcDpd`%bjH~oD`O-T~0ir%rf|*Ab zR;-i{&YR5je9wK9;CD63bLWFD-}=NoV55YIr4x^pMLN^}aP4Q-cNjoKYet}38d z05S65ZG3xfM(yi(+oC!!x`CslY+^Ivi_B4n!M4X!IN5cVNN?v*yxhzd(dbkAxLSR4 zXUQW!1$!(Hg;1Oa0xU+?$b3J&UTA-|RVE@P)=J3dUivs!!PcR_Ry`XTJs%wW!;|FB z@k4nXKP;R&0vmyV$X80e;{Z)za4nr}w$ZjjjLhibS1=8wtU|Ci{guHS6T`HuYYR&1 zRQjx7MmsSY-~}x{$YU1afZSM;?B%uPuh8w=6XAl)4}>>&n=OCO=5SU}D6Wp!YM`Sg z?HN79CSv#5;vzRH(RDjo|0YS#?<5+pMJLz!Vuk5;#7OJ-bDjr@g^Mio1Ug!U9eU!m z8`e0fQI_~gdMBb}Zm*0+S#xL4iKDt$anc&ZB5%RgQZ;4Rp z{$$=$3;Gv17zscc;q%Uo+YiTz+tje8k0X-n;NyK&XvtMCB}>Cd!1;H2pkXN3T}`t2|B=kGoVRDy&|9yk2_z`Az>Y?Z>rIr5<2Kpj6D6J0?k{ zMDXH5#t2bpzzPmUTxRpmF>sSxQh+=h_mX2eqo`6pe}XhF4pj>MFze_t6Q_*FB3io@ z_mib;9SAoxGAv?xf|NFQaIjSA9bg#<{#PD#6m5mXlyni~0c1BuY&YBqm(CXU&W`E= z-X5~!3(#pAW>o-GxtZPlOPbpIj1A=#&4RXI{h#O?rnepq1UqZxzo+CTz|s4u#R?9c z7kDZC@*dKD_Bk$rxlU=?M_i>^8UeAn0;RC*n?j6yhfzahqtlj&H^!8yL{lCOe;c;b zsQrQ}CemI{^BPc3dv#>~t*<0QUDXGkrTeUodWzN2j$TY^_PF2;7rH~4_zUhrdirJa z3Oor-5lw?b_4>0IG}9x9V}>a~$^;cD(D0pgS}&+y%8`mva{J9Yy>O1G`-nmkE261oAQLJYMDF0DMNF0yB3s#zBB8d+I&% zeViu|?fmW*U8_G*Z0Pn(VPA)SS6HYxPpgqGV#yZ$6z zIQ_PIF`G45q(SXIXu)m$S--645ok>QX2B>g4SP@m-@b-8Os3c(+pwQt}`UdDNlB@1(u zff&oK)cXOs*k8xY6(8!`4bP2xoPuP3+Rp&VHd?Hwrt2D 
z&;I4Hl1JcNWnL;)9O4uSoQd5RQs2#=wOL=Awb}$C0WfPWOf0@IBl3pjB<%!t!p?E_ z&XVM_o>7I%`p=hN<2$3g=B#rcOJF>JBf(ZJ9kogiF~RlmudkO$J`D+fQi6LrZL0Xz zMdbT^_~}dheoEmB&;+`2wek6xwXhBAOS)G`&Lrd2;c+;BsZl{EH%g!C$J^D4FNyqi z;G%i=_cs#~4>g6L*ZP;m-*2?5W?I*bdQ?aY;R#&pmgU(9HxEf(w8e=rEI)b;hBF6F z<*q#3^USEdh^A9|rP26Vz$6L$D6@OUZ7e>#JQ2UZ+k)gtAEO%4j@EH&o00(<P1zxd7JNf7?_=NR=r4Zua2!3l|*Qm z-WVTXJ(h?mI29-&hQjgDS8R!rp{6VPPf^+ zOOm%~Kn4Eur4iv#ajVr4Qw4(d{+P09pNjmlva$*`-J7PFo?kL0&pH%9r_#-ogeVx9 zE>H}x-84x$YE^U|{N~WwzGOMYellM>-8c+dI-ITtPGa>(^9{DCH}Ni*-Z(9u>tP^V zFK(W6artg`W|=z;3wGKu{YvrGE>d%&X?3&8BRL{Mq*Ya8L#h!laHm8Ku^>oSB4ZeNGYBN%fEiyV_YgF~859lhNv8BPyH;Ye4kM zOG0L)zo5INBzTsa^e~_o_SkMez(2D##y7m*MR#A$l_gfYaXw2Qd^Cbd)1re_Ewen1 z0^q8w?WQAMnao{B{hhvs7+qPg?})@@Zd8Eh>x($IC9z(Cu(-Ft@l<#j{8(fvNiJUL zFp?RegxSM13i+NUiR&H0cUkQ2T9%tsR#(R+>A~d|(az>x*~ik0o|J{OaN<{q$MkGc zZ#f0Us%8+}6KHLg=d0Z%<+~ivj9ptg9B&!hosWSFUBOd%(-+q_&}htA`D1805cYEU z@gr743GKGT&r!3|ZkL@BB;}-mAGkxIlz1?G4052gZPxvcNr${_r-?LM)-#rS7(6xh z4#(!>jog+^{0O@RB;2QSEwX5ou}rWUweZ8XJ=bHPFJAoTw zzeui^iy`65!s(ihTWk@cc6C`R^~~?(fDrl=pbj3>PPaHD+E@zXY-stq_gtQNIlNnN z%cK&qh+SHXKMV8XX#EZ(hdgTj%BR86Zs#fl9+L;`aFsW-6o-C=KE08%auW)8ZtCxs zIs^l37$4U=H#~_I$e@P)GCWKQEpA@4OC|a zN24Tn2&nNqwk^^;6J%H>4@(&E-!^n@8;33sb}m2?9+m;d5RlfaV{$8F{WP8L{DRx) zOP^WFiaj#K39`sW%XR4Kr4}V{$QE#Mc0MGLND7+1reg4DM)(@T3!DEMN)G(`&>25{ z$OjFLs2)n;pSYqTaF>fe=G`l4A0q|z%X+UJBiq@Wmz_WP`vaCKWd8Nt_s-e>eEEMz z0+mMh_c4YSI4(wHpAGUd|ytaJFF=x29=Yk39EUpsC7#rt9H)gss0M-Oj{L zzUBJ<3V$m^L z+4(-tIRSZxo6UV@((;MO_Wzb`cjBgW`(y9(y9_3;M51R$F4oPyHjkl~>RaIw>$gwf zm;a=M7Q+9ega^n~pL6fyYTGMON*N7}?G-DW{2sOtppR+PZu0WrQOmAgI^oM?_1RX^ zTD&E(nrzfO)@~Ig_A{gTaBT5HqoGR^;+PM1+8kR1D*;u2`T^EL)RM64;_SAXxzNv> z!%YYxiFt)Ip>bdRgPrQ#1YA1j6Yov=OR$)--BDR$RynTp>5K9olWY%Z!aDcB(q}v_ zE0RkU*M+MOkprp&1u9+!F2+q#L`O@b{`nAe*|Zs>LS}?}bM{dgCGtlA`z9VMS(c8g z7@}se&d+{C-?s$}aO=aR>R3|!Iqz&0kqOx~%@L_X@xUKG3^hK_7SDL+CvxXuS0cOJ zA+rnzB7LuYKjDm*wpnM1ih7PEFV96MCIWOpX%i(aiS@04T}dx9tzNeDGX|EfRI=8$CbGB%4h;61A-S;LkC;!4~8!JaEjZn-8a!hPpCe1pnQwn 
z|A|)zVCzrIOVudT)YRDf{m%c11bvG%0>kLo_FC!zbywnC_ROl=OnZ`up%(LNdn$d8 zeDWNk_$3(DZUkRcO&Op#DletB$^;K%#N(+?5q3>iWqCJP3=yIDh~TkCrVLLF^{*f< z_l1h*a+Kuj_|8-18Y?cJ6#oZnZy6S4x3FP9ih@XE&?O)x-CY9G4I;vTw8T&XLku9@ zB_Tc1N_QjO5<|y;bjQ&B-Q)A@{qFtskM|G9!5=tS+;gwB&UIeb3OcYt%Zxt%x%CL( zT`1_d&5u*t`d9DqRAk~_Bm7P_?eoX2S+!VRFPc@Uuk6=z>vP%jE=Q}9ss1K9^s5>1 zrk*)~;XMNzd7Y22fN8YgYlu^Wy-4K8r0m8B!e>u6;hH$BYIVL_`a+Q2qUXCu_8fCA zX>S<7UIkA33|Ax08K%!DfPnJco*|(sRK41oYXfEL7tLl#YmsV z|1fIwZ#q~(AWKUe%D8~?PekjTS8BX2;m|d_3SI?4xk7ji?FF6NL<3^A|0D!--8K&_ zS0DLtWBbwcqL5$&P$Mz9lmGI&{n(;tr164OW3}u_0E6fv>S4w9Es5O| zm3H1f5tZP%16(v5EVB;>f_lamEefkPo7|9iIA9Q(KR{-upu1_S%ZQ5Oy4oK)vI(6|z;D74s`rhHg zjdeP9+bO(vVDa`cW#OFrY}NNK>}m>ySl27A_rM+6MNHb_$O)p}BA%5n_i)&?O&dD1 z3{zb}b52~%9MUd>iJ>oi^$~jc3aLwYRh}NePX2%J!tYg13IwE}e6dL>)y^JzFe-Ch z%NY`(*;jTXjQBm&k&$h9)f(tcA^!MtjQ`S(+nD<8QI@)7H!M0%bJaIb)|nig0t1d% z`u&!y_no|c3q{bqp%u~7gGlDafNbLuryXqKMzY|%d*m@S z*}S%z^2(x^;kP@zDE=k~8V!OtNW!FVKx*p}+2KP=CzfFFv(JA7d;*?-b3_kkHmS{4>v+IAwUn<8*)+wvQO8K_%*jlctSjG0A?Y zzf_X0Id`jnv)sDKp!bW$of;iasP|_;*OyQI8_gskdYNp**q@hF*=TkP9&pu|mC9Ra z$10rI)-uI64?DvAOg5=Yp=4hx~RIbM614a1jb70@?x7#VtkEY(onh!iX zp?}5gLWEW*@3eNlFX2u-W{6bO)BiW8z(rYlVXFJtox=XV7fsMupd_v-gjN+A5 z!&MmQ*g-?5SN5-=Qf+2Fx~x&8X=_5pin2}}$#SmJAx;!UuAI)VG;rhZqb^^==`L79 z^SbD*&6omShlHj7h;+IDl7~PL58e5hSH@e`dsN;o>qq zX-+|WDmL65(%a?D`jRF}h*{+%g&9#2E4B{c!+?*#Q=gr1(W7|x-FR-Lx5-2nSKQ9# zBlGQ+JAe0@LhHi}7g(ElU4z@-x)rJ|LJS1n>D^O||iRbyi zDEQfe-x-|?e6#0zGq{h{E4Oc2EM|Byv$;L_Z$6Lw!H675MX4b{4Am}E z$CfE#tbxy0!Uu-c`C6n)e&`Xtk+wZBjTKn_WYYd!yHnL`N@(2??3b^#5SW5EmO<;R z1YYu3s%`~Xued*;NqCUn7t!mR(Q(8x&$jnY$3Nm@OquCNubi+mwGBdkGrSCdedfa1 zBwG6Ng9@^($Or)PMrL9g%^m!16+z%dkHZS78j|%K5JuOSkq1^?(FD4+JWG#*N~a=j zfFt+KXQULGqC0wV^i*1;hKg?^$v0#$Wd->Jqo6b-{))~VOrvq)v#e0V9+%y3r{JEygP%+m1cukhk?GY)T| z>`e^%@v1C8L1dNSV|T0_P(d|0kuoH*{0Z{TLH-|V_MKf#MYWR#uF!&W~83Q^RyiVonv5xbCtiZB^(ew_I-Plw@od(@Q1 ztQr~)vkXFqH}gD>Uy+1LUp~yzxas-MK2BZtEPU?VSn+-VnP&}uKXLp@?qvcV%?az! 
zDM>Bf;o;9;rv|2&yk~t12;`$GldlA?5pIW%6qyENdL4P&rvFe{(<&$CO)CKWTw+7E zHsAO*T5Ni6#}{4@16?mP&T~oVR%Lu#cIzD$WJSsji}&yJOKyySVnBUk-TKD^VkcVS z0mlI;cE|`fQTv}nEgPu)gCcuF;aOF_Nms-XMGxSi)~y#eO`tr`mgavv(@(Rn-Y)W| zB_AQqF7Lc(2s@s%T<739t#lGKh>D#{QYE0tpyqofI;zOV_VBEzQ%&>Do&|u}K(VUx3dQNhr^Bim`Yj%b-aJ{haNx4!1fZ?5UlEFOWd#@KL z^yvap@w(TMNe;*-+*sBJ-17olW3+9ov^?onjh$12(I!`Y3qw&2}}K*2{Knqpa3DRTmyxxKn_`8h)~gO=_;T*ocLkGmQqlIgWI7JG7^)@WbVvu*bv zq2|1j?e>@JT&Va>!MU}Y@KXiEt|IRWAZ#2XfKTc$^xoxol=toW*kz11h?%A?r$3^y z0}p*_f`~H=Y;YLD>ffV$Qp5u|8vCat9*%aVM1%{Xcjr-tI%C2s0+6$LKbr~xy;b$z zVzg%6auZDHI#{=emG`Ch-Q?g!f6MVm{sQk#t??Z`R~voN#CbI0guP|wVUMg1BZKOKT1V|jz1@Xl zi?O3zwsS|sQOKRar@Nb*HW+?~dav=%za9;}FqX#~7DT2kSIJSG9BLYKtn>x=4ttj7 zJ?L02Lo3N+n)`mW=VZd5j7-6b1x>^`;h`^wwOkocbuAXq8ACn}#L+bD852T_{JgWl9pKdHWN1H@7K(Sy5Z(DObwoy!MJy3R>n`zyAsd5ie&ZEiZ0h+UV0bw`EHqg@!kG|nb|?D>%%v?Zz)E0+L; zfo^KAkxS#ZrsIhUZ|rim)l(h4)Pv_FPW~L^TU-rjZr#vM;}iKY&EhANTO73CA@%9! zLjY8L-6g8G?+sUFoJ7YnThJeoJcCikQ5* z0qX<;S?e_u#uIid*s?5(K5hfm?hN1*?*aPvixwN4Dc2CF31p0qwPB6an>7Mh!l(Qr z#IjF>0>*TWV=7JP3a-YBcSj%Tw!iI%Dl2JT>C81pz1>@5|Vf{D-4IE~Suf zMBXU+EQJ5 zwjKz3{)txqov+@I`p<7)Gyiw2`hO=qtME^etJ(CrW`<*hia=`-6e&M>v6j;LLn`=P z=KqNglE#Sw7;#YK)~yD}9fav_Vf{;l{{J)>vjRQ(K0gEq=L$yr+Ofx;LmtA-52;=IJ)a^Z2%`oZaGj6VRopN0s}JnUy>tQy8G*0$-nLE0#J1S`6k?8wj-pI zrOKq#Ju~Mc=0*}qy*+jLD8LnYeOhDC9Zu4TU-i%!cf4ufXD}6mq1Q?DnyY|BS2xH- z51fAC`ID$w?OGd~cZ0Nxw)~9~Ia0Vj)Gm48k*a}wmWHPU@$uY}GzQ1VX(PdW@15J+Uuj%FB&2KdXaJlnd32SS&q2<*XFyeY<0LMrmZL9n|h1Dmta9N%oM zm-x|FXc~TC(Kwc{C6UqckCIDn+s~b5vYah?DeCP0UliX!m(@T(ba!(GV9M5s>;R@r zAGDsttwn`nS#M|LfU%QcJt@%;)I56*-E*=VwQGQgGHFMlhVUa(jlxO!^W6e`8dS*9 z)iv821s#il@!L4~l$WtrEVxNmQX&Eeu?j~{`X4kD!>H(NQ1wB%@SU1#t<`4M<<97z|A z0UzQo0qv7bYs``H)*qy4r9E!hz1(xX;ue8y z<}{tO;VuJo`q*OT0qGTB9)9EZLP5r6T+ZD}$}KgZ`*{WMQ-Y}WhS=w!4dUyP$lIJP zQ_I>Im77u=#bfQQt!nxMzwX%a53C>n1JmKxll`Vw)Dui`kvm-*_{+3t!q^1wnOT8ogmUYqbX z!^)XYhH4%DBaEoQ_H?Fy*RhXD40~#f@0R~I%`dMKJxwg|ufW<)41wfY${>$MVDmzC zeCPqh2E`21ulHQKNIKQfep5biMNE4e-WJU=o2i0f$&|?={|iW+ 
z)SSA5Y#D!8aEtQ93^Eqog&7iMRZ+?{QZSW{1wWk~%7{bx0JlnB`ObI=4WGwhzjKov z*%Cq4()kV7fl8w@{e?z0YBii|%YLbJu-{1PN{SK&za427%N^*r-A9g(&D3Oynp}b*ztO zARPKXtn!t-nTYMLD4uojBS4>MykwmT|HJVnU%qZ6_pp54P3~{?M?3qn%-GdV1ez1B zieT6!9uSFnBgCYnK8W^Mn94O(TyrF17f*N~pP2r`fEEg3>wHp`MvX*~>6aGjV+9*6@~B!vw_6AKVU?(;R2uP}4|U z7+PuVCbGO6oV}q4dIepm?$9E{`|0$sPfyfs&LBjI`^vmZ!WVdiU#j zi)h(;DZ}+#Mh%UmWo;v%zy8QIT$@i{lcen}N)MT(Pc3@C>_?N8`p-Zk{AbeI|GU5xVQd*QtAuEGRabCrl9*f#RuFE({5+ckva(}kK|C0G4&O>46;a%C}H#S+p8vRqirt-^j{{2ToBHj)%`%Qcn zD*JJv=UlvgcNQ4uERlk_x|kb3(YY7tDWCE~z9i!k-G4UY;-gxeVQL^!@gr#|%rC?P zr|XG|%wrma8bG&U)Jo+8v4(K}SZkzq{s?0=K*pkQS37 zZMv-`b#D3QTWz;i9>Sv{DE!osx6vNrT6lOs5 zBFeoc#dI=6)`(-*P1Kl7x`w`5mNWLr)6DmG3FCG{e!#kcM?tXfzgZe;GRsyT?<(as zVQx0gVbx6$X(_!&L+5J0?!9~ndjPm#q-YV!*fbYf!B;EyWLa=IndLn{QdAW9RB94r zn#~D#7o#V%_!Ry2=h?4V<-hIX?Tb5B`&wbA)7A9>AGDMpThy;-il*vSKEjt)3AlS} z#}VwjoqC@U+_s`tBD2=&a6@^#-qr<*KP-(C7@R%^D91CB{5a2=rWx#&?l{5;zdYsX zoav>uH${&s6F_mBIUmxBiOE6lL=VeEZjzIvXJ6~yl1MyjlNEUD7#d5gjF=ins)gK z#S&oYCD^+KR5!Fs{vl>HzyA=k7M=}&m~Byc zt26%*_XY1tM5 z?mT{ixxd{Rce`BSn!ITD<;Z>*AUl!mb?q{H!K3_@e<<;ZLku4Cu62~y)=Vka-`t06DuCEq&d@ z5)hWi?Y~w-D(mjWy+Kz<4#7fmG@9p{L@6`YvmaSp-)QZ-ndOYr0vzl-^4PUyd%d*i zDb>j-Dk7aSo#oL()bs*sdOGk;=4Q#nhZmKTZ`!+9_ddPLSV^L~>A|}tepC{+g?(0T zvdu{02cfq#bjR0V!;b}x5{gfGBAnBw=_(&O1V>#-U5yo>Z9Uwjt04a)>M<{^dhAvi z8tSzLa6xBnz3Q!I`>LdYO7K@JGz*27iHRcE;WfZnH)c({K4J-8dM>=+z;b(>w^ZWg zANa$6C4r3}_!XmD^wlzmT`NXf8M~9n-Z0O16nqU<<2Ppw-QL$GRFUO4W!q8!*5)VK z$$R}ynX1$^#QTrSNW#NlGgWdquRAC#JbjT0=6+6&C8Ukt4iEJnXH#wE*~fkW>BtwDkf#Vn9b!r7+&JDwhc8 z-L3b{WUN)eI#i82GIVjUqQHoe42^Muerdkbp*`17egI8@A*EUgr=1wgNe0pLto}{1 zO%=zn9JG+sCifPI)oEB7%B4RpgYY{nInb%JQJ*(97Jc!9)UIcERf{i>X-lQN=oW>< z%Fr(|SH((@mpD7R7~7V?AS zr7mmK8s=w$MJ|gfT}*z~mgVIU*K@?d_hg_K;x^0`c~^sc%!Z0t=~u0}k&%(j$UW6)2B{s&n}0m$k#)z0|OWQW{ZWc}6(kKn(1;(}v*fJiY)!u#{(#k2pu8TEgs zv@brPy1E(*NUoL-<%zADr`dnjzgdYZJ{sf?(+$9{lK5Y+>HcrB>SW|!=10WOeT8q* zJ+2*4nR7q(#?tfc4VizX-nWQWxS*HLlY9;z4i$EZ3n_X2@fufc0ji$D<=u=b`DOiUh5?kHRs=cKbDiJJyRvqBvEIDzb$f(bB+eeuQaJ!-9Q`U3C{f$}!*n|6 
z6Kn6TCTgyEMe?{K1-n=nYz!MFnt*yT)KiE8EUu`;?v2=$Ud^DP$3<;i3t30SKPgsS zO81`mr#Y-D$mSB!(H z_L)t>Rkm@YFGRg>PJlY6+hOk^sR?(=g+B1^{PJdaXtg(ok(P!wz*e1kaiB$v-W|F=}NuAX0R ziR_xEgRK5dsO}5Y+lQkBWqWeZbfB6?4EJw&4wbvEK&a5;ggaq(pRnk-)s2P_irFrb z5>h4M5)uh<*rnJu-VXbaIoms=ElcFxOAGjT3=w$5ni%c=`j0zzIquL0h90-f<-L>I zf>?^l0!>S)0nv`Q1ypJ*UL=^5yUAXj&0ChN&b57bx`Q4&?s^LUsqqdL`=B~|h)6Zr1 z+Hh`i3*#;b=zelsHGW4+4UB3~D{0cwhwv%hjlG@Y^-V0I%L--nNV04HJetLjr}6&# zs0pB{>mfI~zybjYj95r35(UD}E^RYzEA;`-^Q<`05rNMJrOVJlMPIMIGR`~Mue~1m zt-GvxojsF&n);51+U*ng0+=lYaK*$get>v);QO)rre@l$`lfNB$56QW+opf!P&G5~80gUM%$8PILjqn{CzD5+|(aK++ zFVRrtYvQ_L@f8$@LSN~rvO?6naOo>krMED~G(es1dd@EeLJiRwH~Egra~xs@gH>vQ1gD`RM1BbM@ARt=kMIo3GTY)!GQv~8*q zpB)n+{XG9mAnS~qa#G-MUn}xllbN{zk~y2Fe6;>*n0T0xgM*3xA<;g^aH+kj9{}0H zp1i6{5WjFHXD=5VIrjwOx|jP(wCi^=kfi3`I2+PMZ9EaRMxkh)izGw4>B`;=8XhtD z^!egMZtMr*_*ja`O^J<1f6kSDTx>fb{DB){MM>d^y&XhVDvn1rp2#>VGHBVsb_)h`A%~H|*CYw3b}yyN zrBjJVz@iC=d+Qzh4EbA7AKSEbaWMiO-B%%WTj9x4lkz?sJ?e%?poVVYtLRVH$a9JC zN(b5@h9G~?z_NFnRM00{pJ&RqXyhy3+}E% z_5200lJQ@1tENXKICCsbV%{~``vJ8rWtA!CvkHrm+^7MG5X)R+dp0$Nw;N=ht}vi>}B?Q zsV#yjWEn-2(DzB7g&Q=ggHP5MIQ-}P;~K^3(!7^kZ!L+w+EIW$b|yn6hW7!s*s0bo zS9SS5@?&3})=hp+^A~AHtgQDQ$#j&iMSO3ITO+D&l9vAx4&5i8u=wjFy$BcazJ9D9d)gSQj4s}p6o#y#r&^)E zO+(Y~e^fPTV^tK-Qb8f#pMXQwIIpwKPlCe>q_O=%*as>500EC#Z)V;7#aKEyHq6DP z>eLMdQ2_-nZE%VNYXZaW{cXg zh(?{K9XP*W6 zU(#*q=1FiY4tt*~t~XWf>EUH%q-Un2Lg+8xxRClc2$E|$jLPhk&1ip>;O+!eIV{h(E3~#4M1L!06WcXk^9A zWt@u%cihD^<<GHr8D|9+b>AG!aG^>OEQm&9|%C&<^yg0jh3FZNG! 
zX3iC+r6Jp!Z)n*)Tx%ayG5nwhqP^j$kBFsF!;2bMLulO)%S9yM;aF8ufLS00NCZB`PH&p@Ue%}Fo29=$8K)N%`P!$u@ z&L9g=IC&)PnFrKbPI69Q;t4#AO|yKYw|${zqN>G&mSD(1_E=DJeY*$|1V{+~C||Mq zW_?w86VT}a9NZN7uvgZ+(pfPp3H+k^qq56J91>cbwfbJ*wV!qSn%TgA_Ll)u_$#jX{uz$%ta30z4a?4e*06Y zpcZW)lO{}Do_nYFF*-K_DqPF3$n!)R&)sKW_#R-=fxrKcNmpNEJpLWnYucnaAz48V z?k9qz5_y|WO;0qUd>9(X9wv~K-OCikxo3{8*Mv=9nT|)4jg|Kt{T{eC)4**T*u9?_ z6Yo_@@?NI$uWkWSCt>J{PP)P_la+BU@n#3#VkZgTYC49}82p{uzGHuin&_hTuu;kf zT~fEsv)s1gAMuUYm*VE%re0x2o-5JPf-iKeOAdd6>Ec|kVtJd@JS_81D%JDH{(hP3 zNS!d_d5CUXqEfkny0?V)NPd z>jplXjWlCZO#<7&iromFq5)hadVKQa%h9@{r}fG7Sk-ppqb6i746Qvmv2l2$Ad#P| zrMR&Q)|%zZ|KxA!zX17Lp7o6t$dRXlWy8|O3?i?%STin>Pl=95uQ6G=#+39LH6sq!i zj*{b4kT(dFTh~JLqA_-AoesREXE&~s9dH-5MX&hs z?FV&5+~$oJ-c-N{MttK=NTh@G>7Kq50$#UzOZ^}4aH`VzUig){28F44aH-_nvTBte z%ezM$oViT__v?giP+eBcwt6HsaXZ-wJZGWmk^uB2hKVHw8!Tl^VjN|tWAY!feIy+ zrcQEi#8_!nIFG51OtWk&=9T#W5$0Oi)6n3vab3ktw*t3(F#>4E7mY`uEj-U*Cq3Ao zu;2LEuPw@pL0+R|IZ?NnF9x^#+39!>tyEjzfEq=!xNE#Tw_2n7zYoQ)#KbF{(Ee!F z0us5nhg$G-%7crcGtvf;)n{wB!AA0uXBw6bt_V7xaB6Eyr6<=Cb z`v>WEU}Mzi@KcoY{j#B{BCRT`q4%`|Kd^n<-v|}X%@4MzRg|-E_%Ia>D%QK~3&{Gt z*30siWajQ(1eNOjxYgMB5sd{#X^!d7tCM^_+XH;i2`+#H~>B>i15r8)0GCW+B-IylSd{oc?hFVb8YJBI^x(r7@~F9%g2s} zJ^hV7J?o*mayaOE?97_3qL{G>Y8+fd^yI!n#)9+3r0F3NmV!ZJQH7Tpn7u{rb&?6E zPot@RNg@@=Y5=lUz&u1N4>Yf!Nv$Nv;Du{c2O7xfyo#r0Z5$7F*2p#gVcLmQXU5$Y zE~GkU*P6DiF}xV{UZn0}2AOA;Qjt1Lp;v80vRdF)O)?Vg^(#T(W}}#TSCMp(O>M-= zJP^3DFlL^Xt{TU;N{Xh~B@fHF?u6aIu( z4u6{9tNv2_#qnG(s~&)@v$h_KzD+4s;!Tz@iQ?P4BN!7tVvt_?TYhS`YD{ZD#8RBN zz5}w?x>V39QA+JsOXfpZmcTahj#B)f@0&>}GZ$f(Y{bhHDwTN)Dt;H_O556xma8;< za_vR|k=>WCRI3W`MZ7#NiW}w(K<84N{!VG&=1oJ7WAzrM(+Qw$*nU?t&wcp=3QID4 z^Nen;v^VLBa<$It+6NRfn5(85i!}2|`Hy1cWVK%DPkyA1%Q)A&tyyEx&3E?pb89bU zqUhG|`M>$~ln^F(>{!P>lLYKHgW$Q?e90&P7_ca!m%8o7um^2wmutdcYZ3T7*FY;* zqg$0Y?D9PhWK$9_9mZ%)V+)s`M@Fy~h&!P8FA5RAKT6_65WKDx&_iuwr7|fi>KM?- zoiqO()V(bZ6LL8&q2-c&vw$i(y&5*h&OHij1PVll)Z{7g-~FQb^O2Ww{9$OPH>(+L zZ^$$Uhc92*&)i$e4~(I!0g5|==gK#HW6Z5NdZxu=3A{$QV~Whl+B8m4$?*lhh0#&9 
z5FQT(T(9nxc~{rCZ!W3!I{_h{H8wLVi9ZU1(Zi;6^;Q+%jgXl$+EuKlBc7Xp22a?V zlX?wPoVrM$Mxoa<2oQr=!oMK}R$0);f;_bnWd7^-iPAIch1j3!{HOlBIjSm-s=MJc ziIomn@vK4fQP)cx4g251+$mn~G`wyp6Zx2uVBi$dk{}CAT|f!wYl?6GcDCY1UhtcH zI2Y-!C&KW|Wo4THo$G&^5V6?>lpxC9In%OL<^cT@{aZbIE&ldppUvl(MIO|39)Vbw zBl=>eoHaHh-vakE{VagHLjJ*dQo$nu=KuMv*8h%?7W!Yz@c#|(b=sG$3$E9G?WWq- znRB;33?-J&Z^n%NkNXt)vpQXJ_4>-PsIn)YFoQK;Ywf5S~?VwYTje zcI(D(*7|wrSqu>d$PI=fdvgHdlRQ*_M7Li%{c>nXb^osG^Rd?*#OeHh`aLzRlGDcQ zANUtDo>P%F;`TV|1HDhn)rOGSA0w9%%#;W~;Q^4hKL#;CJ_HH)~Qb^ue z6*ODsb?6vT=hljEVlxm)u-HwAjj4$5Tl(PZ>MBjARA?!_)C+3gXZ~+r_)}7sVbV}u z#}Pjbe*1v#!2^F8AK#?EjH@`hjKXbgOqL&lA6&1ooRK8XigJ%Td6>KJU5gcHbBl^< z&wo_58YXecz<-UK2o2fm=*7|)?qOlCcC1@nC*p>#*2rq z9U1EylH4&T<8bktJQaP@wojZ#T=0^+IB!WYOULsk??y^z5bomP8E5y$ncv;k4i3Md z)(LDDFrDTDk2hlcOAz_?y$D)mO4!+CNRra`RnT*(m3w;d)iWmk`7suQO+_AQy8)Hr z*TEvSRN^W#w()hW!2Q?qKUAe@(i>-MWH~NN+rRbdz9+z0ur9IewJwptN{0%)69pR{ z-vr_uK8x^c%+JkwL4kZVbV*Mq*6>pGDl=|_<$~y=xCKq6mtsRPzdK&lWdziFqUEKj zBgR?v5zrn8y}>v`kY%v4Ig<0@KmH}Qo<@X2Dp)=@Tzt6_XE~rQC)17aa-DzHIun4E zl{X~F;Et?>zfawj%|g;YU^W`v6L+&e1~Q5DenTdzB4yuwSxM2J?T2t%2;sTW)0znYUT4`6yb;~cHk<39clbMqP4rFzWZXUp zYQQ(BsvR~i{>=|x{T8i@8oiL&{b)d+eJ;AeBXunJMGYd6;9LL-+AWjgaXYr03=HY& zi{O&-98ggqBgTxun6wv{c9|WVr*gn07vh;I^@MoJSP;(q@*fU9Eg}dP-psZ%a&@gV zZ~eTns>l|CV4Uawox(qd4;mrfh>jt;QciiEK>%CPYmq2ApXsDzX7*ybliFE-*zsqm=ZhUf01Y|DSzR`B8qmc+g%CW?o?`|EXuiB0`**y1u2@6lS z*=aW~UgX^G`f)WHKqOnC5RZkCu`YHXq3`B?McCuFny>)8otUQ;2H3RgsS-?0grwfA zVtb@IhGGG5sW_|MpAsJ!uDcM1Iq;Jqq5c4lvsMfP_gmvv-dDCn-+pMF9#MG|dU}_< zcP~i6>8XUj5THF8SeF&q?WlX!xV?bv3Eg7~XkKVB&3~FKOOt zQ(*wtsiVhIc+Bulne{y-y7cccxh{b|^Qv(vvi?ygzGOQ5aWe(nrRz)yJFx;<;lL5! z?lL9Wu8rX@i9$cbx!wS=Sm7yYnOyc&Q1DG^!0;*S7!>ni)6gjR{n50=9^%@$Xeb5! 
zSD{O??E~OZi}sZctmUXPA89PHTKrnc6sqK8;$Eui$V?7D9F-usDRAeT=OoEcXElH- z=E__@FKP9D)1Kj|{Jyu3O|cn2?~c5~!0LExw95K?XMqZ6z58&Qrs=8jd@9osd@_uA*N+q6Eo)1rn!S!^3vV*EZM*y*m z+m|O>PV~TN(i@05iJ~8@KbvV*qpNX7$EMtAJIN8@OVH*RmR^xeQLuO@ou(oI-eds^ zNmQkK8@y;UdbrKEhJBhTyH-gsB7c- zsSu3|M_w~|fm+TCd6qK$zF}IV5`e1Wqn)J}Awdb@k0+YqURpgN8dAN-A$1gR|JNB@vb94I&@nn0onDqLGnFutJb^ta4T9K1HU843W%XC*at)sKEPQF zuizN@{U`*lohadDIG9B<3s-b3K>lSav>t0@;hj|}_7fei%K|KYWyD1)-TwN?e$^;C zRBx|@-?T_IMY#!p_B;?%c~>B*`-$cgw*yVXyIo`OCQlt}n&@nRxY%B;W4HUl!gW^_ z1PjRId+|%Hj5J;CvAu6FlB|pC{w)k8l^j_`KN-t|9anpKT&<=rA&g!N)k?|Pqn>YZ$UlWg4*KMQC;W6`h~)Pk6{jey3f zSM)YOH22L#%1{*!6hI`2|YYDkSLL*UpQfM5bP9W5T|a4!LpS{ zZ`H;S@Otu!{%{;mmvTVICnbVP zrf~GYB;!aHW~$N3cG)f|I4^QQT%9CbRftV1w=gE%Yq!b_=*r`HfSyL(c&;@nJCl#g zB^UIbR=Zu^*RU?;GS+Z~Q)9uVe9Z}*k$Bf?2^K10j{-Icl8k!WaC6NlFF3oC z{M&5=P?U5teEQc4=1_;I(=o4{4B|ww=}@~I4>6EAW0;`l6ZxnJscWe3Nm;Q_g%3XBA#0NE{as)y5jIw@|a z(`=R7*N?5>$-IYoeb`%@4puXqmj~?zG`Px>malI^ob&x{>EcE@5GvEAPjRxGlK35y z(YdGuY~DiJ+Z61@!wZKca(U5rr7XqP4;xR#$!OMquxm#OA7N?@r53HV*NlSHNge4) zPu9Eo;+-$h_P@UCxFB=kD}fHpSMm&kBqJfYo`1QbVn-Y`^;zKZGA{Of~E|R=BG*%1XAH)3wPC3S7tJ(p4cJS`gabJ*UEueKut% z?_$8qmk=IG2Y&A~$VY1>YgzxTE^~72Q}^%SGWDTY3FeP}G4e~tEPE&Y37r|$9~7x< z>upFk?p8>+-=p<1zG;X|4p?(C9I~dB?+GCIfIymRdQ3m!n|`yFg8{4jE{2ms=~XP7 zFCqn>=Pko@%9?3HJ*4J_!?K*V8NMWBIS&%Brn!Uq+W906@5Xi69+*b8V+QWmZF>wEc8WG5-}-a@YIRgm&smeIuLt3@HQ>3q}(eVTf=@1F4B zL#o6O^r}v_!}$wJj{|xBZYB#iS92kT6^^;VdyYu8N%@W!GK6M-Y4xoa4k=3O7R|+! 
zr~TbTSM(p|C1{wy3GhmQ`j?Ls@FRq0WK7{auGvs8m~Ue< z36Y`g{cyL+1VR`F!W}X%z)v82V-_^X5u%m}IJxrNsnhu=QE=*SebopJxj;C9yupmhrvk5%5FR zvbnVP1)W?`?L^RKl##U*&F<}6VVo+%d5Yv>OI)|DxcKVLWbHn^(m-_D&P<0a2P1cn z(W$CQoh$H@LrL~)S?EMK$szl$jHQ@`m*J9Gw*SeZm8MG^`RUa{BAK@AOnBo7bws7R z-@2`{*9=+tmXjg!k#gd&xJ{@aI>~}9UL1nD&URMu9fuNPJ|5zymJ>x4 z6&{46%cPAk0f)eFl4Tcv45>X|4j$0kZBL+7ym+|cj39+{rou?=w>)~Tcr4pl&;A@} zG}!0k;e~%Li+j|j-Q>YL zlQ`+|LVroIm{u;YFiXqI^d~8VGaGXz&13R?n9+y`iLY;YcxAG1qC{0uz3v4b-|uc+#YQ ztabvu_-9F$sqOyGd8RcTo4y0e_i+C(xrbsfCS>$30BQqEYxY^5R47GKxb9(**> zT<5^ZqGaFOA&%jIK45jRu7+IpyRz4g!#_N>MX4UA8FpaN3QmG`^1ZP(8ytS>=7Z&E zz@>@_`&sLh0U4`%oR*&h=}X__sg~{@uzH>LIZrI8ZB4q&nVz%lboWxIvs*lhmzjOrA+oZ9Danb>J1mD6Ftyh?{mR7^0rhW6+7SFHu^5D66qd( z&=5!qFJ3GnfgrM{y(^5KGw9K{Ra=0+eRGB0C%I~nrwt#n#8$A(mTj}?j zUj?8fu`>>}O+B;;Tzdr6uSe5Et&wMLI`S`pE`rkHmaxg@pH&g;FCIohAbHyxq zECf>r)Byx8(1uu{P{2y(r9L2a5WyH*j<3`!h~uUnxMJ@KsB`tqgBnx2?Rj40>9G-(OKAaJaDI82f^0_n)Kt zgp{D_HMm;=*3`9~h>YS#3Q=gh{cB>39_ee%+oqh_CfOD;JCxhLZ)<9@>TWs!=R{g! zjmD?;*~gtz?U??8G~%28_ObY1v2VGoud6E#m(WDa&s}m;T=nV86|4+~^B3t+LrAdn zY$YqYdRXN{^*n{d8D+NBTmcgh)U-m=CSU@R4w!%tDwKp=Tb%4=*KAy_>&weuZePj4 z2qa`U;9e5q(+`0t075`H8hZu4t>rJ!G*+dMp#!mqi#FO!2lVL>>co93ct}0)LU8^w zE^x3)U%joyqcsY2KT&40$>bmLZ328Mfa_dePNZ98Lw)0I#G9)l0Z_BE@Sn&-!H>AR zjk{4WwTaS?yGfle9Wm6z>};bD`h6VaYBv%#h(&e3(e7{z_Bj_4F;qn$=-kbjezQrZ zp8xEu@i?;@sM^r3x6Kfd$51fg_Clti{8vY3Yt0=asVS))){E4xT703$KwIuDa8I%| z&8yng98^~qbwV=BajRIgJaF*P7=$#?&EYp*?3uPpV##9E*S|5o`RM~e?MB(jbXC@@ zz(gc)#9;-!1Aq=}28(t^nH$wwEKf^~H*H98>@k3Z$oI{8in#3uP|h4P3~wR0a8Ydw z*~n{Mogp*%+0MBSvsPrP=;Yf%?+YYgfn^T=8A9+1tBcAh_B;Z=-+h@dmnR^ukMH7> z^kFTHb~YBP#DQvs8~Oq@E3&2Ffw^bA_%jbt79a1Wg&iG8s69CC)3BVvd}U6UKNOx1 znf7sxr=0K~&vi}M*Z2}RQ?{&+z4uW=*d1E!HcaSFm7#E+>Y~SIy4-yY@@m7h4|`aP_e74i z>eHmgG5l}SOSn#%S!oJ!xvz7PCGRfq)~P@C5!F6NBy2h8MNJ4!ce^jL&hdjUowP{J zRDabDO2iMQQ(x8HiNE)HCt#yW=QonGz{IDfuzJ@d+GKzWU{2>;T*332h~JM~JMeA@7p|aPvA9(ld^^t<#ByObn zyB*Y`RPjxzuo5VceA3>c5O4-ANGZzEZwZ@#84MUdATS9^ 
z1n7DFLxe+M3J>YO8Uz|;OITifB71!UIv`0j@|nKm5!E#(vg$#(V%%0B^JHSv{ntWmG`XpZQhc`56!&Y4r{x^gw9OKxLpt5lCgZmyhR<>d53&5 z;8Aghf7w-uCbcA+$G{y>)+yW|DIM@uWc)X%?4>b103#5hvBPJ?W#dZ&S$ZDDG6ieMYceMVw0woE=nkHT z%t-C?*huvep30;@dDAuDoZ1%A!!BUtvr*R-?l70M+3teB_$*2(XcmWl4}m@%=*3A> ztxq3JCAivODljRT{OFuAYhz;)N)p|$p3tz=r=xOyciAk6dYWBoI;jByg}ae8zAB%V z{te=eHN)+iPyBI%C0?jaw?x=dO`6`L)cUeoNr8PYC;jr+C4#8ITRr42h+ADYL8yB> zCU%;=X{@CI;aNztrrODY$26oy@O+Te+U7HwKq1r!lhxwlshUiivo)(yn`f8ccQEu8 z$V%iGG<1`1xoTzhD?Llva;&(?6LkyhcXHST;+FtpGgS>r=@chdIo_(Tx3yYLFM6`L z0|@SXVX}zt{T3%9-Fpo`PDPi;gCECiYIf3ph1SzJggIDqzH$T{E!uYcvx7VPd-uLc z6F7a0!2v4HrFvzH_{>)w)F#(w9CT~SV#!1qU8xOJ&rhSB+8R||{bKj!qAF=@_yJT( zN}($uaI1%_kS1RP!m_F)M39ZLSBwEB=V*B$gA&3m|A;1`ca}LLAi(FYeDkH0Xm>jm z)-N?Mvef;vYCW%dQ`NFBXt&wn#5XzPhyoEZR|wvGY8+CZusu=Y;;aLmGx}+{SFFDl zT#_t~EXwr=cH3Cgc%7GKw(Q_HUJul{GNCL@o1frZ8S@CA^iqO%TWB-8&oL~Q_AR!dUW8|30^=4lq*2V2TEAdUYs3(W813gU6US#mn)(0 zBw2a{&wsVgGa0>436VIA#WkBLdQD5BlD7{D4IfS{Q>G$i)ib(FL}@wgg`?&?Sblpr z=iob*dL^?DAB8nj(ZPioqght?uH?)^jKZWtJVHvGC6{|-y!elqEctxzc0LY#xSEOK zJlrjb+NW^sRZ_=HGGu#`RHos2R5=kJ-p={e0#i;v5A4{bkn+~2A4c!^RRj4oUYv|1 zJvyN9A}mFqzQF`SM4OIl$-}p^&%=?ihy4f{PG=zow_JlEIr*TT*0*DHoNCANRra}& z8N|BQa(bK5(V9Z7g9eW%hg*JBNfMwOe^St^_+EISf*5;;0x$x%sV-KC3|sbXA9g;F-7;gI2-*_-_?g29eQ-d{s_eLs zW7G7gOWvm}yptu}&7>@EG8&V9q$BddablgfrBR z6$j>z<%V-2$4psk5ipu(nN%>JC3q0Vbby+x$tOPC@|2N{!3IRim>w?*D=IMUIS|{* zlQNV;!J+y=#IQ4qjnw+AYP}e4+*x>l1n&cn6^gAVILBeqX%-_?*}(o^q}9%7zTwBd zJXA3ovjL(nzMfg4JJFaE zJVyK@KQ48^RPS-C0)Ku@b8-HNS_z&(UX!;L6Qb`%3mLXifm9mrc4}^!>CkAqHIRc7uEAphIgsO0r=dqQ5LV>V5CBXM zKO^bTfiY?(e?q!)?F~8s^m}IEHAm>Pg|o|}=V@&i%wrqEfGOm^ksVGrk2)mWWIeAt zP-C(l_13hJ67U;iRBVRrfaG2)+vdYOSvJl7!F8?^UtCAjA{c4?43xjfkrzJ~-aw;& z;jqTWc?T5JU@*oqmNdhaq7B!$;b`-h)+xGA%?(HQG&~QYaq%o3*LbC5ZqV}M*|dd_ za9^ltCg2uGX17^Tn<><&OdIjwRuzx))2;tOXx)g@M?d(>)WV06Bt>oGuP4fx?l0YT z0~!aAKhoe2!ll;Q?;zW%Zvi?La5BcmQFMtO)Q%-)3>?_=qJ5sWxirs25pYUoi(V{U z>_c1XHx>tZR5rXm3=h9$clq=7XObL#x@LR%Js8QrjC{9;>lM%)W3&(<$`is_&1YUI 
zU%9Li?$!sp*?hT4D#l)^^UIg>yD5{_#-J#I!<{P)Sn}K!726IbO{>yF;G=Br0v2Lg zMR2_D(-uAv&=mU1PChnPvj~rW^Sce~F10Cje&V@UuG$Puy(oa+z5KAw$wB^5>xF~c zlhC^gxE*)guLB|Maz)uh7qX{33qKD8eD6Ath;yr?w|$&r8s6I5*wu^n$0z zBlm|^n=vD)*scS{6+)ctz*!&A&1skP<9X=8sG?lUM);|#Ukc&o%;ju9XOv!wO0~i| za&x<~Vh{%uHjbaHuRgFpRt-3@c2{B&+Bk0J5%?S?$}H&_7F$XqYSn8EpyEDP8-)Uu z7f{&YC!6m4p-RA!VCa1U=yHaVsoJ)dP(|#}tjJ!EV3V!7@RW=}fI}d_qokTD8sF^) zr~9-^w1-hmliE9)pJgKZ@g*CTj==IHwS+qcGW7={=VM9naeI73!gJ{L_ITN}G(d=Q zB=xk?R)%)=fp&65^R|d)AajS+{t;%CVjQL zmhWZTda&<>Z01^os78M8Z+vb0t>)3YLRwJZAyZb@vro*EA1t~PpIC#O>6U3O!~)sg zso$F6AaQ>x@y?;Y)0wY-1E*u<^ARWUJ2?g)!90a-i>;v%x)N81JBqhk*r`X(Jlg_L z-Rs2;NFOO9_{Tnp^d!D|0DT;LM_26BB4?gGm*%ruQ$Ox6ykp$?v-^{@%2vx3bjsnx zPrL*rrePArrOI+?e;Z2~vNgzP*c)AfRaMygjf55v10#$#&d}+0@pwTH&9%!sVFq=7 z-(?dghY6%OP>(Ce1M9wk1D0*PVG2u@hnSS}=Z7IJ+j3=vlD{7f8<3EOAAW)x!9d@b zBQ?f`0Mm!VK0HG{MUT4~hz3#*N$bO9eXd%BZbbiztVYAb0RG4mL42OErLYeI*ISAL zaBal8ma{>5(J-|Rn%KM~ny3BEfXTKZfT;aC$?}JaMn;|K+9Rw;v^Hirht!m=%Es$a zG`qVxmE>aden*ty3+EeeJmuCvTS-+zdU73CL7^8ZZ$hdcTmu&uCnsG>uj1Zx;H>XF zaJA=8HY4|D^A@F`XSGTAbZTTLuH=TGXG4l{`G)?L{mbuWhvqJuzm_6hCg`$;&}W)1 z*6mp@l)DRgAp5xzwGQ={8=yhM?#d$;A0O$!x@{^B@sl42Sar5kOOwQGwiN%~ysvIu zDywcYBWW`CP^*a}U$;nHtizktTJh69{i=F=dDWq&bGbHPP24T>3;VHdyAZ8xO@WGob2yDQNwNnqO7y+n7P$@w9k z?Rp-bt#47lKi~9BpY+irrS>&uW;6_ZEy@XHymCgZPL}md&Rx+ox=iS9QTT#Zv~ytg zUi}etjT0pePn#Gng)k$jR>N?O_I4px8+*(LuF?n%f=z9B*;7O(r7q1`$>B5I=a_u} zU*ZhC-Y1tmC-|f*+Q<2Y#HQ$klV`TmjTz&?sePgzbDF67&xV$O5HC11vJ+IztPWYv zh;vg_N?5Jg$%hT32*~R^fG zONUVjVCj7V^B(BiLkGZFX0->2b%VAAs-TREm`MCb-Mmyj$d3{DG0iB^T&4yV#MmQQ0^oHsQn;EpdWq zuf|);_;#e7eu(SWPL??o{__o6zNi#VrD1uDe3KDSq9-cT*}>>)C_JFDi8oX2iAB8~?B%5>YFohXNZa&xoa#BYRW4GuLjO z-@iak+sv}P)Thj|Vqr&azR2w^Dv5K56M$i7X?9%qYdam$woTki_B5gVZs%Mu^%K4( z*<~HU<##_*Jh5t!4Vo$^b)c(0=nquC!lgydMr6rkA!6uHI@Ngq^!iKj00ap7|6wNK zrKvelxr@xFx~QL%T>?n8>DWvy48Y+_<%>UXxr2@bPnS($eLP(tz-fg2+PK;SFs>Gh zo|yv5fB698>hIS+$W7R&BAVMnf)uXZ^st&QDiO}IOj|KM#yn+T#krL3Q?OImx 
zH`IGe-oTHt0PeIzGL1$`ng<@j84bZx3Iu2TF*fcDwUz&cN1jLc?ZUe>-oPo>H&Jx{0`xk>^%nL3?JvPot&n7c*uc?=ik~pn1B5t=NbJDMy47rx) zb*`gj@;)*>M~y@LaoYEj;hP5|T~u{ufwUzZB>7&M4^K8*P;L03L)K!DpFC@L^`5;- z0v#aP@;_mzNY2^}RHC0QaRMBx+4OG{zaCqC^+TDLY&&ChEZZ zk=f;-u4Zc=DSIH9ly{8Ny_KkA<5afn8EgpXz7mX6{P4xO z&TPqwT&JxIn);i3$vRyt7TXA_VM_RG1kbzZ z3edp@Wq`+{oLaNzZndMc`1}n`kfh3Lxv5x;4)EN9O$;w@Jf60hkD|BkxX~QE!T{Kr zg9%lQ#8B8>u?m-xbs7hs8C^Y3x+_ca)h|Qiq$c~-Y->QO1cO%FVG`0( zyO{(xIxhIb3OR~xf2+cN@SBBp5;5o}>gF>^aQ1XAD+aY;9SI_c$)rrd?*%L)#Zv!= z9PByyuG5#ccQhh;5hj4NfK)R?p9B+Ptu zPvXSIlMP8l70|;~~R%m25cOpBR zx)lJT10+;bBEWpkAzm*g2!=N-1e{FA4_*Wy`eu{`GgAQJNQd&PP zM8BH2bvbuM4Vk4WXx4}q{$3!C{^bF80_1HWD~$g{q@wR0)p-@fs-x7T<<|d*25dxN z&XJx{Ye_ZlnZ5r~7U=3SZOKmy$B~7g?qu;~f-7;fXP`Il#%k$>@rj;$Ek4)8QX#qB zuq4cncQB}A>ZL**{Z50l%QP1t(o9$UQV|@AoHKbec)nd9#fNuJ-BAHH6vB7EXX|xD z^Ss=!)HY}pT!um2SDfo_jBmB*}^6<3*!Q_=4&#B)t*vGZN_s7>Dido;q; zXWzBEh$v6K3B^X^VsglNK$Ua(9epYTZiqH&4t?=KH8olU`CWFXz3I|U!jL4QNIZLxk-zpuNMP`uciR3j)1ys z*FYN_?-<^Zzr(A5J^FQj=-_dD+BJuFvHn$K$IWNC+B?Ob4v19RaHIp{*Kr^z`~ynT ziVtF}*)|%Nb5|vrcTeA_)ZUlJ3c-Oc8hfA}sDXlc>#Qh2PGu9PdB4+LyRNt^#UKNR zbY^e<#t5;mBk{K%uqv~lefRwv&A?q6jitmmtc0g#clseyAp9rA(pVd%YJ#iBBT zbI`N6Wi@PYH_7b{FRo7DpWMz^DbGSTK()%}N;lWOGs<}WGY2YN&w+kzx3D@D0Tyw@ zkRdJK6*0(QbD&TL(1`wWx{*@ydz|4PPal+1Au-%{^p@?~hi`nhOemx92g%`6?S;vIaDn-9H+eau zr5B9}$X1El{2YGM>W|E-;JpCfA*%k3 zn~Q}4HlMb7faVPN;wjYU=ZlsXD=8Us9(j2t{|DFfcR=Jkzq$jtVd;~&Nf6Q6m@odo9WYXIW(3od!ciL+dZ9xk8rEM}-G zlYEcrPUdN{ox>AFR~;0_@P3O2mM&j>)h%)b1>vSmJk{=T6V@rrb-!-C|J==2D|FW> zvi>c0=HWl1RqcAT##UwBS!xqb3!nNC&v)Ty$1sUzbKUg$vU2d0F|?vzdeUo{j*Ml& zMY!ED%<5o>${aJrQrdp}^@w+Qb!qhVAsehbxL3CQ@XL5Kf&a;c@_8XS@1lNHM^62D ziLDCbfvc>;a{-j-Nn%8KDm_Qe9NJW1kUlk}XVUpc=hl3){{XA{mCW~3cD?lV^*e6z zvq92NR-fm<{gy}G!rn7gAIaDg6?q>eTQQpM%dSWN-iNVe(j2&U5vD2g^O*!c?5j>Z z3S(UTRln^_Eh`)?=ve0craed1r3;_#=;b2_2M$Hz>6cf>KY-+=&5BXa=eY%DZuotW zqAA#Nj=|(vH1}!>W1v6-bDHoteGYHL5=#=T%^bL#ciA?plr>fiH%05*UFJJ+R0Qu% zJvHY5)X*x5hx4+m<`aI?m(EsXdT7bit!9TrBCMoIp+3-hg~Rro&9D2^uCq=Ei;Gr` 
zRiTUUa(W;P+fRhv#_(Vn2W)FZYzzbe`MiRXx-0UdIrFp6SP*6Gvk1}XP59Nm1b5SN zlT22d(*-}W>3p$#`T?*T`^5=J1%Uy-aR+O~@#y2)V?kDvX3wp1Q}I7r6OGcj3NvfU zTYCL575=jO|B93HeI15?QNIYfh%CImg!-gAHt(I@f{g`9Bc0fN)XMg1Pu`X_ya)>i z+p%laRoU^K=NtKh7MdLmf3!-eR+VYC#PwfZ0ZEiIP{g4@iLXlfzC?T$A(@S;9ex8AjcuA6ttaxa z{Ec5gq=Yx;`+h`N+l($%AivTjzk07{4NqMK;`A7ZQwa|)hYJe#EW9B`vCF&zv_mTT zr6f-AfQNhzA8^k}eI+Pco4_XptW=anmdY+|dN-}pG$6*i7Oj`$M46|*^WrSpEh8~B zAElx0fSfycC`99*4UEix^57(LD|RvSJE};npw7%az}IcKIvP9;__|%Mbc|7YB)C0{ z)QC{;|D10Aw&uwv61`S1{5{Axk^*H!fB(Uze;xhbA7(LhMgM8%bNy`v*$Bq}d=ESN s-=#Yw|5i2pZ*;Hzf4?AcChF?uJ0TkW_Y}o}H-NtvvdS{WQXfA59}BFoegFUf literal 0 HcmV?d00001 diff --git a/src/External/Directory.Build.props b/src/External/Directory.Build.props deleted file mode 100644 index 4536bdf..0000000 --- a/src/External/Directory.Build.props +++ /dev/null @@ -1,8 +0,0 @@ - - - enable - disable - latest - false - - diff --git a/src/External/RawIntrinsics/AVX.ManuallyAdded.cs b/src/External/RawIntrinsics/AVX.ManuallyAdded.cs deleted file mode 100644 index 0a7ee43..0000000 --- a/src/External/RawIntrinsics/AVX.ManuallyAdded.cs +++ /dev/null @@ -1,25 +0,0 @@ -using System.Runtime.Intrinsics; - -namespace RawIntrinsics -{ - public static partial class AVX - { - ///

- /// Return vector of type __m256d with all elements set to zero. - /// - /// __m256d dst {FP64} - public static __m256d _mm256_setzero_pd() => System.Runtime.Intrinsics.Vector256.Zero; - - /// - /// Return vector of type __m256 with all elements set to zero. - /// - /// __m256 dst {FP32} - public static __m256 _mm256_setzero_ps() => System.Runtime.Intrinsics.Vector256.Zero; - - /// - /// Return vector of type __m256i with all elements set to zero. - /// - /// __m256i dst {M256} - public static __m256i _mm256_setzero_si256() => System.Runtime.Intrinsics.Vector256.Zero; - } -} \ No newline at end of file diff --git a/src/External/RawIntrinsics/AVX.cs b/src/External/RawIntrinsics/AVX.cs deleted file mode 100644 index c10c3d8..0000000 --- a/src/External/RawIntrinsics/AVX.cs +++ /dev/null @@ -1,1336 +0,0 @@ -namespace RawIntrinsics -{ - public static unsafe partial class AVX - { - /// - /// Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst". - /// - /// VBROADCASTSS xmm, m32 - /// float {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_broadcast_ss(float* mem_addr) => System.Runtime.Intrinsics.X86.Avx.BroadcastScalarToVector128(mem_addr); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". - /// - /// VCMPPD xmm, xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm_cmp_pd(__m128d a, __m128d b, int imm8) => System.Runtime.Intrinsics.X86.Avx.Compare(a.FP64, b.FP64, (System.Runtime.Intrinsics.X86.FloatComparisonMode)imm8); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". 
- /// - /// VCMPPS xmm, xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_cmp_ps(__m128 a, __m128 b, int imm8) => System.Runtime.Intrinsics.X86.Avx.Compare(a.FP32, b.FP32, (System.Runtime.Intrinsics.X86.FloatComparisonMode)imm8); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// VCMPSD xmm, xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm_cmp_sd(__m128d a, __m128d b, int imm8) => System.Runtime.Intrinsics.X86.Avx.CompareScalar(a.FP64, b.FP64, (System.Runtime.Intrinsics.X86.FloatComparisonMode)imm8); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// VCMPSS xmm, xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_cmp_ss(__m128 a, __m128 b, int imm8) => System.Runtime.Intrinsics.X86.Avx.CompareScalar(a.FP32, b.FP32, (System.Runtime.Intrinsics.X86.FloatComparisonMode)imm8); - - /// - /// Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). 
- /// - /// VMASKMOVPD xmm, xmm, m128 - /// double {FP64} - /// __m128i {MASK} - /// __m128d dst {FP64} - public static __m128d _mm_maskload_pd(double* mem_addr, __m128i mask) => System.Runtime.Intrinsics.X86.Avx.MaskLoad(mem_addr, mask.FP64); - - /// - /// Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). - /// - /// VMASKMOVPS xmm, xmm, m128 - /// float {FP32} - /// __m128i {MASK} - /// __m128 dst {FP32} - public static __m128 _mm_maskload_ps(float* mem_addr, __m128i mask) => System.Runtime.Intrinsics.X86.Avx.MaskLoad(mem_addr, mask.FP32); - - /// - /// Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask". - /// - /// VMASKMOVPD m128, xmm, xmm - /// double {FP64} - /// __m128i {MASK} - /// __m128d {FP64} - /// void {} - public static void _mm_maskstore_pd(double* mem_addr, __m128i mask, __m128d a) => System.Runtime.Intrinsics.X86.Avx.MaskStore(mem_addr, mask.FP64, a.FP64); - - /// - /// Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask". - /// - /// VMASKMOVPS m128, xmm, xmm - /// float {FP32} - /// __m128i {MASK} - /// __m128 {FP32} - /// void {} - public static void _mm_maskstore_ps(float* mem_addr, __m128i mask, __m128 a) => System.Runtime.Intrinsics.X86.Avx.MaskStore(mem_addr, mask.FP32, a.FP32); - - /// - /// Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". - /// - /// VPERMILPD xmm, xmm, imm8 - /// __m128d {FP64} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm_permute_pd(__m128d a, int imm8) => System.Runtime.Intrinsics.X86.Avx.Permute(a.FP64, (byte)imm8); - - /// - /// Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". 
- /// - /// VPERMILPS xmm, xmm, imm8 - /// __m128 {FP32} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_permute_ps(__m128 a, int imm8) => System.Runtime.Intrinsics.X86.Avx.Permute(a.FP32, (byte)imm8); - - /// - /// Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst". - /// - /// VPERMILPD xmm, xmm, xmm - /// __m128d {FP64} - /// __m128i {UI64} - /// __m128d dst {FP64} - public static __m128d _mm_permutevar_pd(__m128d a, __m128i b) => System.Runtime.Intrinsics.X86.Avx.PermuteVar(a.FP64, b.SI64); - - /// - /// Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst". - /// - /// VPERMILPS xmm, xmm, xmm - /// __m128 {FP32} - /// __m128i {UI32} - /// __m128 dst {FP32} - public static __m128 _mm_permutevar_ps(__m128 a, __m128i b) => System.Runtime.Intrinsics.X86.Avx.PermuteVar(a.FP32, b.SI32); - - /// - /// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. - /// - /// VTESTPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool dst {UI8} - public static bool _mm_testc_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Avx.TestC(a.FP64, b.FP64); - - /// - /// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. 
Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. - /// - /// VTESTPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool dst {UI8} - public static bool _mm_testc_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Avx.TestC(a.FP32, b.FP32); - - /// - /// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. - /// - /// VTESTPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool dst {UI8} - public static bool _mm_testnzc_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Avx.TestNotZAndNotC(a.FP64, b.FP64); - - /// - /// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. 
- /// - /// VTESTPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool dst {UI8} - public static bool _mm_testnzc_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Avx.TestNotZAndNotC(a.FP32, b.FP32); - - /// - /// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. - /// - /// VTESTPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool dst {UI8} - public static bool _mm_testz_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Avx.TestZ(a.FP64, b.FP64); - - /// - /// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. - /// - /// VTESTPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool dst {UI8} - public static bool _mm_testz_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Avx.TestZ(a.FP32, b.FP32); - - /// - /// Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". 
- /// - /// VADDPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_add_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.Add(a.FP64, b.FP64); - - /// - /// Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// VADDPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_add_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.Add(a.FP32, b.FP32); - - /// - /// Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". - /// - /// VADDSUBPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_addsub_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.AddSubtract(a.FP64, b.FP64); - - /// - /// Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". - /// - /// VADDSUBPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_addsub_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.AddSubtract(a.FP32, b.FP32); - - /// - /// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// VANDPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_and_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.And(a.FP64, b.FP64); - - /// - /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". 
- /// - /// VANDPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_and_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.And(a.FP32, b.FP32); - - /// - /// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". - /// - /// VANDNPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_andnot_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.AndNot(a.FP64, b.FP64); - - /// - /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". - /// - /// VANDNPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_andnot_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.AndNot(a.FP32, b.FP32); - - /// - /// Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". - /// - /// VBLENDPD ymm, ymm, ymm, imm8 - /// __m256d {FP64} - /// __m256d {FP64} - /// int {IMM} - /// __m256d dst {FP64} - public static __m256d _mm256_blend_pd(__m256d a, __m256d b, int imm8) => System.Runtime.Intrinsics.X86.Avx.Blend(a.FP64, b.FP64, (byte)imm8); - - /// - /// Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". - /// - /// VBLENDPS ymm, ymm, ymm, imm8 - /// __m256 {FP32} - /// __m256 {FP32} - /// int {IMM} - /// __m256 dst {FP32} - public static __m256 _mm256_blend_ps(__m256 a, __m256 b, int imm8) => System.Runtime.Intrinsics.X86.Avx.Blend(a.FP32, b.FP32, (byte)imm8); - - /// - /// Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". 
- /// - /// VBLENDVPD ymm, ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d {MASK} - /// __m256d dst {FP64} - public static __m256d _mm256_blendv_pd(__m256d a, __m256d b, __m256d mask) => System.Runtime.Intrinsics.X86.Avx.BlendVariable(a.FP64, b.FP64, mask.FP64); - - /// - /// Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". - /// - /// VBLENDVPS ymm, ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 {MASK} - /// __m256 dst {FP32} - public static __m256 _mm256_blendv_ps(__m256 a, __m256 b, __m256 mask) => System.Runtime.Intrinsics.X86.Avx.BlendVariable(a.FP32, b.FP32, mask.FP32); - - /// - /// Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of "dst". - /// - /// VBROADCASTF128 ymm, m128 - /// __m128d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_broadcast_pd(__m128d* mem_addr) => System.Runtime.Intrinsics.X86.Avx.BroadcastVector128ToVector256((double*)mem_addr); - - /// - /// Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of "dst". - /// - /// VBROADCASTF128 ymm, m128 - /// __m128 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_broadcast_ps(__m128* mem_addr) => System.Runtime.Intrinsics.X86.Avx.BroadcastVector128ToVector256((float*)mem_addr); - - /// - /// Broadcast a double-precision (64-bit) floating-point element from memory to all elements of "dst". - /// - /// VBROADCASTSD ymm, m64 - /// double {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_broadcast_sd(double* mem_addr) => System.Runtime.Intrinsics.X86.Avx.BroadcastScalarToVector256(mem_addr); - - /// - /// Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst". 
- /// - /// VBROADCASTSS ymm, m32 - /// float {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_broadcast_ss(float* mem_addr) => System.Runtime.Intrinsics.X86.Avx.BroadcastScalarToVector256(mem_addr); - - /// - /// Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". - /// - /// VROUNDPD ymm, ymm, imm8 - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_ceil_pd(__m256d a) => System.Runtime.Intrinsics.X86.Avx.Ceiling(a.FP64); - - /// - /// Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". - /// - /// VROUNDPS ymm, ymm, imm8 - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_ceil_ps(__m256 a) => System.Runtime.Intrinsics.X86.Avx.Ceiling(a.FP32); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". - /// - /// VCMPPD ymm, ymm, ymm, imm8 - /// __m256d {FP64} - /// __m256d {FP64} - /// int {IMM} - /// __m256d dst {FP64} - public static __m256d _mm256_cmp_pd(__m256d a, __m256d b, int imm8) => System.Runtime.Intrinsics.X86.Avx.Compare(a.FP64, b.FP64, (System.Runtime.Intrinsics.X86.FloatComparisonMode)imm8); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". 
- /// - /// VCMPPS ymm, ymm, ymm, imm8 - /// __m256 {FP32} - /// __m256 {FP32} - /// int {IMM} - /// __m256 dst {FP32} - public static __m256 _mm256_cmp_ps(__m256 a, __m256 b, int imm8) => System.Runtime.Intrinsics.X86.Avx.Compare(a.FP32, b.FP32, (System.Runtime.Intrinsics.X86.FloatComparisonMode)imm8); - - /// - /// Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - /// - /// VCVTDQ2PD ymm, xmm - /// __m128i {SI32} - /// __m256d dst {FP64} - public static __m256d _mm256_cvtepi32_pd(__m128i a) => System.Runtime.Intrinsics.X86.Avx.ConvertToVector256Double(a.SI32); - - /// - /// Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - /// - /// VCVTDQ2PS ymm, ymm - /// __m256i {SI32} - /// __m256 dst {FP32} - public static __m256 _mm256_cvtepi32_ps(__m256i a) => System.Runtime.Intrinsics.X86.Avx.ConvertToVector256Single(a.SI32); - - /// - /// Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - /// - /// VCVTPD2DQ xmm, ymm - /// __m256d {FP64} - /// __m128i dst {UI32} - public static __m128i _mm256_cvtpd_epi32(__m256d a) => System.Runtime.Intrinsics.X86.Avx.ConvertToVector128Int32(a.FP64); - - /// - /// Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - /// - /// VCVTPD2PS xmm, ymm - /// __m256d {FP64} - /// __m128 dst {FP32} - public static __m128 _mm256_cvtpd_ps(__m256d a) => System.Runtime.Intrinsics.X86.Avx.ConvertToVector128Single(a.FP64); - - /// - /// Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". 
- /// - /// VCVTPS2DQ ymm, ymm - /// __m256 {FP32} - /// __m256i dst {UI32} - public static __m256i _mm256_cvtps_epi32(__m256 a) => System.Runtime.Intrinsics.X86.Avx.ConvertToVector256Int32(a.FP32); - - /// - /// Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - /// - /// VCVTPS2PD ymm, xmm - /// __m128 {FP32} - /// __m256d dst {FP64} - public static __m256d _mm256_cvtps_pd(__m128 a) => System.Runtime.Intrinsics.X86.Avx.ConvertToVector256Double(a.FP32); - - /// - /// Copy the lower 32-bit integer in "a" to "dst". - /// - /// VMOVD r32, xmm - /// __m256i {UI32} - /// int dst {UI32} - public static int _mm256_cvtsi256_si32(__m256i a) => (int)System.Runtime.Intrinsics.X86.Avx2.ConvertToUInt32(a.UI32); - - /// - /// Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - /// - /// VCVTTPD2DQ xmm, ymm - /// __m256d {FP64} - /// __m128i dst {UI32} - public static __m128i _mm256_cvttpd_epi32(__m256d a) => System.Runtime.Intrinsics.X86.Avx.ConvertToVector128Int32WithTruncation(a.FP64); - - /// - /// Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - /// - /// VCVTTPS2DQ ymm, ymm - /// __m256 {FP32} - /// __m256i dst {UI32} - public static __m256i _mm256_cvttps_epi32(__m256 a) => System.Runtime.Intrinsics.X86.Avx.ConvertToVector256Int32WithTruncation(a.FP32); - - /// - /// Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". 
- /// - /// VDIVPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_div_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.Divide(a.FP64, b.FP64); - - /// - /// Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - /// - /// VDIVPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_div_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.Divide(a.FP32, b.FP32); - - /// - /// Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". - /// - /// VDPPS ymm, ymm, ymm, imm8 - /// __m256 {FP32} - /// __m256 {FP32} - /// int {IMM} - /// __m256 dst {FP32} - public static __m256 _mm256_dp_ps(__m256 a, __m256 b, int imm8) => System.Runtime.Intrinsics.X86.Avx.DotProduct(a.FP32, b.FP32, (byte)imm8); - - /// - /// Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". - /// - /// VEXTRACTF128 xmm, ymm, imm8 - /// __m256d {FP64} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm256_extractf128_pd(__m256d a, int imm8) => System.Runtime.Intrinsics.X86.Avx.ExtractVector128(a.FP64, (byte)imm8); - - /// - /// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". 
- /// - /// VEXTRACTF128 xmm, ymm, imm8 - /// __m256 {FP32} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm256_extractf128_ps(__m256 a, int imm8) => System.Runtime.Intrinsics.X86.Avx.ExtractVector128(a.FP32, (byte)imm8); - - /// - /// Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst". - /// - /// VEXTRACTF128 xmm, ymm, imm8 - /// __m256i {M128} - /// int {IMM} - /// __m128i dst {M128} - public static __m128i _mm256_extractf128_si256(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx.ExtractVector128(a.UI8, (byte)imm8); - - /// - /// Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". - /// - /// VROUNDPD ymm, ymm, imm8 - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_floor_pd(__m256d a) => System.Runtime.Intrinsics.X86.Avx.Floor(a.FP64); - - /// - /// Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". - /// - /// VROUNDPS ymm, ymm, imm8 - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_floor_ps(__m256 a) => System.Runtime.Intrinsics.X86.Avx.Floor(a.FP32); - - /// - /// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". - /// - /// VHADDPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_hadd_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.HorizontalAdd(a.FP64, b.FP64); - - /// - /// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". 
- /// - /// VHADDPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_hadd_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.HorizontalAdd(a.FP32, b.FP32); - - /// - /// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". - /// - /// VHSUBPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_hsub_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.HorizontalSubtract(a.FP64, b.FP64); - - /// - /// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". - /// - /// VHSUBPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_hsub_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.HorizontalSubtract(a.FP32, b.FP32); - - /// - /// Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". - /// - /// VINSERTF128 ymm, ymm, xmm, imm8 - /// __m256d {FP64} - /// __m128d {FP64} - /// int {IMM} - /// __m256d dst {FP64} - public static __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int imm8) => System.Runtime.Intrinsics.X86.Avx.InsertVector128(a.FP64, b.FP64, (byte)imm8); - - /// - /// Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". 
- /// - /// VINSERTF128 ymm, ymm, xmm, imm8 - /// __m256 {FP32} - /// __m128 {FP32} - /// int {IMM} - /// __m256 dst {FP32} - public static __m256 _mm256_insertf128_ps(__m256 a, __m128 b, int imm8) => System.Runtime.Intrinsics.X86.Avx.InsertVector128(a.FP32, b.FP32, (byte)imm8); - - /// - /// Copy "a" to "dst", then insert 128 bits from "b" into "dst" at the location specified by "imm8". - /// - /// VINSERTF128 ymm, ymm, xmm, imm8 - /// __m256i {M256} - /// __m128i {M128} - /// int {IMM} - /// __m256i dst {M128} - public static __m256i _mm256_insertf128_si256(__m256i a, __m128i b, int imm8) => System.Runtime.Intrinsics.X86.Avx.InsertVector128(a.UI8, b.UI8, (byte)imm8); - - /// - /// Load 256-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm256_loadu_si256" when the data crosses a cache line boundary. - /// - /// VLDDQU ymm, m256 - /// __m256i {M256} - /// __m256i dst {M256} - public static __m256i _mm256_lddqu_si256(__m256i* mem_addr) => System.Runtime.Intrinsics.X86.Avx.LoadDquVector256((sbyte*)mem_addr); - - /// - /// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst". "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - /// - /// VMOVAPD ymm, m256 - /// double {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_load_pd(double* mem_addr) => System.Runtime.Intrinsics.X86.Avx.LoadAlignedVector256(mem_addr); - - /// - /// Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst". "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - /// - /// VMOVAPS ymm, m256 - /// float {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_load_ps(float* mem_addr) => System.Runtime.Intrinsics.X86.Avx.LoadAlignedVector256(mem_addr); - - /// - /// Load 256-bits of integer data from memory into "dst". 
"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - /// - /// VMOVDQA ymm, m256 - /// __m256i {M256} - /// __m256i dst {M256} - public static __m256i _mm256_load_si256(__m256i* mem_addr) => System.Runtime.Intrinsics.X86.Avx.LoadAlignedVector256((sbyte*)mem_addr); - - /// - /// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst". "mem_addr" does not need to be aligned on any particular boundary. - /// - /// VMOVUPD ymm, m256 - /// double {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_loadu_pd(double* mem_addr) => System.Runtime.Intrinsics.X86.Avx.LoadVector256(mem_addr); - - /// - /// Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst". "mem_addr" does not need to be aligned on any particular boundary. - /// - /// VMOVUPS ymm, m256 - /// float {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_loadu_ps(float* mem_addr) => System.Runtime.Intrinsics.X86.Avx.LoadVector256(mem_addr); - - /// - /// Load 256-bits of integer data from memory into "dst". "mem_addr" does not need to be aligned on any particular boundary. - /// - /// VMOVDQU ymm, m256 - /// __m256i {M256} - /// __m256i dst {M256} - public static __m256i _mm256_loadu_si256(__m256i* mem_addr) => System.Runtime.Intrinsics.X86.Avx.LoadVector256((sbyte*)mem_addr); - - /// - /// Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). 
- /// - /// VMASKMOVPD ymm, ymm, m256 - /// double {FP64} - /// __m256i {MASK} - /// __m256d dst {FP64} - public static __m256d _mm256_maskload_pd(double* mem_addr, __m256i mask) => System.Runtime.Intrinsics.X86.Avx.MaskLoad(mem_addr, mask.FP64); - - /// - /// Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). - /// - /// VMASKMOVPS ymm, ymm, m256 - /// float {FP32} - /// __m256i {MASK} - /// __m256 dst {FP32} - public static __m256 _mm256_maskload_ps(float* mem_addr, __m256i mask) => System.Runtime.Intrinsics.X86.Avx.MaskLoad(mem_addr, mask.FP32); - - /// - /// Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask". - /// - /// VMASKMOVPD m256, ymm, ymm - /// double {FP64} - /// __m256i {MASK} - /// __m256d {FP64} - /// void {} - public static void _mm256_maskstore_pd(double* mem_addr, __m256i mask, __m256d a) => System.Runtime.Intrinsics.X86.Avx.MaskStore(mem_addr, mask.FP64, a.FP64); - - /// - /// Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask". - /// - /// VMASKMOVPS m256, ymm, ymm - /// float {FP32} - /// __m256i {MASK} - /// __m256 {FP32} - /// void {} - public static void _mm256_maskstore_ps(float* mem_addr, __m256i mask, __m256 a) => System.Runtime.Intrinsics.X86.Avx.MaskStore(mem_addr, mask.FP32, a.FP32); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". - /// - /// VMAXPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_max_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.Max(a.FP64, b.FP64); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". 
- /// - /// VMAXPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_max_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.Max(a.FP32, b.FP32); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". - /// - /// VMINPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_min_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.Min(a.FP64, b.FP64); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". - /// - /// VMINPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_min_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.Min(a.FP32, b.FP32); - - /// - /// Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst". - /// - /// VMOVDDUP ymm, ymm - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_movedup_pd(__m256d a) => System.Runtime.Intrinsics.X86.Avx.DuplicateEvenIndexed(a.FP64); - - /// - /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". - /// - /// VMOVSHDUP ymm, ymm - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_movehdup_ps(__m256 a) => System.Runtime.Intrinsics.X86.Avx.DuplicateOddIndexed(a.FP32); - - /// - /// Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". 
- /// - /// VMOVSLDUP ymm, ymm - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_moveldup_ps(__m256 a) => System.Runtime.Intrinsics.X86.Avx.DuplicateEvenIndexed(a.FP32); - - /// - /// Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a". - /// - /// VMOVMSKPD r32, ymm - /// __m256d {FP64} - /// int dst {UI32} - public static int _mm256_movemask_pd(__m256d a) => System.Runtime.Intrinsics.X86.Avx.MoveMask(a.FP64); - - /// - /// Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a". - /// - /// VMOVMSKPS r32, ymm - /// __m256 {FP32} - /// int dst {UI32} - public static int _mm256_movemask_ps(__m256 a) => System.Runtime.Intrinsics.X86.Avx.MoveMask(a.FP32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// VMULPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_mul_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.Multiply(a.FP64, b.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// VMULPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_mul_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.Multiply(a.FP32, b.FP32); - - /// - /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". 
- /// - /// VORPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_or_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.Or(a.FP64, b.FP64); - - /// - /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// VORPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_or_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.Or(a.FP32, b.FP32); - - /// - /// Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". - /// - /// VPERMILPD ymm, ymm, imm8 - /// __m256d {FP64} - /// int {IMM} - /// __m256d dst {FP64} - public static __m256d _mm256_permute_pd(__m256d a, int imm8) => System.Runtime.Intrinsics.X86.Avx.Permute(a.FP64, (byte)imm8); - - /// - /// Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". - /// - /// VPERMILPS ymm, ymm, imm8 - /// __m256 {FP32} - /// int {IMM} - /// __m256 dst {FP32} - public static __m256 _mm256_permute_ps(__m256 a, int imm8) => System.Runtime.Intrinsics.X86.Avx.Permute(a.FP32, (byte)imm8); - - /// - /// Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". - /// - /// VPERM2F128 ymm, ymm, ymm, imm8 - /// __m256d {FP64} - /// __m256d {FP64} - /// int {IMM} - /// __m256d dst {FP64} - public static __m256d _mm256_permute2f128_pd(__m256d a, __m256d b, int imm8) => System.Runtime.Intrinsics.X86.Avx.Permute2x128(a.FP64, b.FP64, (byte)imm8); - - /// - /// Shuffle 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". 
- /// - /// VPERM2F128 ymm, ymm, ymm, imm8 - /// __m256 {FP32} - /// __m256 {FP32} - /// int {IMM} - /// __m256 dst {FP32} - public static __m256 _mm256_permute2f128_ps(__m256 a, __m256 b, int imm8) => System.Runtime.Intrinsics.X86.Avx.Permute2x128(a.FP32, b.FP32, (byte)imm8); - - /// - /// Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". - /// - /// VPERM2F128 ymm, ymm, ymm, imm8 - /// __m256i {M256} - /// __m256i {M256} - /// int {IMM} - /// __m256i dst {M256} - public static __m256i _mm256_permute2f128_si256(__m256i a, __m256i b, int imm8) => System.Runtime.Intrinsics.X86.Avx.Permute2x128(a.UI8, b.UI8, (byte)imm8); - - /// - /// Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". - /// - /// VPERMILPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256i {UI64} - /// __m256d dst {FP64} - public static __m256d _mm256_permutevar_pd(__m256d a, __m256i b) => System.Runtime.Intrinsics.X86.Avx.PermuteVar(a.FP64, b.SI64); - - /// - /// Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". - /// - /// VPERMILPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256i {UI32} - /// __m256 dst {FP32} - public static __m256 _mm256_permutevar_ps(__m256 a, __m256i b) => System.Runtime.Intrinsics.X86.Avx.PermuteVar(a.FP32, b.SI32); - - /// - /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. 
- /// - /// VRCPPS ymm, ymm - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_rcp_ps(__m256 a) => System.Runtime.Intrinsics.X86.Avx.Reciprocal(a.FP32); - - /// - /// Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst". [round_note] - /// - /// VROUNDPD ymm, ymm, imm8 - /// __m256d {FP64} - /// int {IMM} - /// __m256d dst {FP64} - public static __m256d _mm256_round_pd(__m256d a, int rounding) => System.Runtime.Intrinsics.X86.Avx.RoundCurrentDirection(a.FP64); - - /// - /// Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst". [round_note] - /// - /// VROUNDPS ymm, ymm, imm8 - /// __m256 {FP32} - /// int {IMM} - /// __m256 dst {FP32} - public static __m256 _mm256_round_ps(__m256 a, int rounding) => System.Runtime.Intrinsics.X86.Avx.RoundCurrentDirection(a.FP32); - - /// - /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - /// - /// VRSQRTPS ymm, ymm - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_rsqrt_ps(__m256 a) => System.Runtime.Intrinsics.X86.Avx.ReciprocalSqrt(a.FP32); - - /// - /// Broadcast 16-bit integer "a" to all all elements of "dst". This intrinsic may generate the "vpbroadcastw". - /// - /// - /// short {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_set1_epi16(short a) => System.Runtime.Intrinsics.Vector256.Create((ushort)a); - - /// - /// Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastd". 
- /// - /// - /// int {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_set1_epi32(int a) => System.Runtime.Intrinsics.Vector256.Create((uint)a); - - /// - /// Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq". - /// - /// - /// long {UI64} - /// __m256i dst {UI64} - public static __m256i _mm256_set1_epi64x(long a) => System.Runtime.Intrinsics.Vector256.Create((ulong)a); - - /// - /// Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastb". - /// - /// - /// byte {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_set1_epi8(byte a) => System.Runtime.Intrinsics.Vector256.Create(a); - - /// - /// Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". - /// - /// - /// double {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_set1_pd(double a) => System.Runtime.Intrinsics.Vector256.Create(a); - - /// - /// Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". - /// - /// - /// float {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_set1_ps(float a) => System.Runtime.Intrinsics.Vector256.Create(a); - - /// - /// Set packed 16-bit integers in "dst" with the supplied values in reverse order. 
- /// - /// - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_setr_epi16(short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) => System.Runtime.Intrinsics.Vector256.Create((ushort)e15, (ushort)e14, (ushort)e13, (ushort)e12, (ushort)e11, (ushort)e10, (ushort)e9, (ushort)e8, (ushort)e7, (ushort)e6, (ushort)e5, (ushort)e4, (ushort)e3, (ushort)e2, (ushort)e1, (ushort)e0); - - /// - /// Set packed 32-bit integers in "dst" with the supplied values in reverse order. - /// - /// - /// int {UI32} - /// int {UI32} - /// int {UI32} - /// int {UI32} - /// int {UI32} - /// int {UI32} - /// int {UI32} - /// int {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_setr_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) => System.Runtime.Intrinsics.Vector256.Create((uint)e7, (uint)e6, (uint)e5, (uint)e4, (uint)e3, (uint)e2, (uint)e1, (uint)e0); - - /// - /// Set packed 64-bit integers in "dst" with the supplied values in reverse order. - /// - /// - /// long {UI64} - /// long {UI64} - /// long {UI64} - /// long {UI64} - /// __m256i dst {UI64} - public static __m256i _mm256_setr_epi64x(long e3, long e2, long e1, long e0) => System.Runtime.Intrinsics.Vector256.Create((ulong)e3, (ulong)e2, (ulong)e1, (ulong)e0); - - /// - /// Set packed 8-bit integers in "dst" with the supplied values in reverse order. 
- /// - /// - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_setr_epi8(byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24, byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16, byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8, byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) => System.Runtime.Intrinsics.Vector256.Create(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0); - - /// - /// Set packed __m256 vector "dst" with the supplied values. - /// - /// VINSERTF128 ymm, ymm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_setr_m128(__m128 lo, __m128 hi) => System.Runtime.Intrinsics.Vector256.Create(lo.FP32, hi.FP32); - - /// - /// Set packed __m256d vector "dst" with the supplied values. - /// - /// VINSERTF128 ymm, ymm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_setr_m128d(__m128d lo, __m128d hi) => System.Runtime.Intrinsics.Vector256.Create(lo.FP64, hi.FP64); - - /// - /// Set packed __m256i vector "dst" with the supplied values. 
- /// - /// VINSERTF128 ymm, ymm, xmm, imm8 - /// __m128i {M128} - /// __m128i {M128} - /// __m256i dst {M128} - public static __m256i _mm256_setr_m128i(__m128i lo, __m128i hi) => System.Runtime.Intrinsics.Vector256.Create(lo.SI32, hi.SI32); - - /// - /// Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. - /// - /// - /// double {FP64} - /// double {FP64} - /// double {FP64} - /// double {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_setr_pd(double e3, double e2, double e1, double e0) => System.Runtime.Intrinsics.Vector256.Create(e3, e2, e1, e0); - - /// - /// Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. - /// - /// - /// float {FP32} - /// float {FP32} - /// float {FP32} - /// float {FP32} - /// float {FP32} - /// float {FP32} - /// float {FP32} - /// float {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_setr_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) => System.Runtime.Intrinsics.Vector256.Create(e7, e6, e5, e4, e3, e2, e1, e0); - - /// - /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst". - /// - /// VSHUFPD ymm, ymm, ymm, imm8 - /// __m256d {FP64} - /// __m256d {FP64} - /// int {IMM} - /// __m256d dst {FP64} - public static __m256d _mm256_shuffle_pd(__m256d a, __m256d b, int imm8) => System.Runtime.Intrinsics.X86.Avx.Shuffle(a.FP64, b.FP64, (byte)imm8); - - /// - /// Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". 
- /// - /// VSHUFPS ymm, ymm, ymm, imm8 - /// __m256 {FP32} - /// __m256 {FP32} - /// int {IMM} - /// __m256 dst {FP32} - public static __m256 _mm256_shuffle_ps(__m256 a, __m256 b, int imm8) => System.Runtime.Intrinsics.X86.Avx.Shuffle(a.FP32, b.FP32, (byte)imm8); - - /// - /// Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - /// - /// VSQRTPD ymm, ymm - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_sqrt_pd(__m256d a) => System.Runtime.Intrinsics.X86.Avx.Sqrt(a.FP64); - - /// - /// Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - /// - /// VSQRTPS ymm, ymm - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_sqrt_ps(__m256 a) => System.Runtime.Intrinsics.X86.Avx.Sqrt(a.FP32); - - /// - /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory. "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - /// - /// VMOVAPD m256, ymm - /// double {FP64} - /// __m256d {FP64} - /// void {} - public static void _mm256_store_pd(double* mem_addr, __m256d a) => System.Runtime.Intrinsics.X86.Avx.StoreAligned(mem_addr, a.FP64); - - /// - /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory. "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - /// - /// VMOVAPS m256, ymm - /// float {FP32} - /// __m256 {FP32} - /// void {} - public static void _mm256_store_ps(float* mem_addr, __m256 a) => System.Runtime.Intrinsics.X86.Avx.StoreAligned(mem_addr, a.FP32); - - /// - /// Store 256-bits of integer data from "a" into memory. "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
- /// - /// VMOVDQA m256, ymm - /// __m256i {M256} - /// __m256i {M256} - /// void {} - public static void _mm256_store_si256(__m256i* mem_addr, __m256i a) => System.Runtime.Intrinsics.X86.Avx.StoreAligned((sbyte*)mem_addr, a.SI8); - - /// - /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - /// - /// VMOVUPD m256, ymm - /// double {FP64} - /// __m256d {FP64} - /// void {} - public static void _mm256_storeu_pd(double* mem_addr, __m256d a) => System.Runtime.Intrinsics.X86.Avx.Store(mem_addr, a.FP64); - - /// - /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - /// - /// VMOVUPS m256, ymm - /// float {FP32} - /// __m256 {FP32} - /// void {} - public static void _mm256_storeu_ps(float* mem_addr, __m256 a) => System.Runtime.Intrinsics.X86.Avx.Store(mem_addr, a.FP32); - - /// - /// Store 256-bits of integer data from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - /// - /// VMOVDQU m256, ymm - /// __m256i {M256} - /// __m256i {M256} - /// void {} - public static void _mm256_storeu_si256(__m256i* mem_addr, __m256i a) => System.Runtime.Intrinsics.X86.Avx.Store((sbyte*)mem_addr, a.SI8); - - /// - /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
- /// - /// VMOVNTPD m256, ymm - /// double {FP64} - /// __m256d {FP64} - /// void {} - public static void _mm256_stream_pd(double* mem_addr, __m256d a) => System.Runtime.Intrinsics.X86.Avx.StoreAlignedNonTemporal(mem_addr, a.FP64); - - /// - /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - /// - /// VMOVNTPS m256, ymm - /// float {FP32} - /// __m256 {FP32} - /// void {} - public static void _mm256_stream_ps(float* mem_addr, __m256 a) => System.Runtime.Intrinsics.X86.Avx.StoreAlignedNonTemporal(mem_addr, a.FP32); - - /// - /// Store 256-bits of integer data from "a" into memory using a non-temporal memory hint. "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - /// - /// VMOVNTDQ m256, ymm - /// __m256i {M256} - /// __m256i {M256} - /// void {} - public static void _mm256_stream_si256(__m256i* mem_addr, __m256i a) => System.Runtime.Intrinsics.X86.Avx.StoreAlignedNonTemporal((sbyte*)mem_addr, a.SI8); - - /// - /// Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - /// - /// VSUBPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_sub_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.Subtract(a.FP64, b.FP64); - - /// - /// Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". 
- /// - /// VSUBPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_sub_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.Subtract(a.FP32, b.FP32); - - /// - /// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. - /// - /// VTESTPD ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// bool dst {UI8} - public static bool _mm256_testc_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.TestC(a.FP64, b.FP64); - - /// - /// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. - /// - /// VTESTPS ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// bool dst {UI8} - public static bool _mm256_testc_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.TestC(a.FP32, b.FP32); - - /// - /// Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value. 
- /// - /// VPTEST ymm, ymm - /// __m256i {M256} - /// __m256i {M256} - /// bool k {UI8} - public static bool _mm256_testc_si256(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx.TestC(a.UI8, b.UI8); - - /// - /// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. - /// - /// VTESTPD ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// bool dst {UI8} - public static bool _mm256_testnzc_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.TestNotZAndNotC(a.FP64, b.FP64); - - /// - /// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. - /// - /// VTESTPS ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// bool dst {UI8} - public static bool _mm256_testnzc_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.TestNotZAndNotC(a.FP32, b.FP32); - - /// - /// Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. 
Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. - /// - /// VPTEST ymm, ymm - /// __m256i {M256} - /// __m256i {M256} - /// bool dst {UI8} - public static bool _mm256_testnzc_si256(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx.TestNotZAndNotC(a.UI8, b.UI8); - - /// - /// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. - /// - /// VTESTPD ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// bool dst {UI8} - public static bool _mm256_testz_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.TestZ(a.FP64, b.FP64); - - /// - /// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. 
- /// - /// VTESTPS ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// bool dst {UI8} - public static bool _mm256_testz_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.TestZ(a.FP32, b.FP32); - - /// - /// Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value. - /// - /// VPTEST ymm, ymm - /// __m256i {M256} - /// __m256i {M256} - /// bool k {UI8} - public static bool _mm256_testz_si256(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx.TestZ(a.UI8, b.UI8); - - /// - /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - /// - /// VUNPCKHPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_unpackhi_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.UnpackHigh(a.FP64, b.FP64); - - /// - /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - /// - /// VUNPCKHPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_unpackhi_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.UnpackHigh(a.FP32, b.FP32); - - /// - /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". 
- /// - /// VUNPCKLPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_unpacklo_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.UnpackLow(a.FP64, b.FP64); - - /// - /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - /// - /// VUNPCKLPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_unpacklo_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.UnpackLow(a.FP32, b.FP32); - - /// - /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// VXORPD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_xor_pd(__m256d a, __m256d b) => System.Runtime.Intrinsics.X86.Avx.Xor(a.FP64, b.FP64); - - /// - /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// VXORPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_xor_ps(__m256 a, __m256 b) => System.Runtime.Intrinsics.X86.Avx.Xor(a.FP32, b.FP32); - - } -} diff --git a/src/External/RawIntrinsics/AVX2.cs b/src/External/RawIntrinsics/AVX2.cs deleted file mode 100644 index 931839c..0000000 --- a/src/External/RawIntrinsics/AVX2.cs +++ /dev/null @@ -1,1726 +0,0 @@ -namespace RawIntrinsics -{ - public static unsafe partial class AVX2 - { - /// - /// Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". 
- /// - /// VPBLENDD xmm, xmm, xmm, imm8 - /// __m128i {UI32} - /// __m128i {UI32} - /// int {IMM} - /// __m128i dst {UI32} - public static __m128i _mm_blend_epi32(__m128i a, __m128i b, int imm8) => System.Runtime.Intrinsics.X86.Avx2.Blend(a.UI32, b.UI32, (byte)imm8); - - /// - /// Broadcast the low packed 8-bit integer from "a" to all elements of "dst". - /// - /// VPBROADCASTB xmm, xmm - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_broadcastb_epi8(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastScalarToVector128(a.UI8); - - /// - /// Broadcast the low packed 32-bit integer from "a" to all elements of "dst". - /// - /// VPBROADCASTD xmm, xmm - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_broadcastd_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastScalarToVector128(a.UI32); - - /// - /// Broadcast the low packed 64-bit integer from "a" to all elements of "dst". - /// - /// VPBROADCASTQ xmm, xmm - /// __m128i {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_broadcastq_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastScalarToVector128(a.UI64); - - /// - /// Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". - /// - /// MOVDDUP xmm, xmm - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_broadcastsd_pd(__m128d a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastScalarToVector128(a.FP64); - - /// - /// Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". - /// - /// VBROADCASTSS xmm, xmm - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_broadcastss_ps(__m128 a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastScalarToVector128(a.FP32); - - /// - /// Broadcast the low packed 16-bit integer from "a" to all elements of "dst". 
- /// - /// VPBROADCASTW xmm, xmm - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_broadcastw_epi16(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastScalarToVector128(a.UI16); - - /// - /// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - /// - /// VPGATHERDD xmm, vm32x, xmm - /// int {UI32} - /// __m128i {SI32} - /// int {IMM} - /// __m128i dst {UI32} - public static __m128i _mm_i32gather_epi32(int* base_addr, __m128i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector128((uint*)base_addr, vindex.SI32, (byte)scale); - - /// - /// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - /// - /// VPGATHERDQ xmm, vm32x, xmm - /// long {UI64} - /// __m128i {SI32} - /// int {IMM} - /// __m128i dst {UI64} - public static __m128i _mm_i32gather_epi64(long* base_addr, __m128i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector128((ulong*)base_addr, vindex.SI32, (byte)scale); - - /// - /// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. 
- /// - /// VGATHERDPD xmm, vm32x, xmm - /// double {FP64} - /// __m128i {SI32} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm_i32gather_pd(double* base_addr, __m128i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector128(base_addr, vindex.SI32, (byte)scale); - - /// - /// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - /// - /// VGATHERDPS xmm, vm32x, xmm - /// float {FP32} - /// __m128i {SI32} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_i32gather_ps(float* base_addr, __m128i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector128(base_addr, vindex.SI32, (byte)scale); - - /// - /// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - /// - /// VPGATHERQD xmm, vm64x, xmm - /// int {UI32} - /// __m128i {SI64} - /// int {IMM} - /// __m128i dst {UI32} - public static __m128i _mm_i64gather_epi32(int* base_addr, __m128i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector128((uint*)base_addr, vindex.SI64, (byte)scale); - - /// - /// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. 
- /// - /// VPGATHERQQ xmm, vm64x, xmm - /// long {UI64} - /// __m128i {SI64} - /// int {IMM} - /// __m128i dst {UI64} - public static __m128i _mm_i64gather_epi64(long* base_addr, __m128i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector128((ulong*)base_addr, vindex.SI64, (byte)scale); - - /// - /// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - /// - /// VGATHERQPD xmm, vm64x, xmm - /// double {FP64} - /// __m128i {SI64} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm_i64gather_pd(double* base_addr, __m128i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector128(base_addr, vindex.SI64, (byte)scale); - - /// - /// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - /// - /// VGATHERQPS xmm, vm64x, xmm - /// float {FP32} - /// __m128i {SI64} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_i64gather_ps(float* base_addr, __m128i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector128(base_addr, vindex.SI64, (byte)scale); - - /// - /// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. 
- /// - /// VPGATHERDD xmm, vm32x, xmm - /// __m128i {UI32} - /// int {UI32} - /// __m128i {SI32} - /// __m128i {MASK} - /// int {IMM} - /// __m128i dst {UI32} - public static __m128i _mm_mask_i32gather_epi32(__m128i src, int* base_addr, __m128i vindex, __m128i mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector128(src.UI32, (uint*)base_addr, vindex.SI32, mask.UI32, (byte)scale); - - /// - /// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - /// - /// VPGATHERDQ xmm, vm32x, xmm - /// __m128i {UI64} - /// long {UI64} - /// __m128i {SI32} - /// __m128i {MASK} - /// int {IMM} - /// __m128i dst {UI64} - public static __m128i _mm_mask_i32gather_epi64(__m128i src, long* base_addr, __m128i vindex, __m128i mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector128(src.UI64, (ulong*)base_addr, vindex.SI32, mask.UI64, (byte)scale); - - /// - /// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. 
- /// - /// VGATHERDPD xmm, vm32x, xmm - /// __m128d {FP64} - /// double {FP64} - /// __m128i {SI32} - /// __m128d {MASK} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm_mask_i32gather_pd(__m128d src, double* base_addr, __m128i vindex, __m128d mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector128(src.FP64, base_addr, vindex.SI32, mask.FP64, (byte)scale); - - /// - /// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - /// - /// VGATHERDPS xmm, vm32x, xmm - /// __m128 {FP32} - /// float {FP32} - /// __m128i {SI32} - /// __m128 {MASK} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_mask_i32gather_ps(__m128 src, float* base_addr, __m128i vindex, __m128 mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector128(src.FP32, base_addr, vindex.SI32, mask.FP32, (byte)scale); - - /// - /// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. 
- /// - /// VPGATHERQD xmm, vm64x, xmm - /// __m128i {UI32} - /// int {UI32} - /// __m128i {SI64} - /// __m128i {MASK} - /// int {IMM} - /// __m128i dst {UI32} - public static __m128i _mm_mask_i64gather_epi32(__m128i src, int* base_addr, __m128i vindex, __m128i mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector128(src.UI32, (uint*)base_addr, vindex.SI64, mask.UI32, (byte)scale); - - /// - /// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - /// - /// VPGATHERQQ xmm, vm64x, xmm - /// __m128i {UI64} - /// long {UI64} - /// __m128i {SI64} - /// __m128i {MASK} - /// int {IMM} - /// __m128i dst {UI64} - public static __m128i _mm_mask_i64gather_epi64(__m128i src, long* base_addr, __m128i vindex, __m128i mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector128(src.UI64, (ulong*)base_addr, vindex.SI64, mask.UI64, (byte)scale); - - /// - /// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. 
- /// - /// VGATHERQPD xmm, vm64x, xmm - /// __m128d {FP64} - /// double {FP64} - /// __m128i {SI64} - /// __m128d {MASK} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm_mask_i64gather_pd(__m128d src, double* base_addr, __m128i vindex, __m128d mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector128(src.FP64, base_addr, vindex.SI64, mask.FP64, (byte)scale); - - /// - /// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - /// - /// VGATHERQPS xmm, vm64x, xmm - /// __m128 {FP32} - /// float {FP32} - /// __m128i {SI64} - /// __m128 {MASK} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_mask_i64gather_ps(__m128 src, float* base_addr, __m128i vindex, __m128 mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector128(src.FP32, base_addr, vindex.SI64, mask.FP32, (byte)scale); - - /// - /// Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). - /// - /// VPMASKMOVD xmm, xmm, m128 - /// int {UI32} - /// __m128i {MASK} - /// __m128i dst {UI32} - public static __m128i _mm_maskload_epi32(int* mem_addr, __m128i mask) => System.Runtime.Intrinsics.X86.Avx2.MaskLoad((uint*)mem_addr, mask.UI32); - - /// - /// Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). 
- /// - /// VPMASKMOVQ xmm, xmm, m128 - /// long {UI64} - /// __m128i {MASK} - /// __m128i dst {UI64} - public static __m128i _mm_maskload_epi64(long* mem_addr, __m128i mask) => System.Runtime.Intrinsics.X86.Avx2.MaskLoad((ulong*)mem_addr, mask.UI64); - - /// - /// Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). - /// - /// VPMASKMOVD m128, xmm, xmm - /// int {UI32} - /// __m128i {MASK} - /// __m128i {UI32} - /// void {} - public static void _mm_maskstore_epi32(int* mem_addr, __m128i mask, __m128i a) => System.Runtime.Intrinsics.X86.Avx2.MaskStore((uint*)mem_addr, mask.UI32, a.UI32); - - /// - /// Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). - /// - /// VPMASKMOVQ m128, xmm, xmm - /// long {UI64} - /// __m128i {MASK} - /// __m128i {UI64} - /// void {} - public static void _mm_maskstore_epi64(long* mem_addr, __m128i mask, __m128i a) => System.Runtime.Intrinsics.X86.Avx2.MaskStore((ulong*)mem_addr, mask.UI64, a.UI64); - - /// - /// Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - /// - /// VPSLLVD xmm, xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_sllv_epi32(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogicalVariable(a.UI32, count.UI32); - - /// - /// Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". 
- /// - /// VPSLLVQ xmm, xmm, xmm - /// __m128i {UI64} - /// __m128i {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_sllv_epi64(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogicalVariable(a.UI64, count.UI64); - - /// - /// Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". - /// - /// VPSRAVD xmm, xmm, xmm - /// __m128i {SI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_srav_epi32(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightArithmeticVariable(a.SI32, count.UI32); - - /// - /// Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - /// - /// VPSRLVD xmm, xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_srlv_epi32(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogicalVariable(a.UI32, count.UI32); - - /// - /// Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - /// - /// VPSRLVQ xmm, xmm, xmm - /// __m128i {UI64} - /// __m128i {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_srlv_epi64(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogicalVariable(a.UI64, count.UI64); - - /// - /// Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". - /// - /// VPABSW ymm, ymm - /// __m256i {SI16} - /// __m256i dst {UI16} - public static __m256i _mm256_abs_epi16(__m256i a) => System.Runtime.Intrinsics.X86.Avx2.Abs(a.SI16); - - /// - /// Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". 
- /// - /// VPABSD ymm, ymm - /// __m256i {SI32} - /// __m256i dst {UI32} - public static __m256i _mm256_abs_epi32(__m256i a) => System.Runtime.Intrinsics.X86.Avx2.Abs(a.SI32); - - /// - /// Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". - /// - /// VPABSB ymm, ymm - /// __m256i {SI8} - /// __m256i dst {UI8} - public static __m256i _mm256_abs_epi8(__m256i a) => System.Runtime.Intrinsics.X86.Avx2.Abs(a.SI8); - - /// - /// Add packed 16-bit integers in "a" and "b", and store the results in "dst". - /// - /// VPADDW ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_add_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Add(a.UI16, b.UI16); - - /// - /// Add packed 32-bit integers in "a" and "b", and store the results in "dst". - /// - /// VPADDD ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_add_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Add(a.UI32, b.UI32); - - /// - /// Add packed 64-bit integers in "a" and "b", and store the results in "dst". - /// - /// VPADDQ ymm, ymm, ymm - /// __m256i {UI64} - /// __m256i {UI64} - /// __m256i dst {UI64} - public static __m256i _mm256_add_epi64(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Add(a.UI64, b.UI64); - - /// - /// Add packed 8-bit integers in "a" and "b", and store the results in "dst". - /// - /// VPADDB ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_add_epi8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Add(a.UI8, b.UI8); - - /// - /// Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". 
- /// - /// VPADDSW ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {UI16} - public static __m256i _mm256_adds_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.AddSaturate(a.SI16, b.SI16); - - /// - /// Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - /// - /// VPADDSB ymm, ymm, ymm - /// __m256i {SI8} - /// __m256i {SI8} - /// __m256i dst {UI8} - public static __m256i _mm256_adds_epi8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.AddSaturate(a.SI8, b.SI8); - - /// - /// Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - /// - /// VPADDUSW ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_adds_epu16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.AddSaturate(a.UI16, b.UI16); - - /// - /// Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - /// - /// VPADDUSB ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_adds_epu8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.AddSaturate(a.UI8, b.UI8); - - /// - /// Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". - /// - /// VPALIGNR ymm, ymm, ymm, imm8 - /// __m256i {UI8} - /// __m256i {UI8} - /// int {IMM} - /// __m256i dst {UI8} - public static __m256i _mm256_alignr_epi8(__m256i a, __m256i b, int imm8) => System.Runtime.Intrinsics.X86.Avx2.AlignRight(a.UI8, b.UI8, (byte)imm8); - - /// - /// Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". 
- /// - /// VPAND ymm, ymm, ymm - /// __m256i {M256} - /// __m256i {M256} - /// __m256i dst {M256} - public static __m256i _mm256_and_si256(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.And(a.SI8, b.SI8); - - /// - /// Compute the bitwise NOT of 256 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". - /// - /// VPANDN ymm, ymm, ymm - /// __m256i {M256} - /// __m256i {M256} - /// __m256i dst {M256} - public static __m256i _mm256_andnot_si256(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.AndNot(a.SI8, b.SI8); - - /// - /// Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". - /// - /// VPAVGW ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_avg_epu16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Average(a.UI16, b.UI16); - - /// - /// Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". - /// - /// VPAVGB ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_avg_epu8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Average(a.UI8, b.UI8); - - /// - /// Blend packed 16-bit integers from "a" and "b" within 128-bit lanes using control mask "imm8", and store the results in "dst". - /// - /// VPBLENDW ymm, ymm, ymm, imm8 - /// __m256i {UI16} - /// __m256i {UI16} - /// int {IMM} - /// __m256i dst {UI16} - public static __m256i _mm256_blend_epi16(__m256i a, __m256i b, int imm8) => System.Runtime.Intrinsics.X86.Avx2.Blend(a.UI16, b.UI16, (byte)imm8); - - /// - /// Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". 
- /// - /// VPBLENDD ymm, ymm, ymm, imm8 - /// __m256i {UI32} - /// __m256i {UI32} - /// int {IMM} - /// __m256i dst {UI32} - public static __m256i _mm256_blend_epi32(__m256i a, __m256i b, int imm8) => System.Runtime.Intrinsics.X86.Avx2.Blend(a.UI32, b.UI32, (byte)imm8); - - /// - /// Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst". - /// - /// VPBLENDVB ymm, ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i {MASK} - /// __m256i dst {UI8} - public static __m256i _mm256_blendv_epi8(__m256i a, __m256i b, __m256i mask) => System.Runtime.Intrinsics.X86.Avx2.BlendVariable(a.UI8, b.UI8, mask.UI8); - - /// - /// Broadcast the low packed 8-bit integer from "a" to all elements of "dst". - /// - /// VPBROADCASTB ymm, xmm - /// __m128i {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_broadcastb_epi8(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastScalarToVector256(a.UI8); - - /// - /// Broadcast the low packed 32-bit integer from "a" to all elements of "dst". - /// - /// VPBROADCASTD ymm, xmm - /// __m128i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_broadcastd_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastScalarToVector256(a.UI32); - - /// - /// Broadcast the low packed 64-bit integer from "a" to all elements of "dst". - /// - /// VPBROADCASTQ ymm, xmm - /// __m128i {UI64} - /// __m256i dst {UI64} - public static __m256i _mm256_broadcastq_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastScalarToVector256(a.UI64); - - /// - /// Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". - /// - /// VBROADCASTSD ymm, xmm - /// __m128d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_broadcastsd_pd(__m128d a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastScalarToVector256(a.FP64); - - /// - /// Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst". 
- /// - /// VBROADCASTI128 ymm, m128 - /// __m128i {M128} - /// __m256i dst {M128} - public static __m256i _mm256_broadcastsi128_si256(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastVector128ToVector256((sbyte*)&a); - - /// - /// Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". - /// - /// VBROADCASTSS ymm, xmm - /// __m128 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_broadcastss_ps(__m128 a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastScalarToVector256(a.FP32); - - /// - /// Broadcast the low packed 16-bit integer from "a" to all elements of "dst". - /// - /// VPBROADCASTW ymm, xmm - /// __m128i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_broadcastw_epi16(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.BroadcastScalarToVector256(a.UI16); - - /// - /// Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". - /// - /// VPSLLDQ ymm, ymm, imm8 - /// __m256i {M128} - /// int {IMM} - /// __m256i dst {M128} - public static __m256i _mm256_bslli_epi128(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical128BitLane(a.SI8, (byte)imm8); - - /// - /// Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". - /// - /// VPSRLDQ ymm, ymm, imm8 - /// __m256i {M128} - /// int {IMM} - /// __m256i dst {M128} - public static __m256i _mm256_bsrli_epi128(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical128BitLane(a.SI8, (byte)imm8); - - /// - /// Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". 
- /// - /// VPCMPEQW ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_cmpeq_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.CompareEqual(a.UI16, b.UI16); - - /// - /// Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". - /// - /// VPCMPEQD ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_cmpeq_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.CompareEqual(a.UI32, b.UI32); - - /// - /// Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". - /// - /// VPCMPEQQ ymm, ymm, ymm - /// __m256i {UI64} - /// __m256i {UI64} - /// __m256i dst {UI64} - public static __m256i _mm256_cmpeq_epi64(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.CompareEqual(a.UI64, b.UI64); - - /// - /// Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". - /// - /// VPCMPEQB ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_cmpeq_epi8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.CompareEqual(a.UI8, b.UI8); - - /// - /// Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". - /// - /// VPCMPGTW ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {UI16} - public static __m256i _mm256_cmpgt_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.CompareGreaterThan(a.SI16, b.SI16); - - /// - /// Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". 
- /// - /// VPCMPGTD ymm, ymm, ymm - /// __m256i {SI32} - /// __m256i {SI32} - /// __m256i dst {UI32} - public static __m256i _mm256_cmpgt_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.CompareGreaterThan(a.SI32, b.SI32); - - /// - /// Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst". - /// - /// VPCMPGTQ ymm, ymm, ymm - /// __m256i {SI64} - /// __m256i {SI64} - /// __m256i dst {UI64} - public static __m256i _mm256_cmpgt_epi64(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.CompareGreaterThan(a.SI64, b.SI64); - - /// - /// Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". - /// - /// VPCMPGTB ymm, ymm, ymm - /// __m256i {SI8} - /// __m256i {SI8} - /// __m256i dst {UI8} - public static __m256i _mm256_cmpgt_epi8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.CompareGreaterThan(a.SI8, b.SI8); - - /// - /// Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - /// - /// VPMOVSXWD ymm, xmm - /// __m128i {SI16} - /// __m256i dst {SI32} - public static __m256i _mm256_cvtepi16_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.ConvertToVector256Int32(a.SI16); - - /// - /// Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - /// - /// VPMOVSXWQ ymm, xmm - /// __m128i {SI16} - /// __m256i dst {SI64} - public static __m256i _mm256_cvtepi16_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.ConvertToVector256Int64(a.SI16); - - /// - /// Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". 
- /// - /// VPMOVSXDQ ymm, xmm - /// __m128i {SI32} - /// __m256i dst {SI64} - public static __m256i _mm256_cvtepi32_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.ConvertToVector256Int64(a.SI32); - - /// - /// Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". - /// - /// VPMOVSXBW ymm, xmm - /// __m128i {SI8} - /// __m256i dst {SI16} - public static __m256i _mm256_cvtepi8_epi16(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.ConvertToVector256Int16(a.SI8); - - /// - /// Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - /// - /// VPMOVSXBD ymm, xmm - /// __m128i {SI8} - /// __m256i dst {SI32} - public static __m256i _mm256_cvtepi8_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.ConvertToVector256Int32(a.SI8); - - /// - /// Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". - /// - /// VPMOVSXBQ ymm, xmm - /// __m128i {SI8} - /// __m256i dst {SI64} - public static __m256i _mm256_cvtepi8_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.ConvertToVector256Int64(a.SI8); - - /// - /// Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - /// - /// VPMOVZXWD ymm, xmm - /// __m128i {UI16} - /// __m256i dst {UI32} - public static __m256i _mm256_cvtepu16_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.ConvertToVector256Int32(a.UI16); - - /// - /// Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - /// - /// VPMOVZXWQ ymm, xmm - /// __m128i {UI16} - /// __m256i dst {UI64} - public static __m256i _mm256_cvtepu16_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.ConvertToVector256Int64(a.UI16); - - /// - /// Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". 
- /// - /// VPMOVZXDQ ymm, xmm - /// __m128i {UI32} - /// __m256i dst {UI64} - public static __m256i _mm256_cvtepu32_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.ConvertToVector256Int64(a.UI32); - - /// - /// Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". - /// - /// VPMOVZXBW ymm, xmm - /// __m128i {UI8} - /// __m256i dst {UI16} - public static __m256i _mm256_cvtepu8_epi16(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.ConvertToVector256Int16(a.UI8); - - /// - /// Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - /// - /// VPMOVZXBD ymm, xmm - /// __m128i {UI8} - /// __m256i dst {UI32} - public static __m256i _mm256_cvtepu8_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.ConvertToVector256Int32(a.UI8); - - /// - /// Zero extend packed unsigned 8-bit integers in the low 8 byte sof "a" to packed 64-bit integers, and store the results in "dst". - /// - /// VPMOVZXBQ ymm, xmm - /// __m128i {UI8} - /// __m256i dst {UI64} - public static __m256i _mm256_cvtepu8_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Avx2.ConvertToVector256Int64(a.UI8); - - /// - /// Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst". - /// - /// VEXTRACTI128 xmm, ymm, imm8 - /// __m256i {M128} - /// int {IMM} - /// __m128i dst {M128} - public static __m128i _mm256_extracti128_si256(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ExtractVector128(a.SI8, (byte)imm8); - - /// - /// Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". 
- /// - /// VPHADDW ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_hadd_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.HorizontalAdd(a.SI16, b.SI16); - - /// - /// Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". - /// - /// VPHADDD ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_hadd_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.HorizontalAdd(a.SI32, b.SI32); - - /// - /// Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". - /// - /// VPHADDSW ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {UI16} - public static __m256i _mm256_hadds_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.HorizontalAddSaturate(a.SI16, b.SI16); - - /// - /// Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". - /// - /// VPHSUBW ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_hsub_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.HorizontalSubtract(a.SI16, b.SI16); - - /// - /// Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". - /// - /// VPHSUBD ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_hsub_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.HorizontalSubtract(a.SI32, b.SI32); - - /// - /// Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". 
- /// - /// VPHSUBSW ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {UI16} - public static __m256i _mm256_hsubs_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.HorizontalSubtractSaturate(a.SI16, b.SI16); - - /// - /// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - /// - /// VPGATHERDD ymm, vm32x, ymm - /// int {UI32} - /// __m256i {SI32} - /// int {IMM} - /// __m256i dst {UI32} - public static __m256i _mm256_i32gather_epi32(int* base_addr, __m256i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector256((uint*)base_addr, vindex.SI32, (byte)scale); - - /// - /// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - /// - /// VPGATHERDQ ymm, vm32x, ymm - /// long {UI64} - /// __m128i {SI32} - /// int {IMM} - /// __m256i dst {UI64} - public static __m256i _mm256_i32gather_epi64(long* base_addr, __m128i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector256((ulong*)base_addr, vindex.SI32, (byte)scale); - - /// - /// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. 
- /// - /// VGATHERDPD ymm, vm32x, ymm - /// double {FP64} - /// __m128i {SI32} - /// int {IMM} - /// __m256d dst {FP64} - public static __m256d _mm256_i32gather_pd(double* base_addr, __m128i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector256(base_addr, vindex.SI32, (byte)scale); - - /// - /// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - /// - /// VGATHERDPS ymm, vm32x, ymm - /// float {FP32} - /// __m256i {SI32} - /// int {IMM} - /// __m256 dst {FP32} - public static __m256 _mm256_i32gather_ps(float* base_addr, __m256i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector256(base_addr, vindex.SI32, (byte)scale); - - /// - /// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - /// - /// VPGATHERQD xmm, vm64y, xmm - /// int {UI32} - /// __m256i {SI64} - /// int {IMM} - /// __m128i dst {UI32} - public static __m128i _mm256_i64gather_epi32(int* base_addr, __m256i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector128((uint*)base_addr, vindex.SI64, (byte)scale); - - /// - /// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. 
- /// - /// VPGATHERQQ ymm, vm64x, ymm - /// long {UI64} - /// __m256i {SI64} - /// int {IMM} - /// __m256i dst {UI64} - public static __m256i _mm256_i64gather_epi64(long* base_addr, __m256i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector256((ulong*)base_addr, vindex.SI64, (byte)scale); - - /// - /// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - /// - /// VGATHERQPD ymm, vm64x, ymm - /// double {FP64} - /// __m256i {SI64} - /// int {IMM} - /// __m256d dst {FP64} - public static __m256d _mm256_i64gather_pd(double* base_addr, __m256i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector256(base_addr, vindex.SI64, (byte)scale); - - /// - /// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - /// - /// VGATHERQPS xmm, vm64y, xmm - /// float {FP32} - /// __m256i {SI64} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm256_i64gather_ps(float* base_addr, __m256i vindex, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherVector128(base_addr, vindex.SI64, (byte)scale); - - /// - /// Copy "a" to "dst", then insert 128 bits (composed of integer data) from "b" into "dst" at the location specified by "imm8". 
- /// - /// VINSERTI128 ymm, ymm, xmm, imm8 - /// __m256i {M256} - /// __m128i {M128} - /// int {IMM} - /// __m256i dst {M128} - public static __m256i _mm256_inserti128_si256(__m256i a, __m128i b, int imm8) => System.Runtime.Intrinsics.X86.Avx2.InsertVector128(a.SI8, b.SI8, (byte)imm8); - - /// - /// Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". - /// - /// VPMADDWD ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {SI32} - public static __m256i _mm256_madd_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.MultiplyAddAdjacent(a.SI16, b.SI16); - - /// - /// Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". - /// - /// VPMADDUBSW ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {SI8} - /// __m256i dst {SI16} - public static __m256i _mm256_maddubs_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.MultiplyAddAdjacent(a.UI8, b.SI8); - - /// - /// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. 
- /// - /// VPGATHERDD ymm, vm32x, ymm - /// __m256i {UI32} - /// int {UI32} - /// __m256i {SI32} - /// __m256i {MASK} - /// int {IMM} - /// __m256i dst {UI32} - public static __m256i _mm256_mask_i32gather_epi32(__m256i src, int* base_addr, __m256i vindex, __m256i mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector256(src.UI32, (uint*)base_addr, vindex.SI32, mask.UI32, (byte)scale); - - /// - /// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - /// - /// VPGATHERDQ ymm, vm32x, ymm - /// __m256i {UI64} - /// long {UI64} - /// __m128i {SI32} - /// __m256i {MASK} - /// int {IMM} - /// __m256i dst {UI64} - public static __m256i _mm256_mask_i32gather_epi64(__m256i src, long* base_addr, __m128i vindex, __m256i mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector256(src.UI64, (ulong*)base_addr, vindex.SI32, mask.UI64, (byte)scale); - - /// - /// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. 
- /// - /// VGATHERDPD ymm, vm32x, ymm - /// __m256d {FP64} - /// double {FP64} - /// __m128i {SI32} - /// __m256d {MASK} - /// int {IMM} - /// __m256d dst {FP64} - public static __m256d _mm256_mask_i32gather_pd(__m256d src, double* base_addr, __m128i vindex, __m256d mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector256(src.FP64, base_addr, vindex.SI32, mask.FP64, (byte)scale); - - /// - /// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - /// - /// VGATHERDPS ymm, vm32x, ymm - /// __m256 {FP32} - /// float {FP32} - /// __m256i {SI32} - /// __m256 {MASK} - /// int {IMM} - /// __m256 dst {FP32} - public static __m256 _mm256_mask_i32gather_ps(__m256 src, float* base_addr, __m256i vindex, __m256 mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector256(src.FP32, base_addr, vindex.SI32, mask.FP32, (byte)scale); - - /// - /// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. 
- /// - /// VPGATHERQD xmm, vm64y, xmm - /// __m128i {UI32} - /// int {UI32} - /// __m256i {SI64} - /// __m128i {MASK} - /// int {IMM} - /// __m128i dst {UI32} - public static __m128i _mm256_mask_i64gather_epi32(__m128i src, int* base_addr, __m256i vindex, __m128i mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector128(src.UI32, (uint*)base_addr, vindex.SI64, mask.UI32, (byte)scale); - - /// - /// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - /// - /// VPGATHERQQ ymm, vm64x, ymm - /// __m256i {UI64} - /// long {UI64} - /// __m256i {SI64} - /// __m256i {MASK} - /// int {IMM} - /// __m256i dst {UI64} - public static __m256i _mm256_mask_i64gather_epi64(__m256i src, long* base_addr, __m256i vindex, __m256i mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector256(src.UI64, (ulong*)base_addr, vindex.SI64, mask.UI64, (byte)scale); - - /// - /// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. 
- /// - /// VGATHERQPD ymm, vm64x, ymm - /// __m256d {FP64} - /// double {FP64} - /// __m256i {SI64} - /// __m256d {MASK} - /// int {IMM} - /// __m256d dst {FP64} - public static __m256d _mm256_mask_i64gather_pd(__m256d src, double* base_addr, __m256i vindex, __m256d mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector256(src.FP64, base_addr, vindex.SI64, mask.FP64, (byte)scale); - - /// - /// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - /// - /// VGATHERQPS xmm, vm64y, xmm - /// __m128 {FP32} - /// float {FP32} - /// __m256i {SI64} - /// __m128 {MASK} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm256_mask_i64gather_ps(__m128 src, float* base_addr, __m256i vindex, __m128 mask, int scale) => System.Runtime.Intrinsics.X86.Avx2.GatherMaskVector128(src.FP32, base_addr, vindex.SI64, mask.FP32, (byte)scale); - - /// - /// Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). - /// - /// VPMASKMOVD ymm, ymm, m256 - /// int {UI32} - /// __m256i {MASK} - /// __m256i dst {UI32} - public static __m256i _mm256_maskload_epi32(int* mem_addr, __m256i mask) => System.Runtime.Intrinsics.X86.Avx2.MaskLoad((uint*)mem_addr, mask.UI32); - - /// - /// Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). 
- /// - /// VPMASKMOVQ ymm, ymm, m256 - /// long {UI64} - /// __m256i {MASK} - /// __m256i dst {UI64} - public static __m256i _mm256_maskload_epi64(long* mem_addr, __m256i mask) => System.Runtime.Intrinsics.X86.Avx2.MaskLoad((ulong*)mem_addr, mask.UI64); - - /// - /// Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). - /// - /// VPMASKMOVD m256, ymm, ymm - /// int {UI32} - /// __m256i {MASK} - /// __m256i {UI32} - /// void {} - public static void _mm256_maskstore_epi32(int* mem_addr, __m256i mask, __m256i a) => System.Runtime.Intrinsics.X86.Avx2.MaskStore((uint*)mem_addr, mask.UI32, a.UI32); - - /// - /// Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). - /// - /// VPMASKMOVQ m256, ymm, ymm - /// long {UI64} - /// __m256i {MASK} - /// __m256i {UI64} - /// void {} - public static void _mm256_maskstore_epi64(long* mem_addr, __m256i mask, __m256i a) => System.Runtime.Intrinsics.X86.Avx2.MaskStore((ulong*)mem_addr, mask.UI64, a.UI64); - - /// - /// Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". - /// - /// VPMAXSW ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {UI16} - public static __m256i _mm256_max_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Max(a.SI16, b.SI16); - - /// - /// Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". - /// - /// VPMAXSD ymm, ymm, ymm - /// __m256i {SI32} - /// __m256i {SI32} - /// __m256i dst {UI32} - public static __m256i _mm256_max_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Max(a.SI32, b.SI32); - - /// - /// Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". 
- /// - /// VPMAXSB ymm, ymm, ymm - /// __m256i {SI8} - /// __m256i {SI8} - /// __m256i dst {UI8} - public static __m256i _mm256_max_epi8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Max(a.SI8, b.SI8); - - /// - /// Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". - /// - /// VPMAXUW ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_max_epu16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Max(a.UI16, b.UI16); - - /// - /// Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". - /// - /// VPMAXUD ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_max_epu32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Max(a.UI32, b.UI32); - - /// - /// Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". - /// - /// VPMAXUB ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_max_epu8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Max(a.UI8, b.UI8); - - /// - /// Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". - /// - /// VPMINSW ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {UI16} - public static __m256i _mm256_min_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Min(a.SI16, b.SI16); - - /// - /// Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". - /// - /// VPMINSD ymm, ymm, ymm - /// __m256i {SI32} - /// __m256i {SI32} - /// __m256i dst {UI32} - public static __m256i _mm256_min_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Min(a.SI32, b.SI32); - - /// - /// Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". 
- /// - /// VPMINSB ymm, ymm, ymm - /// __m256i {SI8} - /// __m256i {SI8} - /// __m256i dst {UI8} - public static __m256i _mm256_min_epi8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Min(a.SI8, b.SI8); - - /// - /// Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". - /// - /// VPMINUW ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_min_epu16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Min(a.UI16, b.UI16); - - /// - /// Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". - /// - /// VPMINUD ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_min_epu32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Min(a.UI32, b.UI32); - - /// - /// Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". - /// - /// VPMINUB ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_min_epu8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Min(a.UI8, b.UI8); - - /// - /// Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". - /// - /// VPMOVMSKB r32, ymm - /// __m256i {UI8} - /// int dst {UI32} - public static int _mm256_movemask_epi8(__m256i a) => System.Runtime.Intrinsics.X86.Avx2.MoveMask(a.UI8); - - /// - /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". Eight SADs are performed for each 128-bit lane using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at on the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8". 
- /// - /// VMPSADBW ymm, ymm, ymm, imm8 - /// __m256i {UI8} - /// __m256i {UI8} - /// int {IMM} - /// __m256i dst {UI8} - public static __m256i _mm256_mpsadbw_epu8(__m256i a, __m256i b, int imm8) => System.Runtime.Intrinsics.X86.Avx2.MultipleSumAbsoluteDifferences(a.UI8, b.UI8, (byte)imm8); - - /// - /// Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". - /// - /// VPMULDQ ymm, ymm, ymm - /// __m256i {SI32} - /// __m256i {SI32} - /// __m256i dst {SI64} - public static __m256i _mm256_mul_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Multiply(a.SI32, b.SI32); - - /// - /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". - /// - /// VPMULUDQ ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI64} - public static __m256i _mm256_mul_epu32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Multiply(a.UI32, b.UI32); - - /// - /// Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - /// - /// VPMULHW ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {UI16} - public static __m256i _mm256_mulhi_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.MultiplyHigh(a.SI16, b.SI16); - - /// - /// Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - /// - /// VPMULHUW ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_mulhi_epu16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.MultiplyHigh(a.UI16, b.UI16); - - /// - /// Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. 
Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". - /// - /// VPMULHRSW ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {UI16} - public static __m256i _mm256_mulhrs_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.MultiplyHighRoundScale(a.SI16, b.SI16); - - /// - /// Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". - /// - /// VPMULLW ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {UI16} - public static __m256i _mm256_mullo_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.MultiplyLow(a.SI16, b.SI16); - - /// - /// Multiply the packed signed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". - /// - /// VPMULLD ymm, ymm, ymm - /// __m256i {SI32} - /// __m256i {SI32} - /// __m256i dst {UI32} - public static __m256i _mm256_mullo_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.MultiplyLow(a.SI32, b.SI32); - - /// - /// Compute the bitwise OR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". - /// - /// VPOR ymm, ymm, ymm - /// __m256i {M256} - /// __m256i {M256} - /// __m256i dst {M256} - public static __m256i _mm256_or_si256(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Or(a.SI8, b.SI8); - - /// - /// Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". 
- /// - /// VPACKSSWB ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {SI8} - public static __m256i _mm256_packs_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.PackSignedSaturate(a.SI16, b.SI16); - - /// - /// Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". - /// - /// VPACKSSDW ymm, ymm, ymm - /// __m256i {SI32} - /// __m256i {SI32} - /// __m256i dst {SI16} - public static __m256i _mm256_packs_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.PackSignedSaturate(a.SI32, b.SI32); - - /// - /// Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". - /// - /// VPACKUSWB ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {UI8} - public static __m256i _mm256_packus_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.PackUnsignedSaturate(a.SI16, b.SI16); - - /// - /// Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". - /// - /// VPACKUSDW ymm, ymm, ymm - /// __m256i {SI32} - /// __m256i {SI32} - /// __m256i dst {UI16} - public static __m256i _mm256_packus_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.PackUnsignedSaturate(a.SI32, b.SI32); - - /// - /// Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". - /// - /// VPERM2I128 ymm, ymm, ymm, imm8 - /// __m256i {M256} - /// __m256i {M256} - /// int {IMM} - /// __m256i dst {M256} - public static __m256i _mm256_permute2x128_si256(__m256i a, __m256i b, int imm8) => System.Runtime.Intrinsics.X86.Avx2.Permute2x128(a.SI8, b.SI8, (byte)imm8); - - /// - /// Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst". 
- /// - /// VPERMQ ymm, ymm, imm8 - /// __m256i {UI64} - /// int {IMM} - /// __m256i dst {UI64} - public static __m256i _mm256_permute4x64_epi64(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.Permute4x64(a.UI64, (byte)imm8); - - /// - /// Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst". - /// - /// VPERMPD ymm, ymm, imm8 - /// __m256d {FP64} - /// int {IMM} - /// __m256d dst {FP64} - public static __m256d _mm256_permute4x64_pd(__m256d a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.Permute4x64(a.FP64, (byte)imm8); - - /// - /// Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - /// - /// VPERMD ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_permutevar8x32_epi32(__m256i a, __m256i idx) => System.Runtime.Intrinsics.X86.Avx2.PermuteVar8x32(a.UI32, idx.UI32); - - /// - /// Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx". - /// - /// VPERMPS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256i {UI32} - /// __m256 dst {FP32} - public static __m256 _mm256_permutevar8x32_ps(__m256 a, __m256i idx) => System.Runtime.Intrinsics.X86.Avx2.PermuteVar8x32(a.FP32, idx.SI32); - - /// - /// Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". 
- /// - /// VPSADBW ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i dst {UI16} - public static __m256i _mm256_sad_epu8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.SumAbsoluteDifferences(a.UI8, b.UI8); - - /// - /// Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". - /// - /// VPSHUFD ymm, ymm, imm8 - /// __m256i {UI32} - /// int {IMM} - /// __m256i dst {UI32} - public static __m256i _mm256_shuffle_epi32(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.Shuffle(a.UI32, (byte)imm8); - - /// - /// Shuffle 8-bit integers in "a" within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". - /// - /// VPSHUFB ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_shuffle_epi8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Shuffle(a.UI8, b.UI8); - - /// - /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst". - /// - /// VPSHUFHW ymm, ymm, imm8 - /// __m256i {UI16} - /// int {IMM} - /// __m256i dst {UI16} - public static __m256i _mm256_shufflehi_epi16(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ShuffleHigh(a.UI16, (byte)imm8); - - /// - /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst". 
- /// - /// VPSHUFLW ymm, ymm, imm8 - /// __m256i {UI16} - /// int {IMM} - /// __m256i dst {UI16} - public static __m256i _mm256_shufflelo_epi16(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ShuffleLow(a.UI16, (byte)imm8); - - /// - /// Negate packed signed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. - /// - /// VPSIGNW ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {UI16} - public static __m256i _mm256_sign_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Sign(a.SI16, b.SI16); - - /// - /// Negate packed signed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. - /// - /// VPSIGND ymm, ymm, ymm - /// __m256i {SI32} - /// __m256i {SI32} - /// __m256i dst {UI32} - public static __m256i _mm256_sign_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Sign(a.SI32, b.SI32); - - /// - /// Negate packed signed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. - /// - /// VPSIGNB ymm, ymm, ymm - /// __m256i {SI8} - /// __m256i {SI8} - /// __m256i dst {UI8} - public static __m256i _mm256_sign_epi8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Sign(a.SI8, b.SI8); - - /// - /// Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". 
- /// - /// VPSLLW ymm, ymm, xmm - /// __m256i {UI16} - /// __m128i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_sll_epi16(__m256i a, __m128i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(a.UI16, count.UI16); - - /// - /// Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - /// - /// VPSLLD ymm, ymm, xmm - /// __m256i {UI32} - /// __m128i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_sll_epi32(__m256i a, __m128i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(a.UI32, count.UI32); - - /// - /// Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - /// - /// VPSLLQ ymm, ymm, xmm - /// __m256i {UI64} - /// __m128i {UI64} - /// __m256i dst {UI64} - public static __m256i _mm256_sll_epi64(__m256i a, __m128i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(a.UI64, count.UI64); - - /// - /// Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - /// - /// VPSLLW ymm, ymm, imm8 - /// __m256i {UI16} - /// int {IMM} - /// __m256i dst {UI16} - public static __m256i _mm256_slli_epi16(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(a.UI16, (byte)imm8); - - /// - /// Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - /// - /// VPSLLD ymm, ymm, imm8 - /// __m256i {UI32} - /// int {IMM} - /// __m256i dst {UI32} - public static __m256i _mm256_slli_epi32(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(a.UI32, (byte)imm8); - - /// - /// Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". 
- /// - /// VPSLLQ ymm, ymm, imm8 - /// __m256i {UI64} - /// int {IMM} - /// __m256i dst {UI64} - public static __m256i _mm256_slli_epi64(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(a.UI64, (byte)imm8); - - /// - /// Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - /// - /// VPSLLVD ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_sllv_epi32(__m256i a, __m256i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogicalVariable(a.UI32, count.UI32); - - /// - /// Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - /// - /// VPSLLVQ ymm, ymm, ymm - /// __m256i {UI64} - /// __m256i {UI64} - /// __m256i dst {UI64} - public static __m256i _mm256_sllv_epi64(__m256i a, __m256i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogicalVariable(a.UI64, count.UI64); - - /// - /// Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - /// - /// VPSRAW ymm, ymm, imm8 - /// __m256i {SI16} - /// int {IMM} - /// __m256i dst {UI16} - public static __m256i _mm256_srai_epi16(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightArithmetic(a.SI16, (byte)imm8); - - /// - /// Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". 
- /// - /// VPSRAD ymm, ymm, imm8 - /// __m256i {SI32} - /// int {IMM} - /// __m256i dst {UI32} - public static __m256i _mm256_srai_epi32(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightArithmetic(a.SI32, (byte)imm8); - - /// - /// Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". - /// - /// VPSRAVD ymm, ymm, ymm - /// __m256i {SI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_srav_epi32(__m256i a, __m256i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightArithmeticVariable(a.SI32, count.UI32); - - /// - /// Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - /// - /// VPSRLW ymm, ymm, xmm - /// __m256i {UI16} - /// __m128i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_srl_epi16(__m256i a, __m128i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(a.UI16, count.UI16); - - /// - /// Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - /// - /// VPSRLD ymm, ymm, xmm - /// __m256i {UI32} - /// __m128i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_srl_epi32(__m256i a, __m128i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(a.UI32, count.UI32); - - /// - /// Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - /// - /// VPSRLQ ymm, ymm, xmm - /// __m256i {UI64} - /// __m128i {UI64} - /// __m256i dst {UI64} - public static __m256i _mm256_srl_epi64(__m256i a, __m128i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(a.UI64, count.UI64); - - /// - /// Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". 
- /// - /// VPSRLW ymm, ymm, imm8 - /// __m256i {UI16} - /// int {IMM} - /// __m256i dst {UI16} - public static __m256i _mm256_srli_epi16(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(a.UI16, (byte)imm8); - - /// - /// Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - /// - /// VPSRLD ymm, ymm, imm8 - /// __m256i {UI32} - /// int {IMM} - /// __m256i dst {UI32} - public static __m256i _mm256_srli_epi32(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(a.UI32, (byte)imm8); - - /// - /// Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - /// - /// VPSRLQ ymm, ymm, imm8 - /// __m256i {UI64} - /// int {IMM} - /// __m256i dst {UI64} - public static __m256i _mm256_srli_epi64(__m256i a, int imm8) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(a.UI64, (byte)imm8); - - /// - /// Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - /// - /// VPSRLVD ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_srlv_epi32(__m256i a, __m256i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogicalVariable(a.UI32, count.UI32); - - /// - /// Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - /// - /// VPSRLVQ ymm, ymm, ymm - /// __m256i {UI64} - /// __m256i {UI64} - /// __m256i dst {UI64} - public static __m256i _mm256_srlv_epi64(__m256i a, __m256i count) => System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogicalVariable(a.UI64, count.UI64); - - /// - /// Load 256-bits of integer data from memory into "dst" using a non-temporal memory hint. 
"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - /// - /// VMOVNTDQA ymm, m256 - /// __m256i {M256} - /// __m256i dst {M256} - public static __m256i _mm256_stream_load_si256(__m256i* mem_addr) => System.Runtime.Intrinsics.X86.Avx2.LoadAlignedVector256NonTemporal((sbyte*)mem_addr); - - /// - /// Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". - /// - /// VPSUBW ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_sub_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Subtract(a.UI16, b.UI16); - - /// - /// Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". - /// - /// VPSUBD ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_sub_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Subtract(a.UI32, b.UI32); - - /// - /// Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". - /// - /// VPSUBQ ymm, ymm, ymm - /// __m256i {UI64} - /// __m256i {UI64} - /// __m256i dst {UI64} - public static __m256i _mm256_sub_epi64(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Subtract(a.UI64, b.UI64); - - /// - /// Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". - /// - /// VPSUBB ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_sub_epi8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Subtract(a.UI8, b.UI8); - - /// - /// Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". 
- /// - /// VPSUBSW ymm, ymm, ymm - /// __m256i {SI16} - /// __m256i {SI16} - /// __m256i dst {UI16} - public static __m256i _mm256_subs_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.SubtractSaturate(a.SI16, b.SI16); - - /// - /// Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". - /// - /// VPSUBSB ymm, ymm, ymm - /// __m256i {SI8} - /// __m256i {SI8} - /// __m256i dst {UI8} - public static __m256i _mm256_subs_epi8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.SubtractSaturate(a.SI8, b.SI8); - - /// - /// Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". - /// - /// VPSUBUSW ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_subs_epu16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.SubtractSaturate(a.UI16, b.UI16); - - /// - /// Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". - /// - /// VPSUBUSB ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_subs_epu8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.SubtractSaturate(a.UI8, b.UI8); - - /// - /// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - /// - /// VPUNPCKHWD ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_unpackhi_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(a.UI16, b.UI16); - - /// - /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". 
- /// - /// VPUNPCKHDQ ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_unpackhi_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(a.UI32, b.UI32); - - /// - /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - /// - /// VPUNPCKHQDQ ymm, ymm, ymm - /// __m256i {UI64} - /// __m256i {UI64} - /// __m256i dst {UI64} - public static __m256i _mm256_unpackhi_epi64(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(a.UI64, b.UI64); - - /// - /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - /// - /// VPUNPCKHBW ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_unpackhi_epi8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(a.UI8, b.UI8); - - /// - /// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - /// - /// VPUNPCKLWD ymm, ymm, ymm - /// __m256i {UI16} - /// __m256i {UI16} - /// __m256i dst {UI16} - public static __m256i _mm256_unpacklo_epi16(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.UnpackLow(a.UI16, b.UI16); - - /// - /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - /// - /// VPUNPCKLDQ ymm, ymm, ymm - /// __m256i {UI32} - /// __m256i {UI32} - /// __m256i dst {UI32} - public static __m256i _mm256_unpacklo_epi32(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.UnpackLow(a.UI32, b.UI32); - - /// - /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". 
- /// - /// VPUNPCKLQDQ ymm, ymm, ymm - /// __m256i {UI64} - /// __m256i {UI64} - /// __m256i dst {UI64} - public static __m256i _mm256_unpacklo_epi64(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.UnpackLow(a.UI64, b.UI64); - - /// - /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - /// - /// VPUNPCKLBW ymm, ymm, ymm - /// __m256i {UI8} - /// __m256i {UI8} - /// __m256i dst {UI8} - public static __m256i _mm256_unpacklo_epi8(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.UnpackLow(a.UI8, b.UI8); - - /// - /// Compute the bitwise XOR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". - /// - /// VPXOR ymm, ymm, ymm - /// __m256i {M256} - /// __m256i {M256} - /// __m256i dst {M256} - public static __m256i _mm256_xor_si256(__m256i a, __m256i b) => System.Runtime.Intrinsics.X86.Avx2.Xor(a.SI8, b.SI8); - - } -} diff --git a/src/External/RawIntrinsics/FMA.cs b/src/External/RawIntrinsics/FMA.cs deleted file mode 100644 index acad284..0000000 --- a/src/External/RawIntrinsics/FMA.cs +++ /dev/null @@ -1,326 +0,0 @@ -namespace RawIntrinsics -{ - public static unsafe partial class FMA - { - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - /// - /// VFMADD132PD xmm, xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_fmadd_pd(__m128d a, __m128d b, __m128d c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAdd(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". 
- /// - /// VFMADD132PS xmm, xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAdd(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// VFMADD132SD xmm, xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_fmadd_sd(__m128d a, __m128d b, __m128d c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddScalar(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// VFMADD132SS xmm, xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_fmadd_ss(__m128 a, __m128 b, __m128 c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddScalar(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". 
- /// - /// VFMADDSUB132PD xmm, xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_fmaddsub_pd(__m128d a, __m128d b, __m128d c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddSubtract(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - /// - /// VFMADDSUB132PS xmm, xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_fmaddsub_ps(__m128 a, __m128 b, __m128 c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddSubtract(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - /// - /// VFMSUB132PD xmm, xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_fmsub_pd(__m128d a, __m128d b, __m128d c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtract(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - /// - /// VFMSUB132PS xmm, xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtract(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
- /// - /// VFMSUB132SD xmm, xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_fmsub_sd(__m128d a, __m128d b, __m128d c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractScalar(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// VFMSUB132SS xmm, xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_fmsub_ss(__m128 a, __m128 b, __m128 c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractScalar(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". - /// - /// VFMSUBADD132PD xmm, xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_fmsubadd_pd(__m128d a, __m128d b, __m128d c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractAdd(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". 
- /// - /// VFMSUBADD132PS xmm, xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_fmsubadd_ps(__m128 a, __m128 b, __m128 c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractAdd(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - /// - /// VFNMADD132PD xmm, xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_fnmadd_pd(__m128d a, __m128d b, __m128d c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddNegated(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - /// - /// VFNMADD132PS xmm, xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddNegated(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// VFNMADD132SD xmm, xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_fnmadd_sd(__m128d a, __m128d b, __m128d c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddNegatedScalar(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". 
Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// VFNMADD132SS xmm, xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_fnmadd_ss(__m128 a, __m128 b, __m128 c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddNegatedScalar(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - /// - /// VFNMSUB132PD xmm, xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_fnmsub_pd(__m128d a, __m128d b, __m128d c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractNegated(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - /// - /// VFNMSUB132PS xmm, xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractNegated(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
- /// - /// VFNMSUB132SD xmm, xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_fnmsub_sd(__m128d a, __m128d b, __m128d c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractNegatedScalar(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// VFNMSUB132SS xmm, xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_fnmsub_ss(__m128 a, __m128 b, __m128 c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractNegatedScalar(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - /// - /// VFMADD132PD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_fmadd_pd(__m256d a, __m256d b, __m256d c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAdd(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - /// - /// VFMADD132PS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_fmadd_ps(__m256 a, __m256 b, __m256 c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAdd(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". 
- /// - /// VFMADDSUB132PD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_fmaddsub_pd(__m256d a, __m256d b, __m256d c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddSubtract(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - /// - /// VFMADDSUB132PS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_fmaddsub_ps(__m256 a, __m256 b, __m256 c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddSubtract(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - /// - /// VFMSUB132PD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_fmsub_pd(__m256d a, __m256d b, __m256d c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtract(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - /// - /// VFMSUB132PS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_fmsub_ps(__m256 a, __m256 b, __m256 c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtract(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". 
- /// - /// VFMSUBADD132PD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_fmsubadd_pd(__m256d a, __m256d b, __m256d c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractAdd(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". - /// - /// VFMSUBADD132PS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_fmsubadd_ps(__m256 a, __m256 b, __m256 c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractAdd(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - /// - /// VFNMADD132PD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_fnmadd_pd(__m256d a, __m256d b, __m256d c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddNegated(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - /// - /// VFNMADD132PS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_fnmadd_ps(__m256 a, __m256 b, __m256 c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddNegated(a.FP32, b.FP32, c.FP32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". 
- /// - /// VFNMSUB132PD ymm, ymm, ymm - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d {FP64} - /// __m256d dst {FP64} - public static __m256d _mm256_fnmsub_pd(__m256d a, __m256d b, __m256d c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractNegated(a.FP64, b.FP64, c.FP64); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - /// - /// VFNMSUB132PS ymm, ymm, ymm - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 {FP32} - /// __m256 dst {FP32} - public static __m256 _mm256_fnmsub_ps(__m256 a, __m256 b, __m256 c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractNegated(a.FP32, b.FP32, c.FP32); - - } -} diff --git a/src/External/RawIntrinsics/MMX.ManuallyAdded.cs b/src/External/RawIntrinsics/MMX.ManuallyAdded.cs deleted file mode 100644 index c1a5938..0000000 --- a/src/External/RawIntrinsics/MMX.ManuallyAdded.cs +++ /dev/null @@ -1,11 +0,0 @@ -namespace RawIntrinsics -{ - public static partial class MMX - { - /// - /// Return vector of type __m64 with all elements set to zero. - /// - /// __m64 dst {FP32} - public static __m64 _mm_setzero_si64() => System.Runtime.Intrinsics.Vector64.Zero; - } -} \ No newline at end of file diff --git a/src/External/RawIntrinsics/MMX.cs b/src/External/RawIntrinsics/MMX.cs deleted file mode 100644 index b8071c7..0000000 --- a/src/External/RawIntrinsics/MMX.cs +++ /dev/null @@ -1,65 +0,0 @@ -namespace RawIntrinsics -{ - public static unsafe partial class MMX - { - /// - /// Broadcast 16-bit integer "a" to all all elements of "dst". - /// - /// - /// short {UI16} - /// __m64 dst {FP32} - public static __m64 _mm_set1_pi16(short a) => System.Runtime.Intrinsics.Vector64.Create((ushort)a); - - /// - /// Broadcast 32-bit integer "a" to all elements of "dst". 
- /// - /// - /// int {UI32} - /// __m64 dst {FP32} - public static __m64 _mm_set1_pi32(int a) => System.Runtime.Intrinsics.Vector64.Create((uint)a); - - /// - /// Broadcast 8-bit integer "a" to all elements of "dst". - /// - /// - /// byte {UI8} - /// __m64 dst {FP32} - public static __m64 _mm_set1_pi8(byte a) => System.Runtime.Intrinsics.Vector64.Create(a); - - /// - /// Set packed 16-bit integers in "dst" with the supplied values in reverse order. - /// - /// - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// __m64 dst {FP32} - public static __m64 _mm_setr_pi16(short e3, short e2, short e1, short e0) => System.Runtime.Intrinsics.Vector64.Create((ushort)e3, (ushort)e2, (ushort)e1, (ushort)e0); - - /// - /// Set packed 32-bit integers in "dst" with the supplied values in reverse order. - /// - /// - /// int {UI32} - /// int {UI32} - /// __m64 dst {FP32} - public static __m64 _mm_setr_pi32(int e1, int e0) => System.Runtime.Intrinsics.Vector64.Create((uint)e1, (uint)e0); - - /// - /// Set packed 8-bit integers in "dst" with the supplied values in reverse order. - /// - /// - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// __m64 dst {FP32} - public static __m64 _mm_setr_pi8(byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) => System.Runtime.Intrinsics.Vector64.Create(e7, e6, e5, e4, e3, e2, e1, e0); - - } -} diff --git a/src/External/RawIntrinsics/Other.cs b/src/External/RawIntrinsics/Other.cs deleted file mode 100644 index 2d53527..0000000 --- a/src/External/RawIntrinsics/Other.cs +++ /dev/null @@ -1,101 +0,0 @@ -namespace RawIntrinsics -{ - public static unsafe partial class Other - { - /// - /// Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". 
- /// - /// AESDEC xmm, xmm - /// __m128i {M128} - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) => System.Runtime.Intrinsics.X86.Aes.Decrypt(a.UI8, RoundKey.UI8); - - /// - /// Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". - /// - /// AESDECLAST xmm, xmm - /// __m128i {M128} - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) => System.Runtime.Intrinsics.X86.Aes.DecryptLast(a.UI8, RoundKey.UI8); - - /// - /// Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst"." - /// - /// AESENC xmm, xmm - /// __m128i {M128} - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) => System.Runtime.Intrinsics.X86.Aes.Encrypt(a.UI8, RoundKey.UI8); - - /// - /// Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst"." - /// - /// AESENCLAST xmm, xmm - /// __m128i {M128} - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) => System.Runtime.Intrinsics.X86.Aes.EncryptLast(a.UI8, RoundKey.UI8); - - /// - /// Perform the InvMixColumns transformation on "a" and store the result in "dst". - /// - /// AESIMC xmm, xmm - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_aesimc_si128(__m128i a) => System.Runtime.Intrinsics.X86.Aes.InverseMixColumns(a.UI8); - - /// - /// Assist in expanding the AES cipher key by computing steps towards generating a round key for encryption cipher using data from "a" and an 8-bit round constant specified in "imm8", and store the result in "dst"." 
- /// - /// AESKEYGENASSIST xmm, xmm, imm8 - /// __m128i {M128} - /// int {IMM} - /// __m128i dst {M128} - public static __m128i _mm_aeskeygenassist_si128(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Aes.KeygenAssist(a.UI8, (byte)imm8); - - /// - /// Perform a carry-less multiplication of two 64-bit integers, selected from "a" and "b" according to "imm8", and store the results in "dst". - /// - /// PCLMULQDQ xmm, xmm, imm8 - /// __m128i {M128} - /// __m128i {M128} - /// int {IMM} - /// __m128i dst {M128} - public static __m128i _mm_clmulepi64_si128(__m128i a, __m128i b, int imm8) => System.Runtime.Intrinsics.X86.Pclmulqdq.CarrylessMultiply(a.SI64, b.SI64, (byte)imm8); - - /// - /// Count the number of bits set to 1 in unsigned 32-bit integer "a", and return that count in "dst". - /// - /// POPCNT r32, r32 - /// int {UI32} - /// int dst {UI32} - public static int _mm_popcnt_u32(int a) => (int)System.Runtime.Intrinsics.X86.Popcnt.PopCount((uint)a); - - /// - /// Count the number of bits set to 1 in unsigned 64-bit integer "a", and return that count in "dst". - /// - /// POPCNT r64, r64 - /// long {UI64} - /// long dst {UI64} - public static long _mm_popcnt_u64(long a) => (long)System.Runtime.Intrinsics.X86.Popcnt.X64.PopCount((ulong)a); - - /// - /// Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst". - /// - /// TZCNT r32, r32 - /// int {UI32} - /// int dst {UI32} - public static int _mm_tzcnt_32(int a) => (int)System.Runtime.Intrinsics.X86.Bmi1.TrailingZeroCount((uint)a); - - /// - /// Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst". 
- /// - /// TZCNT r64, r64 - /// long {UI64} - /// long dst {UI64} - public static long _mm_tzcnt_64(long a) => (long)System.Runtime.Intrinsics.X86.Bmi1.X64.TrailingZeroCount((ulong)a); - - } -} diff --git a/src/External/RawIntrinsics/RawIntrinsics.csproj b/src/External/RawIntrinsics/RawIntrinsics.csproj deleted file mode 100644 index 2e66377..0000000 --- a/src/External/RawIntrinsics/RawIntrinsics.csproj +++ /dev/null @@ -1,8 +0,0 @@ - - - - net7.0 - true - - - diff --git a/src/External/RawIntrinsics/SSE.ManuallyAdded.cs b/src/External/RawIntrinsics/SSE.ManuallyAdded.cs deleted file mode 100644 index 09d9083..0000000 --- a/src/External/RawIntrinsics/SSE.ManuallyAdded.cs +++ /dev/null @@ -1,11 +0,0 @@ -namespace RawIntrinsics -{ - public static partial class SSE - { - /// - /// Return vector of type __m128 with all elements set to zero. - /// - /// __m128 dst {FP32} - public static __m128 _mm_setzero_ps() => System.Runtime.Intrinsics.Vector128.Zero; - } -} \ No newline at end of file diff --git a/src/External/RawIntrinsics/SSE.cs b/src/External/RawIntrinsics/SSE.cs deleted file mode 100644 index e7c090c..0000000 --- a/src/External/RawIntrinsics/SSE.cs +++ /dev/null @@ -1,766 +0,0 @@ -namespace RawIntrinsics -{ - public static unsafe partial class SSE - { - /// - /// Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// ADDPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_add_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.Add(a.FP32, b.FP32); - - /// - /// Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
- /// - /// ADDSS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_add_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.AddScalar(a.FP32, b.FP32); - - /// - /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// ANDPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_and_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.And(a.FP32, b.FP32); - - /// - /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". - /// - /// ANDNPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_andnot_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.AndNot(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst". - /// - /// CMPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpeq_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareEqual(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CMPSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpeq_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarEqual(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst". 
- /// - /// CMPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpge_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareGreaterThanOrEqual(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CMPSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpge_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarGreaterThanOrEqual(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst". - /// - /// CMPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpgt_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareGreaterThan(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CMPSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpgt_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarGreaterThan(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst". 
- /// - /// CMPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmple_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareLessThanOrEqual(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CMPSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmple_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarLessThanOrEqual(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst". - /// - /// CMPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmplt_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareLessThan(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CMPSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmplt_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarLessThan(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst". 
- /// - /// CMPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpneq_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareNotEqual(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CMPSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpneq_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarNotEqual(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst". - /// - /// CMPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpnge_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareNotGreaterThanOrEqual(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CMPSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpnge_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarNotGreaterThanOrEqual(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst". 
- /// - /// CMPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpngt_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareNotGreaterThan(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CMPSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpngt_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarNotGreaterThan(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst". - /// - /// CMPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpnle_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareNotLessThanOrEqual(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CMPSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpnle_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarNotLessThanOrEqual(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst". 
- /// - /// CMPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareNotLessThan(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CMPSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarNotLessThan(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst". - /// - /// CMPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpord_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareOrdered(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CMPSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpord_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarOrdered(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst". 
- /// - /// CMPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpunord_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareUnordered(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CMPSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_cmpunord_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarUnordered(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). - /// - /// COMISS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool k {UI8} - public static bool _mm_comieq_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarOrderedEqual(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). - /// - /// COMISS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool k {UI8} - public static bool _mm_comige_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarOrderedGreaterThanOrEqual(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). 
- /// - /// COMISS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool k {UI8} - public static bool _mm_comigt_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarOrderedGreaterThan(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). - /// - /// COMISS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool k {UI8} - public static bool _mm_comile_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarOrderedLessThanOrEqual(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). - /// - /// COMISS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool k {UI8} - public static bool _mm_comilt_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarOrderedLessThan(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). - /// - /// COMISS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool k {UI8} - public static bool _mm_comineq_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarOrderedNotEqual(a.FP32, b.FP32); - - /// - /// Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
- /// - /// CVTSI2SS xmm, r32 - /// __m128 {FP32} - /// int {SI32} - /// __m128 dst {FP32} - public static __m128 _mm_cvtsi32_ss(__m128 a, int b) => System.Runtime.Intrinsics.X86.Sse.ConvertScalarToVector128Single(a.FP32, b); - - /// - /// Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CVTSI2SS xmm, r64 - /// __m128 {FP32} - /// long {SI64} - /// __m128 dst {FP32} - public static __m128 _mm_cvtsi64_ss(__m128 a, long b) => System.Runtime.Intrinsics.X86.Sse.X64.ConvertScalarToVector128Single(a.FP32, b); - - /// - /// Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - /// - /// CVTSS2SI r32, xmm - /// __m128 {FP32} - /// int dst {UI32} - public static int _mm_cvtss_si32(__m128 a) => System.Runtime.Intrinsics.X86.Sse.ConvertToInt32(a.FP32); - - /// - /// Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - /// - /// CVTSS2SI r64, xmm - /// __m128 {FP32} - /// long dst {UI64} - public static long _mm_cvtss_si64(__m128 a) => System.Runtime.Intrinsics.X86.Sse.X64.ConvertToInt64(a.FP32); - - /// - /// Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - /// - /// CVTTSS2SI r32, xmm - /// __m128 {FP32} - /// int dst {UI32} - public static int _mm_cvttss_si32(__m128 a) => System.Runtime.Intrinsics.X86.Sse.ConvertToInt32WithTruncation(a.FP32); - - /// - /// Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". 
- /// - /// CVTTSS2SI r64, xmm - /// __m128 {FP32} - /// long dst {UI64} - public static long _mm_cvttss_si64(__m128 a) => System.Runtime.Intrinsics.X86.Sse.X64.ConvertToInt64WithTruncation(a.FP32); - - /// - /// Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - /// - /// DIVPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_div_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.Divide(a.FP32, b.FP32); - - /// - /// Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// DIVSS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_div_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.DivideScalar(a.FP32, b.FP32); - - /// - /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst". "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - /// - /// MOVAPS xmm, m128 - /// float {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_load_ps(float* mem_addr) => System.Runtime.Intrinsics.X86.Sse.LoadAlignedVector128(mem_addr); - - /// - /// Load a single-precision (32-bit) floating-point element from memory into the lower of "dst", and zero the upper 3 elements. "mem_addr" does not need to be aligned on any particular boundary. 
- /// - /// MOVSS xmm, m32 - /// float {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_load_ss(float* mem_addr) => System.Runtime.Intrinsics.X86.Sse.LoadScalarVector128(mem_addr); - - /// - /// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of "dst", and copy the lower 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. - /// - /// MOVHPS xmm, m64 - /// __m128 {FP32} - /// __m64 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_loadh_pi(__m128 a, __m64* mem_addr) => System.Runtime.Intrinsics.X86.Sse.LoadHigh(a.FP32, (float*)mem_addr); - - /// - /// Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of "dst", and copy the upper 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. - /// - /// MOVLPS xmm, m64 - /// __m128 {FP32} - /// __m64 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_loadl_pi(__m128 a, __m64* mem_addr) => System.Runtime.Intrinsics.X86.Sse.LoadLow(a.FP32, (float*)mem_addr); - - /// - /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst". "mem_addr" does not need to be aligned on any particular boundary. - /// - /// MOVUPS xmm, m128 - /// float {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_loadu_ps(float* mem_addr) => System.Runtime.Intrinsics.X86.Sse.LoadVector128(mem_addr); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". 
- /// - /// MAXPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_max_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.Max(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper element of "dst". - /// - /// MAXSS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_max_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.MaxScalar(a.FP32, b.FP32); - - /// - /// Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". - /// - /// MINPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_min_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.Min(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper element of "dst". - /// - /// MINSS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_min_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.MinScalar(a.FP32, b.FP32); - - /// - /// Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
- /// - /// MOVSS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_move_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.MoveScalar(a.FP32, b.FP32); - - /// - /// Move the upper 2 single-precision (32-bit) floating-point elements from "b" to the lower 2 elements of "dst", and copy the upper 2 elements from "a" to the upper 2 elements of "dst". - /// - /// MOVHLPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_movehl_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.MoveHighToLow(a.FP32, b.FP32); - - /// - /// Move the lower 2 single-precision (32-bit) floating-point elements from "b" to the upper 2 elements of "dst", and copy the lower 2 elements from "a" to the lower 2 elements of "dst". - /// - /// MOVLHPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_movelh_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.MoveLowToHigh(a.FP32, b.FP32); - - /// - /// Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a". - /// - /// MOVMSKPS r32, xmm - /// __m128 {FP32} - /// int dst {UI32} - public static int _mm_movemask_ps(__m128 a) => System.Runtime.Intrinsics.X86.Sse.MoveMask(a.FP32); - - /// - /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// MULPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_mul_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.Multiply(a.FP32, b.FP32); - - /// - /// Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
- /// - /// MULSS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_mul_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.MultiplyScalar(a.FP32, b.FP32); - - /// - /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// ORPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_or_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.Or(a.FP32, b.FP32); - - /// - /// Fetch the line of data from memory that contains address "p" to a location in the cache heirarchy specified by the locality hint "i". - /// - /// PREFETCHNTA m8 - /// byte {UI8} - /// int {IMM} - /// void {} - public static void _mm_prefetch(byte* p, int i) => System.Runtime.Intrinsics.X86.Sse.Prefetch0((void*)p); - - /// - /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - /// - /// RCPPS xmm, xmm - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_rcp_ps(__m128 a) => System.Runtime.Intrinsics.X86.Sse.Reciprocal(a.FP32); - - /// - /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - /// - /// RCPSS xmm, xmm - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_rcp_ss(__m128 a) => System.Runtime.Intrinsics.X86.Sse.ReciprocalScalar(a.FP32); - - /// - /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". 
The maximum relative error for this approximation is less than 1.5*2^-12. - /// - /// RSQRTPS xmm, xmm - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_rsqrt_ps(__m128 a) => System.Runtime.Intrinsics.X86.Sse.ReciprocalSqrt(a.FP32); - - /// - /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - /// - /// RSQRTSS xmm, xmm - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_rsqrt_ss(__m128 a) => System.Runtime.Intrinsics.X86.Sse.ReciprocalSqrtScalar(a.FP32); - - /// - /// Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". - /// - /// - /// float {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_set1_ps(float a) => System.Runtime.Intrinsics.Vector128.Create(a); - - /// - /// Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. - /// - /// - /// float {FP32} - /// float {FP32} - /// float {FP32} - /// float {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_setr_ps(float e3, float e2, float e1, float e0) => System.Runtime.Intrinsics.Vector128.Create(e3, e2, e1, e0); - - /// - /// Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes, in program order, is globally visible before any store instruction which follows the fence in program order. - /// - /// SFENCE - /// void {} - public static void _mm_sfence() => System.Runtime.Intrinsics.X86.Sse.StoreFence(); - - /// - /// Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". 
- /// - /// SHUFPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_shuffle_ps(__m128 a, __m128 b, int imm8) => System.Runtime.Intrinsics.X86.Sse.Shuffle(a.FP32, b.FP32, (byte)imm8); - - /// - /// Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - /// - /// SQRTPS xmm, xmm - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_sqrt_ps(__m128 a) => System.Runtime.Intrinsics.X86.Sse.Sqrt(a.FP32); - - /// - /// Compute the square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// SQRTSS xmm, xmm - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_sqrt_ss(__m128 a) => System.Runtime.Intrinsics.X86.Sse.SqrtScalar(a.FP32); - - /// - /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - /// - /// MOVAPS m128, xmm - /// float {FP32} - /// __m128 {FP32} - /// void {} - public static void _mm_store_ps(float* mem_addr, __m128 a) => System.Runtime.Intrinsics.X86.Sse.StoreAligned(mem_addr, a.FP32); - - /// - /// Store the lower single-precision (32-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - /// - /// MOVSS m32, xmm - /// float {FP32} - /// __m128 {FP32} - /// void {} - public static void _mm_store_ss(float* mem_addr, __m128 a) => System.Runtime.Intrinsics.X86.Sse.StoreScalar(mem_addr, a.FP32); - - /// - /// Store the upper 2 single-precision (32-bit) floating-point elements from "a" into memory. 
- /// - /// MOVHPS m64, xmm - /// __m64 {FP32} - /// __m128 {FP32} - /// void {} - public static void _mm_storeh_pi(__m64* mem_addr, __m128 a) => System.Runtime.Intrinsics.X86.Sse.StoreHigh((float*)mem_addr, a.FP32); - - /// - /// Store the lower 2 single-precision (32-bit) floating-point elements from "a" into memory. - /// - /// MOVLPS m64, xmm - /// __m64 {FP32} - /// __m128 {FP32} - /// void {} - public static void _mm_storel_pi(__m64* mem_addr, __m128 a) => System.Runtime.Intrinsics.X86.Sse.StoreLow((float*)mem_addr, a.FP32); - - /// - /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - /// - /// MOVUPS m128, xmm - /// float {FP32} - /// __m128 {FP32} - /// void {} - public static void _mm_storeu_ps(float* mem_addr, __m128 a) => System.Runtime.Intrinsics.X86.Sse.Store(mem_addr, a.FP32); - - /// - /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - /// - /// MOVNTPS m128, xmm - /// float {FP32} - /// __m128 {FP32} - /// void {} - public static void _mm_stream_ps(float* mem_addr, __m128 a) => System.Runtime.Intrinsics.X86.Sse.StoreAlignedNonTemporal(mem_addr, a.FP32); - - /// - /// Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". 
- /// - /// SUBPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_sub_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.Subtract(a.FP32, b.FP32); - - /// - /// Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// SUBSS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_sub_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.SubtractScalar(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - /// - /// UCOMISS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool k {UI8} - public static bool _mm_ucomieq_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarUnorderedEqual(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - /// - /// UCOMISS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool k {UI8} - public static bool _mm_ucomige_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarUnorderedGreaterThanOrEqual(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. 
- /// - /// UCOMISS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool k {UI8} - public static bool _mm_ucomigt_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarUnorderedGreaterThan(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - /// - /// UCOMISS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool k {UI8} - public static bool _mm_ucomile_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarUnorderedLessThanOrEqual(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - /// - /// UCOMISS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool k {UI8} - public static bool _mm_ucomilt_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarUnorderedLessThan(a.FP32, b.FP32); - - /// - /// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - /// - /// UCOMISS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// bool k {UI8} - public static bool _mm_ucomineq_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.CompareScalarUnorderedNotEqual(a.FP32, b.FP32); - - /// - /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half "a" and "b", and store the results in "dst". 
- /// - /// UNPCKHPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_unpackhi_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.UnpackHigh(a.FP32, b.FP32); - - /// - /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". - /// - /// UNPCKLPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_unpacklo_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.UnpackLow(a.FP32, b.FP32); - - /// - /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// XORPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_xor_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.Xor(a.FP32, b.FP32); - - } -} diff --git a/src/External/RawIntrinsics/SSE2.ManuallyAdded.cs b/src/External/RawIntrinsics/SSE2.ManuallyAdded.cs deleted file mode 100644 index de75d41..0000000 --- a/src/External/RawIntrinsics/SSE2.ManuallyAdded.cs +++ /dev/null @@ -1,17 +0,0 @@ -namespace RawIntrinsics -{ - public static partial class SSE2 - { - /// - /// Return vector of type __m128d with all elements set to zero. - /// - /// __m128d dst {M128} - public static __m128d _mm_setzero_pd() => System.Runtime.Intrinsics.Vector128.Zero; - - /// - /// Return vector of type __m128i with all elements set to zero. 
- /// - /// __m128i dst {M128} - public static __m128i _mm_setzero_si128() => System.Runtime.Intrinsics.Vector128.Zero; - } -} \ No newline at end of file diff --git a/src/External/RawIntrinsics/SSE2.cs b/src/External/RawIntrinsics/SSE2.cs deleted file mode 100644 index e25c4fc..0000000 --- a/src/External/RawIntrinsics/SSE2.cs +++ /dev/null @@ -1,1714 +0,0 @@ -namespace RawIntrinsics -{ - public static unsafe partial class SSE2 - { - /// - /// Add packed 16-bit integers in "a" and "b", and store the results in "dst". - /// - /// PADDW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_add_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Add(a.UI16, b.UI16); - - /// - /// Add packed 32-bit integers in "a" and "b", and store the results in "dst". - /// - /// PADDD xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_add_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Add(a.UI32, b.UI32); - - /// - /// Add packed 64-bit integers in "a" and "b", and store the results in "dst". - /// - /// PADDQ xmm, xmm - /// __m128i {UI64} - /// __m128i {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_add_epi64(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Add(a.UI64, b.UI64); - - /// - /// Add packed 8-bit integers in "a" and "b", and store the results in "dst". - /// - /// PADDB xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_add_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Add(a.UI8, b.UI8); - - /// - /// Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". 
- /// - /// ADDPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_add_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.Add(a.FP64, b.FP64); - - /// - /// Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// ADDSD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_add_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.AddScalar(a.FP64, b.FP64); - - /// - /// Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - /// - /// PADDSW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {UI16} - public static __m128i _mm_adds_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.AddSaturate(a.SI16, b.SI16); - - /// - /// Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - /// - /// PADDSB xmm, xmm - /// __m128i {SI8} - /// __m128i {SI8} - /// __m128i dst {UI8} - public static __m128i _mm_adds_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.AddSaturate(a.SI8, b.SI8); - - /// - /// Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - /// - /// PADDUSW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_adds_epu16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.AddSaturate(a.UI16, b.UI16); - - /// - /// Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". 
- /// - /// PADDUSB xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_adds_epu8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.AddSaturate(a.UI8, b.UI8); - - /// - /// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// ANDPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_and_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.And(a.FP64, b.FP64); - - /// - /// Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". - /// - /// PAND xmm, xmm - /// __m128i {M128} - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_and_si128(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.And(a.UI8, b.UI8); - - /// - /// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". - /// - /// ANDNPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_andnot_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.AndNot(a.FP64, b.FP64); - - /// - /// Compute the bitwise NOT of 128 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". - /// - /// PANDN xmm, xmm - /// __m128i {M128} - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_andnot_si128(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.AndNot(a.UI8, b.UI8); - - /// - /// Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". 
- /// - /// PAVGW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_avg_epu16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Average(a.UI16, b.UI16); - - /// - /// Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". - /// - /// PAVGB xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_avg_epu8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Average(a.UI8, b.UI8); - - /// - /// Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". - /// - /// PSLLDQ xmm, imm8 - /// __m128i {M128} - /// int {IMM} - /// __m128i dst {M128} - public static __m128i _mm_bslli_si128(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical128BitLane(a.SI8, (byte)imm8); - - /// - /// Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". - /// - /// PSRLDQ xmm, imm8 - /// __m128i {M128} - /// int {IMM} - /// __m128i dst {M128} - public static __m128i _mm_bsrli_si128(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical128BitLane(a.SI8, (byte)imm8); - - /// - /// Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". - /// - /// PCMPEQW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.CompareEqual(a.UI16, b.UI16); - - /// - /// Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". - /// - /// PCMPEQD xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.CompareEqual(a.UI32, b.UI32); - - /// - /// Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". 
- /// - /// PCMPEQB xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.CompareEqual(a.UI8, b.UI8); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst". - /// - /// CMPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpeq_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareEqual(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// CMPSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpeq_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarEqual(a.FP64, b.FP64); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst". - /// - /// CMPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpge_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareGreaterThanOrEqual(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
- /// - /// CMPSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpge_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarGreaterThanOrEqual(a.FP64, b.FP64); - - /// - /// Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". - /// - /// PCMPGTW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {UI16} - public static __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.CompareGreaterThan(a.SI16, b.SI16); - - /// - /// Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". - /// - /// PCMPGTD xmm, xmm - /// __m128i {SI32} - /// __m128i {SI32} - /// __m128i dst {UI32} - public static __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.CompareGreaterThan(a.SI32, b.SI32); - - /// - /// Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". - /// - /// PCMPGTB xmm, xmm - /// __m128i {SI8} - /// __m128i {SI8} - /// __m128i dst {UI8} - public static __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.CompareGreaterThan(a.SI8, b.SI8); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst". - /// - /// CMPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpgt_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareGreaterThan(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
- /// - /// CMPSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpgt_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarGreaterThan(a.FP64, b.FP64); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst". - /// - /// CMPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmple_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareLessThanOrEqual(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// CMPSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmple_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarLessThanOrEqual(a.FP64, b.FP64); - - /// - /// Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched. - /// - /// PCMPGTW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {UI16} - public static __m128i _mm_cmplt_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.CompareLessThan(a.SI16, b.SI16); - - /// - /// Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched. 
- /// - /// PCMPGTD xmm, xmm - /// __m128i {SI32} - /// __m128i {SI32} - /// __m128i dst {UI32} - public static __m128i _mm_cmplt_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.CompareLessThan(a.SI32, b.SI32); - - /// - /// Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched. - /// - /// PCMPGTB xmm, xmm - /// __m128i {SI8} - /// __m128i {SI8} - /// __m128i dst {UI8} - public static __m128i _mm_cmplt_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.CompareLessThan(a.SI8, b.SI8); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst". - /// - /// CMPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmplt_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareLessThan(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// CMPSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmplt_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarLessThan(a.FP64, b.FP64); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst". 
- /// - /// CMPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpneq_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareNotEqual(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// CMPSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpneq_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarNotEqual(a.FP64, b.FP64); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst". - /// - /// CMPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpnge_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareNotGreaterThanOrEqual(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// CMPSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpnge_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarNotGreaterThanOrEqual(a.FP64, b.FP64); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst". 
- /// - /// CMPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpngt_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareNotGreaterThan(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// CMPSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpngt_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarNotGreaterThan(a.FP64, b.FP64); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst". - /// - /// CMPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpnle_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareNotLessThanOrEqual(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// CMPSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpnle_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarNotLessThanOrEqual(a.FP64, b.FP64); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst". 
- /// - /// CMPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareNotLessThan(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// CMPSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarNotLessThan(a.FP64, b.FP64); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst". - /// - /// CMPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpord_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareOrdered(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// CMPSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpord_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarOrdered(a.FP64, b.FP64); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst". 
- /// - /// CMPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpunord_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareUnordered(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// CMPSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_cmpunord_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarUnordered(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). - /// - /// COMISD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool k {UI8} - public static bool _mm_comieq_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarOrderedEqual(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). - /// - /// COMISD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool k {UI8} - public static bool _mm_comige_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarOrderedGreaterThanOrEqual(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). 
- /// - /// COMISD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool k {UI8} - public static bool _mm_comigt_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarOrderedGreaterThan(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). - /// - /// COMISD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool k {UI8} - public static bool _mm_comile_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarOrderedLessThanOrEqual(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). - /// - /// COMISD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool k {UI8} - public static bool _mm_comilt_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarOrderedLessThan(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). - /// - /// COMISD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool k {UI8} - public static bool _mm_comineq_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarOrderedNotEqual(a.FP64, b.FP64); - - /// - /// Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - /// - /// CVTDQ2PD xmm, xmm - /// __m128i {SI32} - /// __m128d dst {FP64} - public static __m128d _mm_cvtepi32_pd(__m128i a) => System.Runtime.Intrinsics.X86.Sse2.ConvertToVector128Double(a.SI32); - - /// - /// Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". 
- /// - /// CVTDQ2PS xmm, xmm - /// __m128i {SI32} - /// __m128 dst {FP32} - public static __m128 _mm_cvtepi32_ps(__m128i a) => System.Runtime.Intrinsics.X86.Sse2.ConvertToVector128Single(a.SI32); - - /// - /// Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - /// - /// CVTPD2DQ xmm, xmm - /// __m128d {FP64} - /// __m128i dst {UI32} - public static __m128i _mm_cvtpd_epi32(__m128d a) => System.Runtime.Intrinsics.X86.Sse2.ConvertToVector128Int32(a.FP64); - - /// - /// Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - /// - /// CVTPD2PS xmm, xmm - /// __m128d {FP64} - /// __m128 dst {FP32} - public static __m128 _mm_cvtpd_ps(__m128d a) => System.Runtime.Intrinsics.X86.Sse2.ConvertToVector128Single(a.FP64); - - /// - /// Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - /// - /// CVTPS2DQ xmm, xmm - /// __m128 {FP32} - /// __m128i dst {UI32} - public static __m128i _mm_cvtps_epi32(__m128 a) => System.Runtime.Intrinsics.X86.Sse2.ConvertToVector128Int32(a.FP32); - - /// - /// Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - /// - /// CVTPS2PD xmm, xmm - /// __m128 {FP32} - /// __m128d dst {FP64} - public static __m128d _mm_cvtps_pd(__m128 a) => System.Runtime.Intrinsics.X86.Sse2.ConvertToVector128Double(a.FP32); - - /// - /// Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". 
- /// - /// CVTSD2SI r32, xmm - /// __m128d {FP64} - /// int dst {UI32} - public static int _mm_cvtsd_si32(__m128d a) => System.Runtime.Intrinsics.X86.Sse2.ConvertToInt32(a.FP64); - - /// - /// Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - /// - /// CVTSD2SI r64, xmm - /// __m128d {FP64} - /// long dst {UI64} - public static long _mm_cvtsd_si64(__m128d a) => System.Runtime.Intrinsics.X86.Sse2.X64.ConvertToInt64(a.FP64); - - /// - /// Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// CVTSD2SS xmm, xmm - /// __m128 {FP32} - /// __m128d {FP64} - /// __m128 dst {FP32} - public static __m128 _mm_cvtsd_ss(__m128 a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.ConvertScalarToVector128Single(a.FP32, b.FP64); - - /// - /// Copy the lower 32-bit integer in "a" to "dst". - /// - /// MOVD r32, xmm - /// __m128i {UI32} - /// int dst {UI32} - public static int _mm_cvtsi128_si32(__m128i a) => (int)System.Runtime.Intrinsics.X86.Sse2.ConvertToUInt32(a.UI32); - - /// - /// Copy the lower 64-bit integer in "a" to "dst". - /// - /// MOVQ r64, xmm - /// __m128i {UI64} - /// long dst {UI64} - public static long _mm_cvtsi128_si64(__m128i a) => (long)System.Runtime.Intrinsics.X86.Sse2.X64.ConvertToUInt64(a.UI64); - - /// - /// Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
- /// - /// CVTSI2SD xmm, r32 - /// __m128d {FP64} - /// int {SI32} - /// __m128d dst {FP64} - public static __m128d _mm_cvtsi32_sd(__m128d a, int b) => System.Runtime.Intrinsics.X86.Sse2.ConvertScalarToVector128Double(a.FP64, b); - - /// - /// Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst". - /// - /// MOVD xmm, r32 - /// int {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_cvtsi32_si128(int a) => System.Runtime.Intrinsics.X86.Sse2.ConvertScalarToVector128UInt32((uint)a); - - /// - /// Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// CVTSI2SD xmm, r64 - /// __m128d {FP64} - /// long {SI64} - /// __m128d dst {FP64} - public static __m128d _mm_cvtsi64_sd(__m128d a, long b) => System.Runtime.Intrinsics.X86.Sse2.X64.ConvertScalarToVector128Double(a.FP64, b); - - /// - /// Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element. - /// - /// MOVQ xmm, r64 - /// long {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_cvtsi64_si128(long a) => System.Runtime.Intrinsics.X86.Sse2.X64.ConvertScalarToVector128UInt64((ulong)a); - - /// - /// Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// CVTSS2SD xmm, xmm - /// __m128d {FP64} - /// __m128 {FP32} - /// __m128d dst {FP64} - public static __m128d _mm_cvtss_sd(__m128d a, __m128 b) => System.Runtime.Intrinsics.X86.Sse2.ConvertScalarToVector128Double(a.FP64, b.FP32); - - /// - /// Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". 
- /// - /// CVTTPD2DQ xmm, xmm - /// __m128d {FP64} - /// __m128i dst {UI32} - public static __m128i _mm_cvttpd_epi32(__m128d a) => System.Runtime.Intrinsics.X86.Sse2.ConvertToVector128Int32WithTruncation(a.FP64); - - /// - /// Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - /// - /// CVTTPS2DQ xmm, xmm - /// __m128 {FP32} - /// __m128i dst {UI32} - public static __m128i _mm_cvttps_epi32(__m128 a) => System.Runtime.Intrinsics.X86.Sse2.ConvertToVector128Int32WithTruncation(a.FP32); - - /// - /// Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - /// - /// CVTTSD2SI r32, xmm - /// __m128d {FP64} - /// int dst {UI32} - public static int _mm_cvttsd_si32(__m128d a) => System.Runtime.Intrinsics.X86.Sse2.ConvertToInt32WithTruncation(a.FP64); - - /// - /// Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - /// - /// CVTTSD2SI r64, xmm - /// __m128d {FP64} - /// long dst {UI64} - public static long _mm_cvttsd_si64(__m128d a) => System.Runtime.Intrinsics.X86.Sse2.X64.ConvertToInt64WithTruncation(a.FP64); - - /// - /// Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - /// - /// DIVPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_div_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.Divide(a.FP64, b.FP64); - - /// - /// Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
- /// - /// DIVSD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_div_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.DivideScalar(a.FP64, b.FP64); - - /// - /// Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". - /// - /// PEXTRW r32, xmm, imm8 - /// __m128i {UI16} - /// int {IMM} - /// int dst {UI16} - public static int _mm_extract_epi16(__m128i a, int imm8) => (int)System.Runtime.Intrinsics.X86.Sse2.Extract(a.UI16, (byte)imm8); - - /// - /// Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". - /// - /// PINSRW xmm, r32, imm8 - /// __m128i {UI16} - /// int {UI16} - /// int {IMM} - /// __m128i dst {UI16} - public static __m128i _mm_insert_epi16(__m128i a, int i, int imm8) => System.Runtime.Intrinsics.X86.Sse2.Insert(a.UI16, (ushort)i, (byte)imm8); - - /// - /// Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. Guarantees that every load instruction that precedes, in program order, is globally visible before any load instruction which follows the fence in program order. - /// - /// LFENCE - /// void {} - public static void _mm_lfence() => System.Runtime.Intrinsics.X86.Sse2.LoadFence(); - - /// - /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - /// - /// MOVAPD xmm, m128 - /// double {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_load_pd(double* mem_addr) => System.Runtime.Intrinsics.X86.Sse2.LoadAlignedVector128(mem_addr); - - /// - /// Load a double-precision (64-bit) floating-point element from memory into the lower of "dst", and zero the upper element. "mem_addr" does not need to be aligned on any particular boundary. 
- /// - /// MOVSD xmm, m64 - /// double {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_load_sd(double* mem_addr) => System.Runtime.Intrinsics.X86.Sse2.LoadScalarVector128(mem_addr); - - /// - /// Load 128-bits of integer data from memory into "dst". "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - /// - /// MOVDQA xmm, m128 - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_load_si128(__m128i* mem_addr) => System.Runtime.Intrinsics.X86.Sse2.LoadAlignedVector128((sbyte*)mem_addr); - - /// - /// Load a double-precision (64-bit) floating-point element from memory into the upper element of "dst", and copy the lower element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. - /// - /// MOVHPD xmm, m64 - /// __m128d {FP64} - /// double {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_loadh_pd(__m128d a, double* mem_addr) => System.Runtime.Intrinsics.X86.Sse2.LoadHigh(a.FP64, mem_addr); - - /// - /// Load 64-bit integer from memory into the first element of "dst". - /// - /// MOVQ xmm, m64 - /// __m128i {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_loadl_epi64(__m128i* mem_addr) => System.Runtime.Intrinsics.X86.Sse2.LoadScalarVector128((long*)mem_addr); - - /// - /// Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and copy the upper element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. - /// - /// MOVLPD xmm, m64 - /// __m128d {FP64} - /// double {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_loadl_pd(__m128d a, double* mem_addr) => System.Runtime.Intrinsics.X86.Sse2.LoadLow(a.FP64, mem_addr); - - /// - /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". "mem_addr" does not need to be aligned on any particular boundary. 
- /// - /// MOVUPD xmm, m128 - /// double {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_loadu_pd(double* mem_addr) => System.Runtime.Intrinsics.X86.Sse2.LoadVector128(mem_addr); - - /// - /// Load 128-bits of integer data from memory into "dst". "mem_addr" does not need to be aligned on any particular boundary. - /// - /// MOVDQU xmm, m128 - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_loadu_si128(__m128i* mem_addr) => System.Runtime.Intrinsics.X86.Sse2.LoadVector128((sbyte*)mem_addr); - - /// - /// Load unaligned 32-bit integer from memory into the first element of "dst". - /// - /// MOVD xmm, m32 - /// void {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_loadu_si32(void* mem_addr) => System.Runtime.Intrinsics.X86.Sse2.LoadScalarVector128((int*)mem_addr); - - /// - /// Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". - /// - /// PMADDWD xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {SI32} - public static __m128i _mm_madd_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.MultiplyAddAdjacent(a.SI16, b.SI16); - - /// - /// Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. "mem_addr" does not need to be aligned on any particular boundary. - /// - /// MASKMOVDQU xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// byte {UI8} - /// void {} - public static void _mm_maskmoveu_si128(__m128i a, __m128i mask, byte* mem_addr) => System.Runtime.Intrinsics.X86.Sse2.MaskMove(a.UI8, mask.UI8, mem_addr); - - /// - /// Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". 
- /// - /// PMAXSW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {UI16} - public static __m128i _mm_max_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Max(a.SI16, b.SI16); - - /// - /// Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". - /// - /// PMAXUB xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_max_epu8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Max(a.UI8, b.UI8); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". - /// - /// MAXPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_max_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.Max(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// MAXSD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_max_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.MaxScalar(a.FP64, b.FP64); - - /// - /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order. - /// - /// MFENCE - /// void {} - public static void _mm_mfence() => System.Runtime.Intrinsics.X86.Sse2.MemoryFence(); - - /// - /// Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". 
- /// - /// PMINSW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {UI16} - public static __m128i _mm_min_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Min(a.SI16, b.SI16); - - /// - /// Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". - /// - /// PMINUB xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_min_epu8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Min(a.UI8, b.UI8); - - /// - /// Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". - /// - /// MINPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_min_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.Min(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// MINSD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_min_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.MinScalar(a.FP64, b.FP64); - - /// - /// Copy the lower 64-bit integer in "a" to the lower element of "dst", and zero the upper element. - /// - /// MOVQ xmm, xmm - /// __m128i {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_move_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Sse2.MoveScalar(a.UI64); - - /// - /// Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
- /// - /// MOVSD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_move_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.MoveScalar(a.FP64, b.FP64); - - /// - /// Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". - /// - /// PMOVMSKB r32, xmm - /// __m128i {UI8} - /// int dst {MASK} - public static int _mm_movemask_epi8(__m128i a) => System.Runtime.Intrinsics.X86.Sse2.MoveMask(a.UI8); - - /// - /// Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a". - /// - /// MOVMSKPD r32, xmm - /// __m128d {FP64} - /// int dst {MASK} - public static int _mm_movemask_pd(__m128d a) => System.Runtime.Intrinsics.X86.Sse2.MoveMask(a.FP64); - - /// - /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". - /// - /// PMULUDQ xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI64} - public static __m128i _mm_mul_epu32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Multiply(a.UI32, b.UI32); - - /// - /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// MULPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_mul_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.Multiply(a.FP64, b.FP64); - - /// - /// Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
- /// - /// MULSD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_mul_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.MultiplyScalar(a.FP64, b.FP64); - - /// - /// Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - /// - /// PMULHW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {UI16} - public static __m128i _mm_mulhi_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.MultiplyHigh(a.SI16, b.SI16); - - /// - /// Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - /// - /// PMULHUW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_mulhi_epu16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.MultiplyHigh(a.UI16, b.UI16); - - /// - /// Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". - /// - /// PMULLW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_mullo_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.MultiplyLow(a.UI16, b.UI16); - - /// - /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// ORPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_or_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.Or(a.FP64, b.FP64); - - /// - /// Compute the bitwise OR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". 
- /// - /// POR xmm, xmm - /// __m128i {M128} - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_or_si128(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Or(a.UI8, b.UI8); - - /// - /// Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". - /// - /// PACKSSWB xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {SI8} - public static __m128i _mm_packs_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.PackSignedSaturate(a.SI16, b.SI16); - - /// - /// Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". - /// - /// PACKSSDW xmm, xmm - /// __m128i {SI32} - /// __m128i {SI32} - /// __m128i dst {SI16} - public static __m128i _mm_packs_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.PackSignedSaturate(a.SI32, b.SI32); - - /// - /// Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". - /// - /// PACKUSWB xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {UI8} - public static __m128i _mm_packus_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.PackUnsignedSaturate(a.SI16, b.SI16); - - /// - /// Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". - /// - /// PSADBW xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI16} - public static __m128i _mm_sad_epu8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.SumAbsoluteDifferences(a.UI8, b.UI8); - - /// - /// Broadcast 16-bit integer "a" to all all elements of "dst". This intrinsic may generate "vpbroadcastw". 
- /// - /// - /// short {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_set1_epi16(short a) => System.Runtime.Intrinsics.Vector128.Create((ushort)a); - - /// - /// Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastd". - /// - /// - /// int {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_set1_epi32(int a) => System.Runtime.Intrinsics.Vector128.Create((uint)a); - - /// - /// Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq". - /// - /// - /// long {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_set1_epi64x(long a) => System.Runtime.Intrinsics.Vector128.Create((ulong)a); - - /// - /// Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastb". - /// - /// - /// byte {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_set1_epi8(byte a) => System.Runtime.Intrinsics.Vector128.Create(a); - - /// - /// Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". - /// - /// - /// double {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_set1_pd(double a) => System.Runtime.Intrinsics.Vector128.Create(a); - - /// - /// Set packed 16-bit integers in "dst" with the supplied values in reverse order. - /// - /// - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// short {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_setr_epi16(short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) => System.Runtime.Intrinsics.Vector128.Create((ushort)e7, (ushort)e6, (ushort)e5, (ushort)e4, (ushort)e3, (ushort)e2, (ushort)e1, (ushort)e0); - - /// - /// Set packed 32-bit integers in "dst" with the supplied values in reverse order. 
- /// - /// - /// int {UI32} - /// int {UI32} - /// int {UI32} - /// int {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_setr_epi32(int e3, int e2, int e1, int e0) => System.Runtime.Intrinsics.Vector128.Create((uint)e3, (uint)e2, (uint)e1, (uint)e0); - - /// - /// Set packed 64-bit integers in "dst" with the supplied values in reverse order. - /// - /// - /// __m64 {UI64} - /// __m64 {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_setr_epi64(__m64 e1, __m64 e0) => System.Runtime.Intrinsics.Vector128.Create(e1.SI32, e0.SI32); - - /// - /// Set packed 8-bit integers in "dst" with the supplied values in reverse order. - /// - /// - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// byte {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_setr_epi8(byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8, byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) => System.Runtime.Intrinsics.Vector128.Create(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0); - - /// - /// Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. - /// - /// - /// double {FP64} - /// double {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_setr_pd(double e1, double e0) => System.Runtime.Intrinsics.Vector128.Create(e1, e0); - - /// - /// Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst". 
- /// - /// PSHUFD xmm, xmm, imm8 - /// __m128i {UI32} - /// int {IMM} - /// __m128i dst {UI32} - public static __m128i _mm_shuffle_epi32(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.Shuffle(a.UI32, (byte)imm8); - - /// - /// Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst". - /// - /// SHUFPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm8) => System.Runtime.Intrinsics.X86.Sse2.Shuffle(a.FP64, b.FP64, (byte)imm8); - - /// - /// Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from from "a" to "dst". - /// - /// PSHUFHW xmm, xmm, imm8 - /// __m128i {UI16} - /// int {IMM} - /// __m128i dst {UI16} - public static __m128i _mm_shufflehi_epi16(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.ShuffleHigh(a.UI16, (byte)imm8); - - /// - /// Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from from "a" to "dst". - /// - /// PSHUFLW xmm, xmm, imm8 - /// __m128i {UI16} - /// int {IMM} - /// __m128i dst {UI16} - public static __m128i _mm_shufflelo_epi16(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.ShuffleLow(a.UI16, (byte)imm8); - - /// - /// Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - /// - /// PSLLW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_sll_epi16(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(a.UI16, count.UI16); - - /// - /// Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". 
- /// - /// PSLLD xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_sll_epi32(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(a.UI32, count.UI32); - - /// - /// Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - /// - /// PSLLQ xmm, xmm - /// __m128i {UI64} - /// __m128i {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_sll_epi64(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(a.UI64, count.UI64); - - /// - /// Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - /// - /// PSLLW xmm, imm8 - /// __m128i {UI16} - /// int {IMM} - /// __m128i dst {UI16} - public static __m128i _mm_slli_epi16(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(a.UI16, (byte)imm8); - - /// - /// Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - /// - /// PSLLD xmm, imm8 - /// __m128i {UI32} - /// int {IMM} - /// __m128i dst {UI32} - public static __m128i _mm_slli_epi32(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(a.UI32, (byte)imm8); - - /// - /// Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - /// - /// PSLLQ xmm, imm8 - /// __m128i {UI64} - /// int {IMM} - /// __m128i dst {UI64} - public static __m128i _mm_slli_epi64(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(a.UI64, (byte)imm8); - - /// - /// Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". 
- /// - /// SQRTPD xmm, xmm - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_sqrt_pd(__m128d a) => System.Runtime.Intrinsics.X86.Sse2.Sqrt(a.FP64); - - /// - /// Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// SQRTSD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_sqrt_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.SqrtScalar(a.FP64); - - /// - /// Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - /// - /// PSRAW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_sra_epi16(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Sse2.ShiftRightArithmetic(a.SI16, count.SI16); - - /// - /// Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - /// - /// PSRAD xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_sra_epi32(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Sse2.ShiftRightArithmetic(a.SI32, count.SI32); - - /// - /// Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - /// - /// PSRAW xmm, imm8 - /// __m128i {SI16} - /// int {IMM} - /// __m128i dst {UI16} - public static __m128i _mm_srai_epi16(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.ShiftRightArithmetic(a.SI16, (byte)imm8); - - /// - /// Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". 
- /// - /// PSRAD xmm, imm8 - /// __m128i {SI32} - /// int {IMM} - /// __m128i dst {UI32} - public static __m128i _mm_srai_epi32(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.ShiftRightArithmetic(a.SI32, (byte)imm8); - - /// - /// Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - /// - /// PSRLW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_srl_epi16(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(a.UI16, count.UI16); - - /// - /// Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - /// - /// PSRLD xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_srl_epi32(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(a.UI32, count.UI32); - - /// - /// Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - /// - /// PSRLQ xmm, xmm - /// __m128i {UI64} - /// __m128i {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_srl_epi64(__m128i a, __m128i count) => System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(a.UI64, count.UI64); - - /// - /// Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - /// - /// PSRLW xmm, imm8 - /// __m128i {UI16} - /// int {IMM} - /// __m128i dst {UI16} - public static __m128i _mm_srli_epi16(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(a.UI16, (byte)imm8); - - /// - /// Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". 
- /// - /// PSRLD xmm, imm8 - /// __m128i {UI32} - /// int {IMM} - /// __m128i dst {UI32} - public static __m128i _mm_srli_epi32(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(a.UI32, (byte)imm8); - - /// - /// Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - /// - /// PSRLQ xmm, imm8 - /// __m128i {UI64} - /// int {IMM} - /// __m128i dst {UI64} - public static __m128i _mm_srli_epi64(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(a.UI64, (byte)imm8); - - /// - /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - /// - /// MOVAPD m128, xmm - /// double {FP64} - /// __m128d {FP64} - /// void {} - public static void _mm_store_pd(double* mem_addr, __m128d a) => System.Runtime.Intrinsics.X86.Sse2.StoreAligned(mem_addr, a.FP64); - - /// - /// Store the lower double-precision (64-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - /// - /// MOVSD m64, xmm - /// double {FP64} - /// __m128d {FP64} - /// void {} - public static void _mm_store_sd(double* mem_addr, __m128d a) => System.Runtime.Intrinsics.X86.Sse2.StoreScalar(mem_addr, a.FP64); - - /// - /// Store 128-bits of integer data from "a" into memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - /// - /// MOVDQA m128, xmm - /// __m128i {M128} - /// __m128i {M128} - /// void {} - public static void _mm_store_si128(__m128i* mem_addr, __m128i a) => System.Runtime.Intrinsics.X86.Sse2.StoreAligned((sbyte*)mem_addr, a.SI8); - - /// - /// Store the upper double-precision (64-bit) floating-point element from "a" into memory. 
- /// - /// MOVHPD m64, xmm - /// double {FP64} - /// __m128d {FP64} - /// void {} - public static void _mm_storeh_pd(double* mem_addr, __m128d a) => System.Runtime.Intrinsics.X86.Sse2.StoreHigh(mem_addr, a.FP64); - - /// - /// Store 64-bit integer from the first element of "a" into memory. - /// - /// MOVQ m64, xmm - /// __m128i {UI64} - /// __m128i {UI64} - /// void {} - public static void _mm_storel_epi64(__m128i* mem_addr, __m128i a) => System.Runtime.Intrinsics.X86.Sse2.StoreScalar((long*)mem_addr, a.SI64); - - /// - /// Store the lower double-precision (64-bit) floating-point element from "a" into memory. - /// - /// MOVLPD m64, xmm - /// double {FP64} - /// __m128d {FP64} - /// void {} - public static void _mm_storel_pd(double* mem_addr, __m128d a) => System.Runtime.Intrinsics.X86.Sse2.StoreLow(mem_addr, a.FP64); - - /// - /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - /// - /// MOVUPD m128, xmm - /// double {FP64} - /// __m128d {FP64} - /// void {} - public static void _mm_storeu_pd(double* mem_addr, __m128d a) => System.Runtime.Intrinsics.X86.Sse2.Store(mem_addr, a.FP64); - - /// - /// Store 128-bits of integer data from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - /// - /// MOVDQU m128, xmm - /// __m128i {M128} - /// __m128i {M128} - /// void {} - public static void _mm_storeu_si128(__m128i* mem_addr, __m128i a) => System.Runtime.Intrinsics.X86.Sse2.Store((sbyte*)mem_addr, a.SI8); - - /// - /// Store 32-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. 
- /// - /// MOVD m32, xmm - /// void {UI32} - /// __m128i {UI32} - /// void {} - public static void _mm_storeu_si32(void* mem_addr, __m128i a) => System.Runtime.Intrinsics.X86.Sse2.StoreScalar((int*)mem_addr, a.SI32); - - /// - /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - /// - /// MOVNTPD m128, xmm - /// double {FP64} - /// __m128d {FP64} - /// void {} - public static void _mm_stream_pd(double* mem_addr, __m128d a) => System.Runtime.Intrinsics.X86.Sse2.StoreAlignedNonTemporal(mem_addr, a.FP64); - - /// - /// Store 128-bits of integer data from "a" into memory using a non-temporal memory hint. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - /// - /// MOVNTDQ m128, xmm - /// __m128i {M128} - /// __m128i {M128} - /// void {} - public static void _mm_stream_si128(__m128i* mem_addr, __m128i a) => System.Runtime.Intrinsics.X86.Sse2.StoreAlignedNonTemporal((sbyte*)mem_addr, a.SI8); - - /// - /// Store 32-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated. - /// - /// MOVNTI m32, r32 - /// int {UI32} - /// int {UI32} - /// void {} - public static void _mm_stream_si32(int* mem_addr, int a) => System.Runtime.Intrinsics.X86.Sse2.StoreNonTemporal((uint*)mem_addr, (uint)a); - - /// - /// Store 64-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated. 
- /// - /// MOVNTI m64, r64 - /// long {UI64} - /// long {UI64} - /// void {} - public static void _mm_stream_si64(long* mem_addr, long a) => System.Runtime.Intrinsics.X86.Sse2.X64.StoreNonTemporal((ulong*)mem_addr, (ulong)a); - - /// - /// Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". - /// - /// PSUBW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_sub_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Subtract(a.UI16, b.UI16); - - /// - /// Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". - /// - /// PSUBD xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_sub_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Subtract(a.UI32, b.UI32); - - /// - /// Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". - /// - /// PSUBQ xmm, xmm - /// __m128i {UI64} - /// __m128i {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_sub_epi64(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Subtract(a.UI64, b.UI64); - - /// - /// Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". - /// - /// PSUBB xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_sub_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Subtract(a.UI8, b.UI8); - - /// - /// Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". 
- /// - /// SUBPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_sub_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.Subtract(a.FP64, b.FP64); - - /// - /// Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// SUBSD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_sub_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.SubtractScalar(a.FP64, b.FP64); - - /// - /// Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". - /// - /// PSUBSW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {UI16} - public static __m128i _mm_subs_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.SubtractSaturate(a.SI16, b.SI16); - - /// - /// Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". - /// - /// PSUBSB xmm, xmm - /// __m128i {SI8} - /// __m128i {SI8} - /// __m128i dst {UI8} - public static __m128i _mm_subs_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.SubtractSaturate(a.SI8, b.SI8); - - /// - /// Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". - /// - /// PSUBUSW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_subs_epu16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.SubtractSaturate(a.UI16, b.UI16); - - /// - /// Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". 
- /// - /// PSUBUSB xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_subs_epu8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.SubtractSaturate(a.UI8, b.UI8); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - /// - /// UCOMISD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool k {UI8} - public static bool _mm_ucomieq_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarUnorderedEqual(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - /// - /// UCOMISD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool k {UI8} - public static bool _mm_ucomige_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarUnorderedGreaterThanOrEqual(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - /// - /// UCOMISD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool k {UI8} - public static bool _mm_ucomigt_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarUnorderedGreaterThan(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. 
- /// - /// UCOMISD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool k {UI8} - public static bool _mm_ucomile_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarUnorderedLessThanOrEqual(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - /// - /// UCOMISD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool k {UI8} - public static bool _mm_ucomilt_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarUnorderedLessThan(a.FP64, b.FP64); - - /// - /// Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - /// - /// UCOMISD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// bool k {UI8} - public static bool _mm_ucomineq_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.CompareScalarUnorderedNotEqual(a.FP64, b.FP64); - - /// - /// Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". - /// - /// PUNPCKHWD xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(a.UI16, b.UI16); - - /// - /// Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". - /// - /// PUNPCKHDQ xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(a.UI32, b.UI32); - - /// - /// Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst". 
- /// - /// PUNPCKHQDQ xmm, xmm - /// __m128i {UI64} - /// __m128i {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(a.UI64, b.UI64); - - /// - /// Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". - /// - /// PUNPCKHBW xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(a.UI8, b.UI8); - - /// - /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst". - /// - /// UNPCKHPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_unpackhi_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(a.FP64, b.FP64); - - /// - /// Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". - /// - /// PUNPCKLWD xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.UnpackLow(a.UI16, b.UI16); - - /// - /// Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". - /// - /// PUNPCKLDQ xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.UnpackLow(a.UI32, b.UI32); - - /// - /// Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst". 
- /// - /// PUNPCKLQDQ xmm, xmm - /// __m128i {UI64} - /// __m128i {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.UnpackLow(a.UI64, b.UI64); - - /// - /// Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". - /// - /// PUNPCKLBW xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.UnpackLow(a.UI8, b.UI8); - - /// - /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". - /// - /// UNPCKLPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_unpacklo_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.UnpackLow(a.FP64, b.FP64); - - /// - /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - /// - /// XORPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_xor_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse2.Xor(a.FP64, b.FP64); - - /// - /// Compute the bitwise XOR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". 
- /// - /// PXOR xmm, xmm - /// __m128i {M128} - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_xor_si128(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse2.Xor(a.UI8, b.UI8); - - } -} diff --git a/src/External/RawIntrinsics/SSE3.cs b/src/External/RawIntrinsics/SSE3.cs deleted file mode 100644 index de71b53..0000000 --- a/src/External/RawIntrinsics/SSE3.cs +++ /dev/null @@ -1,100 +0,0 @@ -namespace RawIntrinsics -{ - public static unsafe partial class SSE3 - { - /// - /// Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". - /// - /// ADDSUBPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_addsub_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse3.AddSubtract(a.FP64, b.FP64); - - /// - /// Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". - /// - /// ADDSUBPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_addsub_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse3.AddSubtract(a.FP32, b.FP32); - - /// - /// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". - /// - /// HADDPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_hadd_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse3.HorizontalAdd(a.FP64, b.FP64); - - /// - /// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". 
- /// - /// HADDPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_hadd_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse3.HorizontalAdd(a.FP32, b.FP32); - - /// - /// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". - /// - /// HSUBPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_hsub_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse3.HorizontalSubtract(a.FP64, b.FP64); - - /// - /// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". - /// - /// HSUBPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_hsub_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse3.HorizontalSubtract(a.FP32, b.FP32); - - /// - /// Load 128-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm_loadu_si128" when the data crosses a cache line boundary. - /// - /// LDDQU xmm, m128 - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_lddqu_si128(__m128i* mem_addr) => System.Runtime.Intrinsics.X86.Sse3.LoadDquVector128((sbyte*)mem_addr); - - /// - /// Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". - /// - /// MOVDDUP xmm, m64 - /// double {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_loaddup_pd(double* mem_addr) => System.Runtime.Intrinsics.X86.Sse3.LoadAndDuplicateToVector128(mem_addr); - - /// - /// Duplicate the low double-precision (64-bit) floating-point element from "a", and store the results in "dst". 
- /// - /// MOVDDUP xmm, xmm - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_movedup_pd(__m128d a) => System.Runtime.Intrinsics.X86.Sse3.MoveAndDuplicate(a.FP64); - - /// - /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". - /// - /// MOVSHDUP xmm, xmm - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_movehdup_ps(__m128 a) => System.Runtime.Intrinsics.X86.Sse3.MoveHighAndDuplicate(a.FP32); - - /// - /// Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". - /// - /// MOVSLDUP xmm, xmm - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_moveldup_ps(__m128 a) => System.Runtime.Intrinsics.X86.Sse3.MoveLowAndDuplicate(a.FP32); - - } -} diff --git a/src/External/RawIntrinsics/SSE41.cs b/src/External/RawIntrinsics/SSE41.cs deleted file mode 100644 index 105b0a8..0000000 --- a/src/External/RawIntrinsics/SSE41.cs +++ /dev/null @@ -1,525 +0,0 @@ -namespace RawIntrinsics -{ - public static unsafe partial class SSE41 - { - /// - /// Blend packed 16-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". - /// - /// PBLENDW xmm, xmm, imm8 - /// __m128i {UI16} - /// __m128i {UI16} - /// int {IMM} - /// __m128i dst {UI16} - public static __m128i _mm_blend_epi16(__m128i a, __m128i b, int imm8) => System.Runtime.Intrinsics.X86.Sse41.Blend(a.UI16, b.UI16, (byte)imm8); - - /// - /// Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". 
- /// - /// BLENDPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm_blend_pd(__m128d a, __m128d b, int imm8) => System.Runtime.Intrinsics.X86.Sse41.Blend(a.FP64, b.FP64, (byte)imm8); - - /// - /// Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". - /// - /// BLENDPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_blend_ps(__m128 a, __m128 b, int imm8) => System.Runtime.Intrinsics.X86.Sse41.Blend(a.FP32, b.FP32, (byte)imm8); - - /// - /// Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst". - /// - /// PBLENDVB xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_blendv_epi8(__m128i a, __m128i b, __m128i mask) => System.Runtime.Intrinsics.X86.Sse41.BlendVariable(a.UI8, b.UI8, mask.UI8); - - /// - /// Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". - /// - /// BLENDVPD xmm, xmm - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_blendv_pd(__m128d a, __m128d b, __m128d mask) => System.Runtime.Intrinsics.X86.Sse41.BlendVariable(a.FP64, b.FP64, mask.FP64); - - /// - /// Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". 
- /// - /// BLENDVPS xmm, xmm - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask) => System.Runtime.Intrinsics.X86.Sse41.BlendVariable(a.FP32, b.FP32, mask.FP32); - - /// - /// Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". - /// - /// ROUNDPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_ceil_pd(__m128d a) => System.Runtime.Intrinsics.X86.Sse41.Ceiling(a.FP64); - - /// - /// Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". - /// - /// ROUNDPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_ceil_ps(__m128 a) => System.Runtime.Intrinsics.X86.Sse41.Ceiling(a.FP32); - - /// - /// Round the lower double-precision (64-bit) floating-point element in "b" up to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - /// - /// ROUNDSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_ceil_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse41.CeilingScalar(a.FP64); - - /// - /// Round the lower single-precision (32-bit) floating-point element in "b" up to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
- /// - /// ROUNDSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_ceil_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse41.CeilingScalar(a.FP32); - - /// - /// Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". - /// - /// PCMPEQQ xmm, xmm - /// __m128i {UI64} - /// __m128i {UI64} - /// __m128i dst {UI64} - public static __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.CompareEqual(a.UI64, b.UI64); - - /// - /// Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - /// - /// PMOVSXWD xmm, xmm - /// __m128i {SI16} - /// __m128i dst {SI32} - public static __m128i _mm_cvtepi16_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int32(a.SI16); - - /// - /// Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - /// - /// PMOVSXWQ xmm, xmm - /// __m128i {SI16} - /// __m128i dst {SI64} - public static __m128i _mm_cvtepi16_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int64(a.SI16); - - /// - /// Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - /// - /// PMOVSXDQ xmm, xmm - /// __m128i {SI32} - /// __m128i dst {SI64} - public static __m128i _mm_cvtepi32_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int64(a.SI32); - - /// - /// Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". - /// - /// PMOVSXBW xmm, xmm - /// __m128i {SI8} - /// __m128i dst {SI16} - public static __m128i _mm_cvtepi8_epi16(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int16(a.SI8); - - /// - /// Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". 
- /// - /// PMOVSXBD xmm, xmm - /// __m128i {SI8} - /// __m128i dst {SI32} - public static __m128i _mm_cvtepi8_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int32(a.SI8); - - /// - /// Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". - /// - /// PMOVSXBQ xmm, xmm - /// __m128i {SI8} - /// __m128i dst {SI64} - public static __m128i _mm_cvtepi8_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int64(a.SI8); - - /// - /// Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - /// - /// PMOVZXWD xmm, xmm - /// __m128i {UI16} - /// __m128i dst {UI32} - public static __m128i _mm_cvtepu16_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int32(a.UI16); - - /// - /// Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - /// - /// PMOVZXWQ xmm, xmm - /// __m128i {UI16} - /// __m128i dst {UI64} - public static __m128i _mm_cvtepu16_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int64(a.UI16); - - /// - /// Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - /// - /// PMOVZXDQ xmm, xmm - /// __m128i {UI32} - /// __m128i dst {UI64} - public static __m128i _mm_cvtepu32_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int64(a.UI32); - - /// - /// Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". - /// - /// PMOVZXBW xmm, xmm - /// __m128i {UI8} - /// __m128i dst {UI16} - public static __m128i _mm_cvtepu8_epi16(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int16(a.UI8); - - /// - /// Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". 
- /// - /// PMOVZXBD xmm, xmm - /// __m128i {UI8} - /// __m128i dst {UI32} - public static __m128i _mm_cvtepu8_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int32(a.UI8); - - /// - /// Zero extend packed unsigned 8-bit integers in the low 8 byte sof "a" to packed 64-bit integers, and store the results in "dst". - /// - /// PMOVZXBQ xmm, xmm - /// __m128i {UI8} - /// __m128i dst {UI64} - public static __m128i _mm_cvtepu8_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int64(a.UI8); - - /// - /// Conditionally multiply the packed double-precision (64-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". - /// - /// DPPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm_dp_pd(__m128d a, __m128d b, int imm8) => System.Runtime.Intrinsics.X86.Sse41.DotProduct(a.FP64, b.FP64, (byte)imm8); - - /// - /// Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". - /// - /// DPPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_dp_ps(__m128 a, __m128 b, int imm8) => System.Runtime.Intrinsics.X86.Sse41.DotProduct(a.FP32, b.FP32, (byte)imm8); - - /// - /// Extract a 32-bit integer from "a", selected with "imm8", and store the result in "dst". - /// - /// PEXTRD r32, xmm, imm8 - /// __m128i {UI32} - /// int {IMM} - /// int dst {UI32} - public static int _mm_extract_epi32(__m128i a, int imm8) => (int)System.Runtime.Intrinsics.X86.Sse41.Extract(a.UI32, (byte)imm8); - - /// - /// Extract a 64-bit integer from "a", selected with "imm8", and store the result in "dst". 
- /// - /// PEXTRQ r64, xmm, imm8 - /// __m128i {UI64} - /// int {IMM} - /// long dst {UI64} - public static long _mm_extract_epi64(__m128i a, int imm8) => (long)System.Runtime.Intrinsics.X86.Sse41.X64.Extract(a.UI64, (byte)imm8); - - /// - /// Extract an 8-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". - /// - /// PEXTRB r32, xmm, imm8 - /// __m128i {UI8} - /// int {IMM} - /// int dst {UI8} - public static int _mm_extract_epi8(__m128i a, int imm8) => (int)System.Runtime.Intrinsics.X86.Sse41.Extract(a.UI8, (byte)imm8); - - /// - /// Extract a single-precision (32-bit) floating-point element from "a", selected with "imm8", and store the result in "dst". - /// - /// EXTRACTPS r32, xmm, imm8 - /// __m128 {FP32} - /// int {IMM} - /// int dst {UI32} - public static int _mm_extract_ps(__m128 a, int imm8) => (int)System.Runtime.Intrinsics.X86.Sse41.Extract(a.FP32, (byte)imm8); - - /// - /// Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". - /// - /// ROUNDPD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_floor_pd(__m128d a) => System.Runtime.Intrinsics.X86.Sse41.Floor(a.FP64); - - /// - /// Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". - /// - /// ROUNDPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_floor_ps(__m128 a) => System.Runtime.Intrinsics.X86.Sse41.Floor(a.FP32); - - /// - /// Round the lower double-precision (64-bit) floating-point element in "b" down to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
- /// - /// ROUNDSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// __m128d dst {FP64} - public static __m128d _mm_floor_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse41.FloorScalar(a.FP64); - - /// - /// Round the lower single-precision (32-bit) floating-point element in "b" down to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - /// - /// ROUNDSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// __m128 dst {FP32} - public static __m128 _mm_floor_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse41.FloorScalar(a.FP32); - - /// - /// Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "imm8". - /// - /// PINSRD xmm, r32, imm8 - /// __m128i {UI32} - /// int {UI32} - /// int {IMM} - /// __m128i dst {UI32} - public static __m128i _mm_insert_epi32(__m128i a, int i, int imm8) => System.Runtime.Intrinsics.X86.Sse41.Insert(a.UI32, (uint)i, (byte)imm8); - - /// - /// Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "imm8". - /// - /// PINSRQ xmm, r64, imm8 - /// __m128i {UI64} - /// long {UI64} - /// int {IMM} - /// __m128i dst {UI64} - public static __m128i _mm_insert_epi64(__m128i a, long i, int imm8) => System.Runtime.Intrinsics.X86.Sse41.X64.Insert(a.UI64, (ulong)i, (byte)imm8); - - /// - /// Copy "a" to "dst", and insert the lower 8-bit integer from "i" into "dst" at the location specified by "imm8". - /// - /// PINSRB xmm, r32, imm8 - /// __m128i {UI8} - /// int {UI8} - /// int {IMM} - /// __m128i dst {UI8} - public static __m128i _mm_insert_epi8(__m128i a, int i, int imm8) => System.Runtime.Intrinsics.X86.Sse41.Insert(a.UI8, (byte)i, (byte)imm8); - - /// - /// Copy "a" to "tmp", then insert a single-precision (32-bit) floating-point element from "b" into "tmp" using the control in "imm8". 
Store "tmp" to "dst" using the mask in "imm8" (elements are zeroed out when the corresponding bit is set). - /// - /// INSERTPS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_insert_ps(__m128 a, __m128 b, int imm8) => System.Runtime.Intrinsics.X86.Sse41.Insert(a.FP32, b.FP32, (byte)imm8); - - /// - /// Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". - /// - /// PMAXSD xmm, xmm - /// __m128i {SI32} - /// __m128i {SI32} - /// __m128i dst {UI32} - public static __m128i _mm_max_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.Max(a.SI32, b.SI32); - - /// - /// Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". - /// - /// PMAXSB xmm, xmm - /// __m128i {SI8} - /// __m128i {SI8} - /// __m128i dst {UI8} - public static __m128i _mm_max_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.Max(a.SI8, b.SI8); - - /// - /// Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". - /// - /// PMAXUW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_max_epu16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.Max(a.UI16, b.UI16); - - /// - /// Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". - /// - /// PMAXUD xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_max_epu32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.Max(a.UI32, b.UI32); - - /// - /// Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". 
- /// - /// PMINSD xmm, xmm - /// __m128i {SI32} - /// __m128i {SI32} - /// __m128i dst {UI32} - public static __m128i _mm_min_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.Min(a.SI32, b.SI32); - - /// - /// Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". - /// - /// PMINSB xmm, xmm - /// __m128i {SI8} - /// __m128i {SI8} - /// __m128i dst {UI8} - public static __m128i _mm_min_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.Min(a.SI8, b.SI8); - - /// - /// Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". - /// - /// PMINUW xmm, xmm - /// __m128i {UI16} - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_min_epu16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.Min(a.UI16, b.UI16); - - /// - /// Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". - /// - /// PMINUD xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_min_epu32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.Min(a.UI32, b.UI32); - - /// - /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in "a", store the minimum and index in "dst", and zero the remaining bits in "dst". - /// - /// PHMINPOSUW xmm, xmm - /// __m128i {UI16} - /// __m128i dst {UI16} - public static __m128i _mm_minpos_epu16(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.MinHorizontal(a.UI16); - - /// - /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". Eight SADs are performed using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at on the offset specified in "imm8". 
Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8". - /// - /// MPSADBW xmm, xmm, imm8 - /// __m128i {UI8} - /// __m128i {UI8} - /// int {IMM} - /// __m128i dst {UI8} - public static __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, int imm8) => System.Runtime.Intrinsics.X86.Sse41.MultipleSumAbsoluteDifferences(a.UI8, b.UI8, (byte)imm8); - - /// - /// Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". - /// - /// PMULDQ xmm, xmm - /// __m128i {SI32} - /// __m128i {SI32} - /// __m128i dst {SI64} - public static __m128i _mm_mul_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.Multiply(a.SI32, b.SI32); - - /// - /// Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". - /// - /// PMULLD xmm, xmm - /// __m128i {UI32} - /// __m128i {UI32} - /// __m128i dst {UI32} - public static __m128i _mm_mullo_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.MultiplyLow(a.UI32, b.UI32); - - /// - /// Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". - /// - /// PACKUSDW xmm, xmm - /// __m128i {SI32} - /// __m128i {SI32} - /// __m128i dst {UI16} - public static __m128i _mm_packus_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.PackUnsignedSaturate(a.SI32, b.SI32); - - /// - /// Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst". 
[round_note] - /// - /// ROUNDPD xmm, xmm, imm8 - /// __m128d {FP64} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm_round_pd(__m128d a, int rounding) => System.Runtime.Intrinsics.X86.Sse41.RoundToNearestInteger(a.FP64); - - /// - /// Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst". [round_note] - /// - /// ROUNDPS xmm, xmm, imm8 - /// __m128 {FP32} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_round_ps(__m128 a, int rounding) => System.Runtime.Intrinsics.X86.Sse41.RoundToNearestInteger(a.FP32); - - /// - /// Round the lower double-precision (64-bit) floating-point element in "b" using the "rounding" parameter, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_note] - /// - /// ROUNDSD xmm, xmm, imm8 - /// __m128d {FP64} - /// __m128d {FP64} - /// int {IMM} - /// __m128d dst {FP64} - public static __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) => System.Runtime.Intrinsics.X86.Sse41.RoundCurrentDirectionScalar(a.FP64); - - /// - /// Round the lower single-precision (32-bit) floating-point element in "b" using the "rounding" parameter, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_note] - /// - /// ROUNDSS xmm, xmm, imm8 - /// __m128 {FP32} - /// __m128 {FP32} - /// int {IMM} - /// __m128 dst {FP32} - public static __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) => System.Runtime.Intrinsics.X86.Sse41.RoundCurrentDirectionScalar(a.FP32); - - /// - /// Load 128-bits of integer data from memory into "dst" using a non-temporal memory hint. 
"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - /// - /// MOVNTDQA xmm, m128 - /// __m128i {M128} - /// __m128i dst {M128} - public static __m128i _mm_stream_load_si128(__m128i* mem_addr) => System.Runtime.Intrinsics.X86.Sse41.LoadAlignedVector128NonTemporal((sbyte*)mem_addr); - - /// - /// Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value. - /// - /// PTEST xmm, xmm - /// __m128i {M128} - /// __m128i {M128} - /// bool k {UI8} - public static bool _mm_testc_si128(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.TestC(a.SI8, b.SI8); - - /// - /// Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. - /// - /// PTEST xmm, xmm - /// __m128i {M128} - /// __m128i {M128} - /// bool dst {UI8} - public static bool _mm_testnzc_si128(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.TestNotZAndNotC(a.SI8, b.SI8); - - /// - /// Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value. 
- /// - /// PTEST xmm, xmm - /// __m128i {M128} - /// __m128i {M128} - /// bool k {UI8} - public static bool _mm_testz_si128(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.TestZ(a.SI8, b.SI8); - - } -} diff --git a/src/External/RawIntrinsics/SSE42.cs b/src/External/RawIntrinsics/SSE42.cs deleted file mode 100644 index e0b939b..0000000 --- a/src/External/RawIntrinsics/SSE42.cs +++ /dev/null @@ -1,15 +0,0 @@ -namespace RawIntrinsics -{ - public static unsafe partial class SSE42 - { - /// - /// Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst". - /// - /// PCMPGTQ xmm, xmm - /// __m128i {SI64} - /// __m128i {SI64} - /// __m128i dst {UI64} - public static __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse42.CompareGreaterThan(a.SI64, b.SI64); - - } -} diff --git a/src/External/RawIntrinsics/SSSE3.cs b/src/External/RawIntrinsics/SSSE3.cs deleted file mode 100644 index 66c40b0..0000000 --- a/src/External/RawIntrinsics/SSSE3.cs +++ /dev/null @@ -1,148 +0,0 @@ -namespace RawIntrinsics -{ - public static unsafe partial class SSSE3 - { - /// - /// Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". - /// - /// PABSW xmm, xmm - /// __m128i {SI16} - /// __m128i dst {UI16} - public static __m128i _mm_abs_epi16(__m128i a) => System.Runtime.Intrinsics.X86.Ssse3.Abs(a.SI16); - - /// - /// Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". - /// - /// PABSD xmm, xmm - /// __m128i {SI32} - /// __m128i dst {UI32} - public static __m128i _mm_abs_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Ssse3.Abs(a.SI32); - - /// - /// Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". 
- /// - /// PABSB xmm, xmm - /// __m128i {SI8} - /// __m128i dst {UI8} - public static __m128i _mm_abs_epi8(__m128i a) => System.Runtime.Intrinsics.X86.Ssse3.Abs(a.SI8); - - /// - /// Concatenate 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". - /// - /// PALIGNR xmm, xmm, imm8 - /// __m128i {UI8} - /// __m128i {UI8} - /// int {IMM} - /// __m128i dst {UI8} - public static __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm8) => System.Runtime.Intrinsics.X86.Ssse3.AlignRight(a.UI8, b.UI8, (byte)imm8); - - /// - /// Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". - /// - /// PHADDW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {SI16} - public static __m128i _mm_hadd_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.HorizontalAdd(a.SI16, b.SI16); - - /// - /// Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". - /// - /// PHADDD xmm, xmm - /// __m128i {SI32} - /// __m128i {SI32} - /// __m128i dst {SI32} - public static __m128i _mm_hadd_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.HorizontalAdd(a.SI32, b.SI32); - - /// - /// Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". - /// - /// PHADDSW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {SI16} - public static __m128i _mm_hadds_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.HorizontalAddSaturate(a.SI16, b.SI16); - - /// - /// Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". 
- /// - /// PHSUBW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {SI16} - public static __m128i _mm_hsub_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.HorizontalSubtract(a.SI16, b.SI16); - - /// - /// Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". - /// - /// PHSUBD xmm, xmm - /// __m128i {SI32} - /// __m128i {SI32} - /// __m128i dst {SI32} - public static __m128i _mm_hsub_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.HorizontalSubtract(a.SI32, b.SI32); - - /// - /// Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". - /// - /// PHSUBSW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {SI16} - public static __m128i _mm_hsubs_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.HorizontalSubtractSaturate(a.SI16, b.SI16); - - /// - /// Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". - /// - /// PMADDUBSW xmm, xmm - /// __m128i {UI8} - /// __m128i {SI8} - /// __m128i dst {SI16} - public static __m128i _mm_maddubs_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(a.UI8, b.SI8); - - /// - /// Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". 
- /// - /// PMULHRSW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {UI16} - public static __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.MultiplyHighRoundScale(a.SI16, b.SI16); - - /// - /// Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". - /// - /// PSHUFB xmm, xmm - /// __m128i {UI8} - /// __m128i {UI8} - /// __m128i dst {UI8} - public static __m128i _mm_shuffle_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.Shuffle(a.UI8, b.UI8); - - /// - /// Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. - /// - /// PSIGNW xmm, xmm - /// __m128i {SI16} - /// __m128i {SI16} - /// __m128i dst {UI16} - public static __m128i _mm_sign_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.Sign(a.SI16, b.SI16); - - /// - /// Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. - /// - /// PSIGND xmm, xmm - /// __m128i {SI32} - /// __m128i {SI32} - /// __m128i dst {UI32} - public static __m128i _mm_sign_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.Sign(a.SI32, b.SI32); - - /// - /// Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. 
- /// - /// PSIGNB xmm, xmm - /// __m128i {SI8} - /// __m128i {SI8} - /// __m128i dst {UI8} - public static __m128i _mm_sign_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.Sign(a.SI8, b.SI8); - - } -} diff --git a/src/External/RawIntrinsics/Types.cs b/src/External/RawIntrinsics/Types.cs deleted file mode 100644 index c08378b..0000000 --- a/src/External/RawIntrinsics/Types.cs +++ /dev/null @@ -1,178 +0,0 @@ -namespace RawIntrinsics -{ - public struct __m64 - { - private System.Runtime.Intrinsics.Vector64 _; - public System.Runtime.Intrinsics.Vector64 UI8 => System.Runtime.Intrinsics.Vector64.AsByte(_); - public System.Runtime.Intrinsics.Vector64 SI8 => System.Runtime.Intrinsics.Vector64.AsSByte(_); - public System.Runtime.Intrinsics.Vector64 UI16 => System.Runtime.Intrinsics.Vector64.AsUInt16(_); - public System.Runtime.Intrinsics.Vector64 SI16 => System.Runtime.Intrinsics.Vector64.AsInt16(_); - public System.Runtime.Intrinsics.Vector64 UI32 => System.Runtime.Intrinsics.Vector64.AsUInt32(_); - public System.Runtime.Intrinsics.Vector64 SI32 => System.Runtime.Intrinsics.Vector64.AsInt32(_); - public System.Runtime.Intrinsics.Vector64 UI64 => System.Runtime.Intrinsics.Vector64.AsUInt64(_); - public System.Runtime.Intrinsics.Vector64 SI64 => System.Runtime.Intrinsics.Vector64.AsInt64(_); - public System.Runtime.Intrinsics.Vector64 FP32 => System.Runtime.Intrinsics.Vector64.AsSingle(_); - public System.Runtime.Intrinsics.Vector64 FP64 => System.Runtime.Intrinsics.Vector64.AsDouble(_); - public static implicit operator __m64(System.Runtime.Intrinsics.Vector64 v) => new __m64 { _ = System.Runtime.Intrinsics.Vector64.AsByte(v) }; - public static implicit operator __m64(System.Runtime.Intrinsics.Vector64 v) => new __m64 { _ = System.Runtime.Intrinsics.Vector64.AsByte(v) }; - public static implicit operator __m64(System.Runtime.Intrinsics.Vector64 v) => new __m64 { _ = System.Runtime.Intrinsics.Vector64.AsByte(v) }; - public static implicit operator 
__m64(System.Runtime.Intrinsics.Vector64 v) => new __m64 { _ = System.Runtime.Intrinsics.Vector64.AsByte(v) }; - public static implicit operator __m64(System.Runtime.Intrinsics.Vector64 v) => new __m64 { _ = System.Runtime.Intrinsics.Vector64.AsByte(v) }; - public static implicit operator __m64(System.Runtime.Intrinsics.Vector64 v) => new __m64 { _ = System.Runtime.Intrinsics.Vector64.AsByte(v) }; - public static implicit operator __m64(System.Runtime.Intrinsics.Vector64 v) => new __m64 { _ = System.Runtime.Intrinsics.Vector64.AsByte(v) }; - public static implicit operator __m64(System.Runtime.Intrinsics.Vector64 v) => new __m64 { _ = System.Runtime.Intrinsics.Vector64.AsByte(v) }; - public static implicit operator __m64(System.Runtime.Intrinsics.Vector64 v) => new __m64 { _ = System.Runtime.Intrinsics.Vector64.AsByte(v) }; - public static implicit operator __m64(System.Runtime.Intrinsics.Vector64 v) => new __m64 { _ = System.Runtime.Intrinsics.Vector64.AsByte(v) }; - } - - public struct __m128 - { - private System.Runtime.Intrinsics.Vector128 _; - public System.Runtime.Intrinsics.Vector128 UI8 => System.Runtime.Intrinsics.Vector128.AsByte(_); - public System.Runtime.Intrinsics.Vector128 SI8 => System.Runtime.Intrinsics.Vector128.AsSByte(_); - public System.Runtime.Intrinsics.Vector128 UI16 => System.Runtime.Intrinsics.Vector128.AsUInt16(_); - public System.Runtime.Intrinsics.Vector128 SI16 => System.Runtime.Intrinsics.Vector128.AsInt16(_); - public System.Runtime.Intrinsics.Vector128 UI32 => System.Runtime.Intrinsics.Vector128.AsUInt32(_); - public System.Runtime.Intrinsics.Vector128 SI32 => System.Runtime.Intrinsics.Vector128.AsInt32(_); - public System.Runtime.Intrinsics.Vector128 UI64 => System.Runtime.Intrinsics.Vector128.AsUInt64(_); - public System.Runtime.Intrinsics.Vector128 SI64 => System.Runtime.Intrinsics.Vector128.AsInt64(_); - public System.Runtime.Intrinsics.Vector128 FP32 => System.Runtime.Intrinsics.Vector128.AsSingle(_); - public 
System.Runtime.Intrinsics.Vector128 FP64 => System.Runtime.Intrinsics.Vector128.AsDouble(_); - public static implicit operator __m128(System.Runtime.Intrinsics.Vector128 v) => new __m128 { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128(System.Runtime.Intrinsics.Vector128 v) => new __m128 { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128(System.Runtime.Intrinsics.Vector128 v) => new __m128 { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128(System.Runtime.Intrinsics.Vector128 v) => new __m128 { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128(System.Runtime.Intrinsics.Vector128 v) => new __m128 { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128(System.Runtime.Intrinsics.Vector128 v) => new __m128 { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128(System.Runtime.Intrinsics.Vector128 v) => new __m128 { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128(System.Runtime.Intrinsics.Vector128 v) => new __m128 { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128(System.Runtime.Intrinsics.Vector128 v) => new __m128 { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128(System.Runtime.Intrinsics.Vector128 v) => new __m128 { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - } - - public struct __m128i - { - private System.Runtime.Intrinsics.Vector128 _; - public System.Runtime.Intrinsics.Vector128 UI8 => System.Runtime.Intrinsics.Vector128.AsByte(_); - public System.Runtime.Intrinsics.Vector128 SI8 => System.Runtime.Intrinsics.Vector128.AsSByte(_); - public System.Runtime.Intrinsics.Vector128 UI16 => System.Runtime.Intrinsics.Vector128.AsUInt16(_); - public 
System.Runtime.Intrinsics.Vector128 SI16 => System.Runtime.Intrinsics.Vector128.AsInt16(_); - public System.Runtime.Intrinsics.Vector128 UI32 => System.Runtime.Intrinsics.Vector128.AsUInt32(_); - public System.Runtime.Intrinsics.Vector128 SI32 => System.Runtime.Intrinsics.Vector128.AsInt32(_); - public System.Runtime.Intrinsics.Vector128 UI64 => System.Runtime.Intrinsics.Vector128.AsUInt64(_); - public System.Runtime.Intrinsics.Vector128 SI64 => System.Runtime.Intrinsics.Vector128.AsInt64(_); - public System.Runtime.Intrinsics.Vector128 FP32 => System.Runtime.Intrinsics.Vector128.AsSingle(_); - public System.Runtime.Intrinsics.Vector128 FP64 => System.Runtime.Intrinsics.Vector128.AsDouble(_); - public static implicit operator __m128i(System.Runtime.Intrinsics.Vector128 v) => new __m128i { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128i(System.Runtime.Intrinsics.Vector128 v) => new __m128i { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128i(System.Runtime.Intrinsics.Vector128 v) => new __m128i { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128i(System.Runtime.Intrinsics.Vector128 v) => new __m128i { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128i(System.Runtime.Intrinsics.Vector128 v) => new __m128i { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128i(System.Runtime.Intrinsics.Vector128 v) => new __m128i { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128i(System.Runtime.Intrinsics.Vector128 v) => new __m128i { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128i(System.Runtime.Intrinsics.Vector128 v) => new __m128i { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128i(System.Runtime.Intrinsics.Vector128 v) => new 
__m128i { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128i(System.Runtime.Intrinsics.Vector128 v) => new __m128i { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - } - - public struct __m128d - { - private System.Runtime.Intrinsics.Vector128 _; - public System.Runtime.Intrinsics.Vector128 UI8 => System.Runtime.Intrinsics.Vector128.AsByte(_); - public System.Runtime.Intrinsics.Vector128 SI8 => System.Runtime.Intrinsics.Vector128.AsSByte(_); - public System.Runtime.Intrinsics.Vector128 UI16 => System.Runtime.Intrinsics.Vector128.AsUInt16(_); - public System.Runtime.Intrinsics.Vector128 SI16 => System.Runtime.Intrinsics.Vector128.AsInt16(_); - public System.Runtime.Intrinsics.Vector128 UI32 => System.Runtime.Intrinsics.Vector128.AsUInt32(_); - public System.Runtime.Intrinsics.Vector128 SI32 => System.Runtime.Intrinsics.Vector128.AsInt32(_); - public System.Runtime.Intrinsics.Vector128 UI64 => System.Runtime.Intrinsics.Vector128.AsUInt64(_); - public System.Runtime.Intrinsics.Vector128 SI64 => System.Runtime.Intrinsics.Vector128.AsInt64(_); - public System.Runtime.Intrinsics.Vector128 FP32 => System.Runtime.Intrinsics.Vector128.AsSingle(_); - public System.Runtime.Intrinsics.Vector128 FP64 => System.Runtime.Intrinsics.Vector128.AsDouble(_); - public static implicit operator __m128d(System.Runtime.Intrinsics.Vector128 v) => new __m128d { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128d(System.Runtime.Intrinsics.Vector128 v) => new __m128d { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128d(System.Runtime.Intrinsics.Vector128 v) => new __m128d { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128d(System.Runtime.Intrinsics.Vector128 v) => new __m128d { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128d(System.Runtime.Intrinsics.Vector128 v) => 
new __m128d { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128d(System.Runtime.Intrinsics.Vector128 v) => new __m128d { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128d(System.Runtime.Intrinsics.Vector128 v) => new __m128d { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128d(System.Runtime.Intrinsics.Vector128 v) => new __m128d { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128d(System.Runtime.Intrinsics.Vector128 v) => new __m128d { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - public static implicit operator __m128d(System.Runtime.Intrinsics.Vector128 v) => new __m128d { _ = System.Runtime.Intrinsics.Vector128.AsByte(v) }; - } - - public struct __m256 - { - private System.Runtime.Intrinsics.Vector256 _; - public System.Runtime.Intrinsics.Vector256 UI8 => System.Runtime.Intrinsics.Vector256.AsByte(_); - public System.Runtime.Intrinsics.Vector256 SI8 => System.Runtime.Intrinsics.Vector256.AsSByte(_); - public System.Runtime.Intrinsics.Vector256 UI16 => System.Runtime.Intrinsics.Vector256.AsUInt16(_); - public System.Runtime.Intrinsics.Vector256 SI16 => System.Runtime.Intrinsics.Vector256.AsInt16(_); - public System.Runtime.Intrinsics.Vector256 UI32 => System.Runtime.Intrinsics.Vector256.AsUInt32(_); - public System.Runtime.Intrinsics.Vector256 SI32 => System.Runtime.Intrinsics.Vector256.AsInt32(_); - public System.Runtime.Intrinsics.Vector256 UI64 => System.Runtime.Intrinsics.Vector256.AsUInt64(_); - public System.Runtime.Intrinsics.Vector256 SI64 => System.Runtime.Intrinsics.Vector256.AsInt64(_); - public System.Runtime.Intrinsics.Vector256 FP32 => System.Runtime.Intrinsics.Vector256.AsSingle(_); - public System.Runtime.Intrinsics.Vector256 FP64 => System.Runtime.Intrinsics.Vector256.AsDouble(_); - public static implicit operator __m256(System.Runtime.Intrinsics.Vector256 v) => 
new __m256 { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256(System.Runtime.Intrinsics.Vector256 v) => new __m256 { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256(System.Runtime.Intrinsics.Vector256 v) => new __m256 { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256(System.Runtime.Intrinsics.Vector256 v) => new __m256 { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256(System.Runtime.Intrinsics.Vector256 v) => new __m256 { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256(System.Runtime.Intrinsics.Vector256 v) => new __m256 { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256(System.Runtime.Intrinsics.Vector256 v) => new __m256 { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256(System.Runtime.Intrinsics.Vector256 v) => new __m256 { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256(System.Runtime.Intrinsics.Vector256 v) => new __m256 { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256(System.Runtime.Intrinsics.Vector256 v) => new __m256 { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - } - - public struct __m256i - { - private System.Runtime.Intrinsics.Vector256 _; - public System.Runtime.Intrinsics.Vector256 UI8 => System.Runtime.Intrinsics.Vector256.AsByte(_); - public System.Runtime.Intrinsics.Vector256 SI8 => System.Runtime.Intrinsics.Vector256.AsSByte(_); - public System.Runtime.Intrinsics.Vector256 UI16 => System.Runtime.Intrinsics.Vector256.AsUInt16(_); - public System.Runtime.Intrinsics.Vector256 SI16 => System.Runtime.Intrinsics.Vector256.AsInt16(_); - public System.Runtime.Intrinsics.Vector256 UI32 => System.Runtime.Intrinsics.Vector256.AsUInt32(_); - 
public System.Runtime.Intrinsics.Vector256 SI32 => System.Runtime.Intrinsics.Vector256.AsInt32(_); - public System.Runtime.Intrinsics.Vector256 UI64 => System.Runtime.Intrinsics.Vector256.AsUInt64(_); - public System.Runtime.Intrinsics.Vector256 SI64 => System.Runtime.Intrinsics.Vector256.AsInt64(_); - public System.Runtime.Intrinsics.Vector256 FP32 => System.Runtime.Intrinsics.Vector256.AsSingle(_); - public System.Runtime.Intrinsics.Vector256 FP64 => System.Runtime.Intrinsics.Vector256.AsDouble(_); - public static implicit operator __m256i(System.Runtime.Intrinsics.Vector256 v) => new __m256i { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256i(System.Runtime.Intrinsics.Vector256 v) => new __m256i { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256i(System.Runtime.Intrinsics.Vector256 v) => new __m256i { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256i(System.Runtime.Intrinsics.Vector256 v) => new __m256i { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256i(System.Runtime.Intrinsics.Vector256 v) => new __m256i { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256i(System.Runtime.Intrinsics.Vector256 v) => new __m256i { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256i(System.Runtime.Intrinsics.Vector256 v) => new __m256i { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256i(System.Runtime.Intrinsics.Vector256 v) => new __m256i { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256i(System.Runtime.Intrinsics.Vector256 v) => new __m256i { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256i(System.Runtime.Intrinsics.Vector256 v) => new __m256i { _ = 
System.Runtime.Intrinsics.Vector256.AsByte(v) }; - } - - public struct __m256d - { - private System.Runtime.Intrinsics.Vector256 _; - public System.Runtime.Intrinsics.Vector256 UI8 => System.Runtime.Intrinsics.Vector256.AsByte(_); - public System.Runtime.Intrinsics.Vector256 SI8 => System.Runtime.Intrinsics.Vector256.AsSByte(_); - public System.Runtime.Intrinsics.Vector256 UI16 => System.Runtime.Intrinsics.Vector256.AsUInt16(_); - public System.Runtime.Intrinsics.Vector256 SI16 => System.Runtime.Intrinsics.Vector256.AsInt16(_); - public System.Runtime.Intrinsics.Vector256 UI32 => System.Runtime.Intrinsics.Vector256.AsUInt32(_); - public System.Runtime.Intrinsics.Vector256 SI32 => System.Runtime.Intrinsics.Vector256.AsInt32(_); - public System.Runtime.Intrinsics.Vector256 UI64 => System.Runtime.Intrinsics.Vector256.AsUInt64(_); - public System.Runtime.Intrinsics.Vector256 SI64 => System.Runtime.Intrinsics.Vector256.AsInt64(_); - public System.Runtime.Intrinsics.Vector256 FP32 => System.Runtime.Intrinsics.Vector256.AsSingle(_); - public System.Runtime.Intrinsics.Vector256 FP64 => System.Runtime.Intrinsics.Vector256.AsDouble(_); - public static implicit operator __m256d(System.Runtime.Intrinsics.Vector256 v) => new __m256d { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256d(System.Runtime.Intrinsics.Vector256 v) => new __m256d { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256d(System.Runtime.Intrinsics.Vector256 v) => new __m256d { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256d(System.Runtime.Intrinsics.Vector256 v) => new __m256d { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256d(System.Runtime.Intrinsics.Vector256 v) => new __m256d { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256d(System.Runtime.Intrinsics.Vector256 v) => new __m256d { _ 
= System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256d(System.Runtime.Intrinsics.Vector256 v) => new __m256d { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256d(System.Runtime.Intrinsics.Vector256 v) => new __m256d { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256d(System.Runtime.Intrinsics.Vector256 v) => new __m256d { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - public static implicit operator __m256d(System.Runtime.Intrinsics.Vector256 v) => new __m256d { _ = System.Runtime.Intrinsics.Vector256.AsByte(v) }; - } - -} diff --git a/src/External/RawIntrinsics/Utils.ManuallyAdded.cs b/src/External/RawIntrinsics/Utils.ManuallyAdded.cs deleted file mode 100644 index 314cb9d..0000000 --- a/src/External/RawIntrinsics/Utils.ManuallyAdded.cs +++ /dev/null @@ -1,7 +0,0 @@ -namespace RawIntrinsics -{ - public static class Utils - { - public static int _MM_SHUFFLE(int z, int y, int x, int w) => (z << 6) | (y << 4) | (x << 2) | w; - } -} diff --git a/src/External/RawIntrinsicsGenerator/Generator.cs b/src/External/RawIntrinsicsGenerator/Generator.cs deleted file mode 100644 index 7eeb8b8..0000000 --- a/src/External/RawIntrinsicsGenerator/Generator.cs +++ /dev/null @@ -1,556 +0,0 @@ -using System.Collections.Concurrent; -using System.Diagnostics; -using System.Runtime.Intrinsics; -using System.Text; -using System.Text.RegularExpressions; -using System.Xml; - -using Microsoft.CodeAnalysis; -using Microsoft.CodeAnalysis.CSharp; -using Microsoft.CodeAnalysis.CSharp.Syntax; - -namespace RawIntrinsicsGenerator -{ - public static class Generator - { - private const string SriDataUrl1 = @"https://raw.githubusercontent.com/dotnet/runtime/release/7.0/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/"; - private const string SriDataUrl2 = 
@"https://raw.githubusercontent.com/dotnet/runtime/release/7.0/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/"; - private const string IntelDataUrl = @"https://www.intel.com/content/dam/develop/public/us/en/include/intrinsics-guide/data-3-6-5.xml"; - - private static readonly Regex IntelMethodSignature = new(@"///\s+?(?[\w_]+)\s+?(?_mm[\w_]+)\s*?\((?[\w\s,*]+)\)", RegexOptions.Compiled); - private static readonly Regex IntelMethodSignatureSimpilfied = new(@"\s+?(?[\w_]+)\s+?(?_mm[\w_]+)\s*?", RegexOptions.Compiled); - private static readonly Regex IntelTypeDef = new(@"(?:(?unsigned)\s+?)?(?:const\s+)?(?void|char|short|int|long|long\s+?long|float|double|__int32|__int64|(?:(?:__m64|__m128|__m256)(?:i|d)?)|__mmask8|__mmask16|__mmask32|__mmask64)[^*""]*(?\*)?", RegexOptions.Compiled); - - private static readonly Dictionary TechnologyMap = new() - { - {"Sse", (SriDataUrl1, IntelMethodSignature)}, - {"Sse2", (SriDataUrl1, IntelMethodSignature)}, - {"Sse3", (SriDataUrl1, IntelMethodSignature)}, - {"Sse41", (SriDataUrl1, IntelMethodSignature)}, - {"Sse42", (SriDataUrl1, IntelMethodSignature)}, - {"Ssse3", (SriDataUrl1, IntelMethodSignature)}, - {"Avx", (SriDataUrl1, IntelMethodSignature)}, - {"Avx2", (SriDataUrl1, IntelMethodSignature)}, - {"Fma", (SriDataUrl1, IntelMethodSignature)}, - {"Aes", (SriDataUrl1, IntelMethodSignature)}, - {"Bmi1", (SriDataUrl1, IntelMethodSignature)}, - {"Bmi2", (SriDataUrl1, IntelMethodSignature)}, - {"Lzcnt", (SriDataUrl1, IntelMethodSignature)}, - {"Popcnt", (SriDataUrl1, IntelMethodSignature)}, - {"Pclmulqdq", (SriDataUrl1, IntelMethodSignature)}, - {"Vector64", (SriDataUrl2, IntelMethodSignatureSimpilfied)}, - {"Vector128", (SriDataUrl2, IntelMethodSignatureSimpilfied)}, - {"Vector256", (SriDataUrl2, IntelMethodSignatureSimpilfied)}, - }; - - public static async Task Generate(string ns, string saveToPath) - { - var intelDataFile = await FetchFileContent(IntelDataUrl); - intelDataFile = intelDataFile.Trim(); - - var xml 
= new XmlDocument(); - xml.LoadXml(intelDataFile); - var intelData = new ConcurrentBag(xml.SelectNodes(@"//intrinsic")?.Cast().ToList() ?? new List()); - - var outputData = new ConcurrentDictionary>(); - - foreach (var kv in TechnologyMap) - { - await Generate($"{kv.Value.srcUrl}{kv.Key}.cs", kv.Value.matcher, intelData, outputData); - } - - if (Directory.Exists(saveToPath)) - { - foreach(var fi in new DirectoryInfo(saveToPath).GetFiles()) - { - fi.Delete(); - } - Directory.Delete(saveToPath); - } - Directory.CreateDirectory(saveToPath); - - var codeGenSb = new StringBuilder(); - string tabOffset; - foreach (var (tech, generatedSrc) in outputData) - { - codeGenSb.Clear(); - tabOffset = ""; - codeGenSb.AppendLine($"{tabOffset}namespace {ns}"); - codeGenSb.AppendLine($"{tabOffset}{{"); - - tabOffset = "\t"; - codeGenSb.AppendLine($"{tabOffset}public static unsafe partial class {tech}"); - codeGenSb.AppendLine($"{tabOffset}{{"); - - foreach (var intelMethodName in generatedSrc.Keys.OrderBy(_ => _)) - { - codeGenSb.AppendLine(generatedSrc[intelMethodName]); - } - - tabOffset = "\t"; - codeGenSb.AppendLine($"{tabOffset}}}"); - - tabOffset = ""; - codeGenSb.AppendLine($"{tabOffset}}}"); - - await File.WriteAllTextAsync(Path.Combine(saveToPath, $"{tech}.cs"), codeGenSb.ToString()); - } - - codeGenSb.Clear(); - tabOffset = ""; - codeGenSb.AppendLine($"{tabOffset}namespace {ns}"); - codeGenSb.AppendLine($"{tabOffset}{{"); - - foreach (var t in new[] {(64, null), (128, null), (128, "i"), (128, "d"), (256, null), (256, "i"), (256, "d")}) - { - var (size, pf) = t; - codeGenSb.AppendLine(GenerateMType(size, pf)); - } - tabOffset = ""; - codeGenSb.AppendLine($"{tabOffset}}}"); - - await File.WriteAllTextAsync(Path.Combine(saveToPath, $"Types.cs"), codeGenSb.ToString()); - } - - private static readonly (string etype, string convFn)[] EtypeToReninterpretMethodMap = {("UI8", "AsByte"), ("SI8", "AsSByte"), ("UI16", "AsUInt16"), ("SI16", "AsInt16"), ("UI32", "AsUInt32"), ("SI32", 
"AsInt32"), ("UI64", "AsUInt64"), ("SI64", "AsInt64"), ("FP32", "AsSingle"), ("FP64", "AsDouble")}; - - private static string GenerateMType(int size, string pf = null) - { - var mTypeName = pf == null ? $"__m{size}" : $"__m{size}{pf}" ; - var codeGenSb = new StringBuilder(); - - var csVectorTypeName = $"System.Runtime.Intrinsics.Vector{size}"; - - var tabOffset = "\t"; - codeGenSb.AppendLine($"{tabOffset}public struct {mTypeName}"); - codeGenSb.AppendLine($"{tabOffset}{{"); - - tabOffset = "\t\t"; - codeGenSb.AppendLine($"{tabOffset}private {csVectorTypeName} _;"); - - foreach (var (etype, convFn) in EtypeToReninterpretMethodMap) - { - var csType = EtypeToCsTypeName(etype); - codeGenSb.AppendLine($"{tabOffset}public {csVectorTypeName}<{csType}> {etype} => {csVectorTypeName}.{convFn}(_);"); - } - - foreach (var (etype, _) in EtypeToReninterpretMethodMap) - { - var csType = EtypeToCsTypeName(etype); - codeGenSb.AppendLine($"{tabOffset}public static implicit operator {mTypeName}({csVectorTypeName}<{csType}> v) => new {mTypeName} {{ _ = {csVectorTypeName}.AsByte(v) }};"); - } - - tabOffset = "\t"; - codeGenSb.AppendLine($"{tabOffset}}}"); - return codeGenSb.ToString(); - } - - private static async Task Generate(string sriUrl, Regex cppIntrinsicNameMatcher, ConcurrentBag intelData, ConcurrentDictionary> outputData) - { - var intelMethod2CsMethodMap = new Dictionary>(); - - var sriData = await FetchFileContent(sriUrl); - - var syntaxTree = CSharpSyntaxTree.ParseText(sriData); - var compilation = CSharpCompilation.Create("Test").AddReferences(MetadataReference.CreateFromFile(typeof(object).Assembly.Location)).AddReferences(MetadataReference.CreateFromFile(typeof(Vector128).Assembly.Location)).AddSyntaxTrees(syntaxTree); - var semanticModel = compilation.GetSemanticModel(syntaxTree); - var syntaxTreeRoot = (CompilationUnitSyntax) await syntaxTree.GetRootAsync(); - - var methodDeclarations = syntaxTreeRoot.DescendantNodes(_ => true, true).OfType(); - - foreach (var 
methodDeclaration in methodDeclarations) - { - SyntaxTrivia comments = default; - if (!methodDeclaration.HasLeadingTrivia || methodDeclaration.GetLeadingTrivia().All(t => (comments = t).Kind() != SyntaxKind.SingleLineDocumentationCommentTrivia || comments.GetStructure() is not DocumentationCommentTriviaSyntax)) continue; - - Match match = default; - var _ = ((DocumentationCommentTriviaSyntax) comments.GetStructure()).Content.OfType().FirstOrDefault(x => (match = cppIntrinsicNameMatcher.Match(x.Content.ToFullString())).Success); - - if (!match.Success) continue; - - var methodSymbol = semanticModel.GetDeclaredSymbol(methodDeclaration); - var csMethod = new CsMethod - { - Name = methodDeclaration.Identifier.ToString(), - ClassPath = methodSymbol.ReceiverType.ToDisplayString(), - Parameters = new CsMethodParam[methodSymbol.Parameters.Length] - }; - - if (IsCsIntrinsicType(methodSymbol.ReturnType.Name)) - { - csMethod.ReturnType.Name = methodSymbol.ReturnType.Name; - csMethod.ReturnType.TypeParameter = methodDeclaration.ReturnType is GenericNameSyntax returnType ? 
returnType.TypeArgumentList.Arguments[0].ToString() : null; - } - else if (methodSymbol.ReturnType is not INamedTypeSymbol {IsGenericType: true}) - { - if (methodDeclaration.ReturnType is PointerTypeSyntax) - { - csMethod.ReturnType.Name = ((IPointerTypeSymbol) methodSymbol.ReturnType).PointedAtType.ToDisplayString(); - csMethod.ReturnType.IsPointer = true; - } - else - { - csMethod.ReturnType.Name = methodSymbol.ReturnType.ToDisplayString(); - } - } - else - { - throw new InvalidOperationException($"Unknown return type {methodSymbol.ReturnType.Name}"); - } - - for (var j = 0; j < methodSymbol.Parameters.Length; j++) - { - var parameter = methodDeclaration.ParameterList.Parameters[j]; - - var parameterSymbol = methodSymbol.Parameters[j]; - var csParameter = new CsMethodParam {Name = parameterSymbol.Name}; - if (parameterSymbol.Type is not INamedTypeSymbol {IsGenericType: true} || !IsCsIntrinsicType(parameterSymbol.Type.Name)) - { - if (parameter.Type is PointerTypeSyntax) - { - csParameter.Type = new CsType - { - Name = ((IPointerTypeSymbol) parameterSymbol.Type).PointedAtType.ToDisplayString(), - IsPointer = true - }; - } - else - { - csParameter.Type = new CsType {Name = parameterSymbol.Type.ToDisplayString()}; - } - - csMethod.Parameters[j] = csParameter; - continue; - } - - var parameterTypeArgument = parameter.Type is GenericNameSyntax parameterType ? 
parameterType.TypeArgumentList.Arguments[0].ToString() : null; - csParameter.Type = new CsType - { - Name = parameterSymbol.Type.Name, - TypeParameter = parameterTypeArgument - }; - csMethod.Parameters[j] = csParameter; - } - - var intelName = match.Groups["fn"].Value; - - if (!intelMethod2CsMethodMap.ContainsKey(intelName)) - { - intelMethod2CsMethodMap[intelName] = new List(); - } - - intelMethod2CsMethodMap[intelName].Add(csMethod); - } - - foreach (var (intelMethodName, csMethods) in intelMethod2CsMethodMap) - { - var intelDataNode = intelData.FirstOrDefault(x => x.Attributes?.GetNamedItem("name")?.Value?.AsSpan().Equals(intelMethodName, StringComparison.InvariantCultureIgnoreCase) ?? false); - if (intelDataNode == null) - { - Debug.WriteLine(intelMethodName); - continue; - } - - var tech = intelDataNode?.Attributes?.GetNamedItem("tech")?.Value.Replace(".", ""); - var intelDataNodeReturn = intelDataNode?.SelectSingleNode("return"); - - var intelMethod = new IntelMethod - { - Name = intelDataNode?.Attributes?.GetNamedItem("name")?.Value, - Return = new IntelMethodParam - { - Name = intelDataNodeReturn?.Attributes?.GetNamedItem("varname")?.Value, - Type = ParseIntelType(intelDataNodeReturn?.Attributes?.GetNamedItem("type")?.Value, intelDataNodeReturn?.Attributes?.GetNamedItem("etype")?.Value) - }, - Description = intelDataNode?.SelectNodes(@"description")?.Cast().Select(n => n.InnerText.Replace(Environment.NewLine, "")).FirstOrDefault(), - Instructions = intelDataNode?.SelectNodes(@"instruction")?.Cast().Select(n => $"{n?.Attributes?.GetNamedItem("name")?.Value} {n?.Attributes?.GetNamedItem("form")?.Value}").FirstOrDefault(), - }; - - var intelMethodParameters = intelDataNode?.SelectNodes(@"parameter")?.Cast().Select(x => new IntelMethodParam - { - Name = x.Attributes?.GetNamedItem("varname")?.Value, - Type = ParseIntelType(x.Attributes?.GetNamedItem("type")?.Value, x.Attributes?.GetNamedItem("etype")?.Value) - }).ToArray(); - intelMethod.Parameters = 
intelMethodParameters.Where(x => x.Type.Name != "void" || x.Type.IsPointer).ToArray(); - - if (csMethods.Count == 0) - { - throw new InvalidOperationException($"No method matching Intel's {intelMethodName} found in SR.Intrinsics namespace"); - } - - var csMethod = FindMostSuited(intelMethod, csMethods); - if (!csMethod.ReturnType.IsPointer && csMethod.ReturnType.Name == "bool" && intelMethod.Return.Type.Name == "int") - { - intelMethod.Return.Type = new IntelType - { - Name = csMethod.ReturnType.Name, - CsType = csMethod.ReturnType, - Hint = "UI8" - }; - } - - var mappedParameters = new List(); - for (var k = 0; k < intelMethod.Parameters.Length; k++) - { - if (csMethod.Parameters.Length == k) break; - var intelMethodParameter = intelMethod.Parameters[k]; - var csMethodParameter = csMethod.Parameters[k]; - - if (IsCsIntrinsicType(csMethodParameter.Type.Name)) - { - mappedParameters.Add($"{intelMethodParameter.Name}.{CsTypeNameToEtype(csMethodParameter.Type.TypeParameter)}"); - continue; - } - - if (intelMethodParameter.Type.Name == csMethodParameter.Type.Name) - { - mappedParameters.Add($"{intelMethodParameter.Name}"); - continue; - } - - if (csMethodParameter.Type.IsPointer && !intelMethodParameter.Type.IsPointer) - { - mappedParameters.Add($"({csMethodParameter.Type})&{intelMethodParameter.Name}"); - } - else - { - mappedParameters.Add($"({csMethodParameter.Type}){intelMethodParameter.Name}"); - } - } - - var codeGenSb = new StringBuilder(); - var tabOffset = "\t\t"; - var returnCast = ""; - if (!IsCsIntrinsicType(csMethod.ReturnType.Name) && csMethod.ReturnType.Name != intelMethod.Return.Type.Name) - { - returnCast = $"({intelMethod.Return.Type.ToRenderString()})"; - } - - codeGenSb.AppendLine($"{tabOffset}/// "); - codeGenSb.AppendLine($"{tabOffset}/// {intelMethod.Description}"); - codeGenSb.AppendLine($"{tabOffset}/// "); - codeGenSb.AppendLine($"{tabOffset}/// {intelMethod.Instructions}"); - foreach (var intelMethodParameter in intelMethod.Parameters) - { - 
codeGenSb.AppendLine($"{tabOffset}/// {intelMethodParameter.Type.Name} {{{intelMethodParameter.Type.Hint}}}"); - } - - codeGenSb.AppendLine($"{tabOffset}/// {intelMethod.Return.Type.Name} {intelMethod.Return.Name} {{{intelMethod.Return.Type.Hint}}}"); - codeGenSb.AppendLine($"{tabOffset}public static {intelMethod.ToRenderString()} => {returnCast}{csMethod.ClassPath}.{csMethod.Name}({string.Join(", ", mappedParameters)});"); - - if (!outputData.ContainsKey(tech)) - { - outputData[tech] = new ConcurrentDictionary(); - } - outputData[tech][intelMethodName] = codeGenSb.ToString(); - } - } - - private static async Task FetchFileContent(string url) - { - var request = new HttpRequestMessage - { - Method = HttpMethod.Get, - RequestUri = new Uri(url), - }; - var client = new HttpClient(); - var result = await client.SendAsync(request); - return await result.Content.ReadAsStringAsync(); - } - - private static string CsTypeNameToEtype(string cst) - { - return cst switch - { - "byte" => "UI8", - "sbyte" => "SI8", - "ushort" => "UI16", - "short" => "SI16", - "uint" => "UI32", - "int" => "SI32", - "ulong" => "UI64", - "long" => "SI64", - "float" => "FP32", - "double" => "FP64", - _ => null - }; - } - - private static string EtypeToCsTypeName(string et) - { - return et switch - { - "UI8" => "byte", - "SI8" => "sbyte", - "UI16" => "ushort", - "SI16" => "short", - "UI32" => "uint", - "SI32" => "int", - "UI64" => "ulong", - "SI64" => "long", - "FP32" => "float", - "FP64" => "double", - _ => null - }; - } - - private static IntelType ParseIntelType(string type, string etype) - { - var match = IntelTypeDef.Match(type); - if (!match.Success) - { - throw new InvalidOperationException($"Unknown Intel's type {type}"); - } - - static string IntelTypeNameToSystemTypeName(string itn) - { - return itn switch - { - "__int8" or "char" or "__mmask8" => "byte", - "__int16" or "__mmask16" => "short", - "__int32" or "__mmask32" => "int", - "__int64" or "long long" or "__mmask64" => "long", - _ => 
itn - }; - } - - var isUnsigned = match.Groups["is_unsigned"].Success; - var isPointer = match.Groups["is_ptr"].Success; - var intelTypeName = IntelTypeNameToSystemTypeName(match.Groups["type_name"].Value); - var csType = intelTypeName switch - { - "void" => new CsType - { - Name = "void", - IsPointer = isPointer - }, - "byte" => new CsType - { - Name = EtypeToCsTypeName(etype) ?? (isUnsigned ? "byte" : "sbyte"), - IsPointer = isPointer - }, - "short" => new CsType - { - Name = EtypeToCsTypeName(etype) ?? (isUnsigned ? "ushort" : "short"), - IsPointer = isPointer - }, - "int" => new CsType - { - Name = EtypeToCsTypeName(etype) ?? (isUnsigned ? "uint" : "int"), - IsPointer = isPointer - }, - "long" => new CsType - { - Name = EtypeToCsTypeName(etype) ?? (isUnsigned ? "ulong" : "long"), - IsPointer = isPointer - }, - "float" => new CsType - { - Name = "float", - IsPointer = isPointer - }, - "double" => new CsType - { - Name = "double", - IsPointer = isPointer - }, - "__m64" or "__m64i" or "__m64d" => new CsType - { - Name = "Vector64", - IsPointer = isPointer, - TypeParameter = EtypeToCsTypeName(etype) - }, - "__m128" or "__m128i" or "__m128d" => new CsType - { - Name = "Vector128", - IsPointer = isPointer, - TypeParameter = EtypeToCsTypeName(etype) - }, - "__m256" or "__m256i" or "__m256d" => new CsType - { - Name = "Vector256", - IsPointer = isPointer, - TypeParameter = EtypeToCsTypeName(etype) - }, - _ => throw new InvalidOperationException($"No type matching Intel's {intelTypeName} found") - }; - return new IntelType - { - Name = intelTypeName, - IsPointer = isPointer, - Hint = etype, - CsType = csType - }; - } - - private static bool IsCsIntrinsicType(string name) => name == nameof(Vector64) || name == nameof(Vector128) || name == nameof(Vector256); - - private static CsMethod FindMostSuited(IntelMethod intelMethod, List csMethods) - { - foreach (var csMethod in csMethods.Where(csMethod => csMethod.Parameters.Length > 0 && 
intelMethod.Parameters[0].Type.CsType.Name == csMethod.Parameters[0].Type.Name && intelMethod.Parameters[0].Type.CsType.TypeParameter == csMethod.Parameters[0].Type.TypeParameter)) - { - return csMethod; - } - - return csMethods[0]; - } - - private struct CsType - { - public string Name; - public string TypeParameter; - public bool IsPointer; - - public override string ToString() - { - var name = TypeParameter == null ? Name : $"{Name}<{TypeParameter}>"; - return IsPointer ? $"{name}*" : $"{name}"; - } - } - - private struct CsMethod - { - public string ClassPath; - public string Name; - public CsType ReturnType; - public CsMethodParam[] Parameters; - public override string ToString() => $"{ReturnType} {Name}({string.Join(", ", Parameters)})"; - } - - private struct CsMethodParam - { - public string Name; - public CsType Type; - public override string ToString() => $"{Type} {Name}"; - } - - private struct IntelType - { - public string Name; - public string Hint; - public bool IsPointer; - public CsType CsType; - public string ToRenderString() => IsPointer ? $"{Name}*" : $"{Name}"; - public override string ToString() => IsPointer ? 
$"{Name}* /* {Hint} */" : $"{Name} /* {Hint} */"; - } - - private struct IntelMethod - { - public string Name; - public IntelMethodParam Return; - public IntelMethodParam[] Parameters; - public string Description; - public string Instructions; - public string ToRenderString() => $"{Return.Type.ToRenderString()} {Name}({string.Join(", ", Parameters.Select(x => x.ToRenderString()))})"; - public override string ToString() => $"{Return.Type} {Name}({string.Join(", ", Parameters)})"; - } - - private struct IntelMethodParam - { - public string Name; - public IntelType Type; - public string ToRenderString() => $"{Type.ToRenderString()} {Name}"; - public override string ToString() => $"{Type} {Name}"; - } - } -} \ No newline at end of file diff --git a/src/External/RawIntrinsicsGenerator/Program.cs b/src/External/RawIntrinsicsGenerator/Program.cs deleted file mode 100644 index 735c982..0000000 --- a/src/External/RawIntrinsicsGenerator/Program.cs +++ /dev/null @@ -1,16 +0,0 @@ -using System.Reflection; - -namespace RawIntrinsicsGenerator -{ - public static class Program - { - private async static Task Main(string[] _) - { - var savePath = Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location), "RawIntrinsics"); - - await Generator.Generate("RawIntrinsics", savePath); - - Console.WriteLine($"Done! 
Generated files were saved to {savePath}"); - } - } -} \ No newline at end of file diff --git a/src/External/RawIntrinsicsGenerator/RawIntrinsicsGenerator.csproj b/src/External/RawIntrinsicsGenerator/RawIntrinsicsGenerator.csproj deleted file mode 100644 index 3fb0ffa..0000000 --- a/src/External/RawIntrinsicsGenerator/RawIntrinsicsGenerator.csproj +++ /dev/null @@ -1,16 +0,0 @@ - - - - net7.0 - Exe - - - - - - - - - - - diff --git a/src/Fast.PRNGs/Common.cs b/src/Fast.PRNGs/Common.cs index d6c56c7..c501dc6 100644 --- a/src/Fast.PRNGs/Common.cs +++ b/src/Fast.PRNGs/Common.cs @@ -1,13 +1,41 @@ using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; namespace Fast.PRNGs; internal static class Common { - internal const ulong DoubleMask = (1L << 53) - 1; - internal const double Norm53 = 1.0d / (1L << 53); - internal const ulong FloatMask = (1L << 24) - 1; - internal const float Norm24 = 1.0f / (1L << 24); + // From http://prng.di.unimi.it/ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static double ExtractDouble(ulong value) => + (value >> 11) * (1.0 / (1ul << 53)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static float ExtractSingle(ulong value) => + (value >> 40) * (1.0f / (1u << 24)); + + private static readonly Vector256 DoubleMultiplier256 = Vector256.Create(1.0 / (1ul << 53)); + private static readonly Vector256 SingleMultiplier256 = Vector256.Create(1.0f / (1u << 24)); + private static readonly Vector512 DoubleMultiplier512 = Vector512.Create(1.0 / (1ul << 53)); + private static readonly Vector512 SingleMultiplier512 = Vector512.Create(1.0f / (1u << 24)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void ExtractDoubles256(in Vector256 values, ref Vector256 result) => + result = Avx2.Multiply(Vector256.ConvertToDouble(Avx2.ShiftRightLogical(values, 11)), DoubleMultiplier256); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal 
static void ExtractSingles256(in Vector256 values, ref Vector256 result) => + result = Avx2.Multiply(Vector256.ConvertToSingle(Avx2.ShiftRightLogical(values.AsUInt32(), 8).AsInt32()), SingleMultiplier256); /* shift per 32-bit lane: keeps the top 24 bits of every uint so all 8 float lanes are populated (a 64-bit >> 40 left every odd lane at zero) */ + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void ExtractDoubles512(in Vector512 values, ref Vector512 result) => + result = Avx512F.Multiply(Vector512.ConvertToDouble(Avx512F.ShiftRightLogical(values, 11)), DoubleMultiplier512); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void ExtractSingles512(in Vector512 values, ref Vector512 result) => + result = Avx512F.Multiply(Vector512.ConvertToSingle(Avx512F.ShiftRightLogical(values.AsUInt32(), 8).AsInt32()), SingleMultiplier512); /* same per-32-bit-lane extraction as ExtractSingles256, for all 16 float lanes */ + [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static ulong Rotl(ulong x, int k) diff --git a/src/Fast.PRNGs/Fast.PRNGs.csproj b/src/Fast.PRNGs/Fast.PRNGs.csproj index dfd20ad..cb8e770 100644 --- a/src/Fast.PRNGs/Fast.PRNGs.csproj +++ b/src/Fast.PRNGs/Fast.PRNGs.csproj @@ -1,7 +1,7 @@ - net7.0 + net8.0 true true @@ -32,8 +32,8 @@ - + diff --git a/src/Fast.PRNGs/MWC256.cs b/src/Fast.PRNGs/MWC256.cs index adfa6cf..3b46235 100644 --- a/src/Fast.PRNGs/MWC256.cs +++ b/src/Fast.PRNGs/MWC256.cs @@ -76,14 +76,8 @@ public int Next() } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public double NextDouble() - { - return (NextInternal() & DoubleMask) * Norm53; - } + public double NextDouble() => ExtractDouble(NextInternal()); [MethodImpl(MethodImplOptions.AggressiveInlining)] - public float NextFloat() - { - return (NextInternal() & FloatMask) * Norm24; - } + public float NextFloat() => ExtractSingle(NextInternal()); } diff --git a/src/Fast.PRNGs/Shishua.cs b/src/Fast.PRNGs/Shishua.cs index f2e71fd..343a270 100644 
--- a/src/Fast.PRNGs/Shishua.cs +++ b/src/Fast.PRNGs/Shishua.cs @@ -1,12 +1,9 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; using 
System.Runtime.Intrinsics.X86; -using RawIntrinsics; - using static Fast.PRNGs.Common; -using static RawIntrinsics.AVX; -using static RawIntrinsics.AVX2; namespace Fast.PRNGs; @@ -26,7 +23,7 @@ namespace Fast.PRNGs; 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, }; - private const int BufferSize = 1 << 17; + private const int BufferSize = 1 << 18; private readonly nuint _state; @@ -101,23 +98,61 @@ private ulong NextInternal() } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public int Next() + private ref Vector256 NextInternalVec256() { - return (int)(NextInternal() >> 32); + const int size = sizeof(ulong) * 4; + + ref var bufferedState = ref this.State; + if (bufferedState.BufferIndex >= BufferSize || BufferSize - bufferedState.BufferIndex < size) + { + FillBuffer(ref bufferedState); + } + + ref var value = ref Unsafe.As>(ref bufferedState.Buffer[bufferedState.BufferIndex]); + bufferedState.BufferIndex += size; + return ref value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public double NextDouble() + private ref Vector512 NextInternalVec512() { - return (NextInternal() & DoubleMask) * Norm53; + const int size = sizeof(ulong) * 8; + + ref var bufferedState = ref this.State; + if (bufferedState.BufferIndex >= BufferSize || BufferSize - bufferedState.BufferIndex < size) + { + FillBuffer(ref bufferedState); + } + + ref var value = ref Unsafe.As>(ref bufferedState.Buffer[bufferedState.BufferIndex]); + bufferedState.BufferIndex += size; + return ref value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public float NextFloat() + public int Next() { - return (NextInternal() & FloatMask) * Norm24; + return (int)(NextInternal() >> 32); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public double NextDouble() => ExtractDouble(NextInternal()); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void NextDoubles256(ref Vector256 result) => ExtractDoubles256(NextInternalVec256(), ref result); + 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void NextDoubles512(ref Vector512 result) => ExtractDoubles512(NextInternalVec512(), ref result); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public float NextFloat() => ExtractSingle(NextInternal()); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void NextFloats256(ref Vector256 result) => ExtractSingles256(NextInternalVec256(), ref result); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void NextFloats512(ref Vector512 result) => ExtractSingles512(NextInternalVec512(), ref result); + public void Dispose() { FreeState(); @@ -142,10 +177,10 @@ private void InitState(ref RawState state, ref Seed seed) Span buf = stackalloc byte[128 * steps]; - state.State[0] = _mm256_setr_epi64x((long)(Phi[0] ^ seed[0]), (long)(Phi[1]), (long)(Phi[2] ^ seed[1]), (long)(Phi[3])); - state.State[1] = _mm256_setr_epi64x((long)(Phi[4] ^ seed[2]), (long)(Phi[5]), (long)(Phi[6] ^ seed[3]), (long)(Phi[7])); - state.State[2] = _mm256_setr_epi64x((long)(Phi[8] ^ seed[2]), (long)(Phi[9]), (long)(Phi[10] ^ seed[3]), (long)(Phi[11])); - state.State[3] = _mm256_setr_epi64x((long)(Phi[12] ^ seed[0]), (long)(Phi[13]), (long)(Phi[14] ^ seed[1]), (long)(Phi[15])); + state.State[0] = Vector256.Create((ulong)(Phi[0] ^ seed[0]), (ulong)(Phi[1]), (ulong)(Phi[2] ^ seed[1]), (ulong)(Phi[3])); + state.State[1] = Vector256.Create((ulong)(Phi[4] ^ seed[2]), (ulong)(Phi[5]), (ulong)(Phi[6] ^ seed[3]), (ulong)(Phi[7])); + state.State[2] = Vector256.Create((ulong)(Phi[8] ^ seed[2]), (ulong)(Phi[9]), (ulong)(Phi[10] ^ seed[3]), (ulong)(Phi[11])); + state.State[3] = Vector256.Create((ulong)(Phi[12] ^ seed[0]), (ulong)(Phi[13]), (ulong)(Phi[14] ^ seed[1]), (ulong)(Phi[15])); for (int i = 0; i < rounds; i++) { @@ -155,6 +190,7 @@ private void InitState(ref RawState state, ref Seed seed) } } + [MethodImpl(MethodImplOptions.NoInlining)] private void FillBuffer(ref BufferedState bufferedState) { PrngGen(ref 
bufferedState.State, bufferedState.Buffer); @@ -165,17 +201,17 @@ unsafe private void PrngGen(ref RawState state, Span buffer) { var size = buffer.Length; - __m256i + Vector256 o0 = state.Output[0], o1 = state.Output[1], o2 = state.Output[2], o3 = state.Output[3], s0 = state.State[0], s1 = state.State[1], s2 = state.State[2], s3 = state.State[3], t0, t1, t2, t3, u0, u1, u2, u3, counter = state.Counter; - __m256i shu0 = _mm256_setr_epi32(5, 6, 7, 0, 1, 2, 3, 4), - shu1 = _mm256_setr_epi32(3, 4, 5, 6, 7, 0, 1 ,2); + Vector256 shu0 = Vector256.Create(5u, 6u, 7u, 0u, 1u, 2u, 3u, 4u), + shu1 = Vector256.Create(3u, 4u, 5u, 6u, 7u, 0u, 1u, 2u); - __m256i increment = _mm256_setr_epi64x(7, 5, 3, 1); + Vector256 increment = Vector256.Create(7UL, 5UL, 3UL, 1UL); Debug.Assert(size % 128 == 0, "buf's size must be a multiple of 128 bytes"); @@ -183,28 +219,28 @@ unsafe private void PrngGen(ref RawState state, Span buffer) { if (!buffer.IsEmpty) { - _mm256_storeu_si256((__m256i*)Unsafe.AsPointer(ref buffer[i + 0]), o0); - _mm256_storeu_si256((__m256i*)Unsafe.AsPointer(ref buffer[i + 32]), o1); - _mm256_storeu_si256((__m256i*)Unsafe.AsPointer(ref buffer[i + 64]), o2); - _mm256_storeu_si256((__m256i*)Unsafe.AsPointer(ref buffer[i + 96]), o3); + Avx.Store((ulong*)Unsafe.AsPointer(ref buffer[i + 0]), o0); /* one distinct output vector per 32-byte slot: o0..o3 at offsets 0/32/64/96 */ + Avx.Store((ulong*)Unsafe.AsPointer(ref buffer[i + 32]), o1); + Avx.Store((ulong*)Unsafe.AsPointer(ref buffer[i + 64]), o2); + Avx.Store((ulong*)Unsafe.AsPointer(ref buffer[i + 96]), o3); } - s1 = _mm256_add_epi64(s1, counter); - s3 = _mm256_add_epi64(s3, counter); - counter = _mm256_add_epi64(counter, increment); + s1 = Avx2.Add(s1, counter); + s3 = Avx2.Add(s3, counter); + counter = Avx2.Add(counter, increment); - u0 = _mm256_srli_epi64(s0, 1); u1 = _mm256_srli_epi64(s1, 3); - u2 = _mm256_srli_epi64(s2, 1); u3 = _mm256_srli_epi64(s3, 3); - t0 = _mm256_permutevar8x32_epi32(s0, shu0); t1 = 
_mm256_permutevar8x32_epi32(s1, shu1); - t2 = _mm256_permutevar8x32_epi32(s2, shu0); t3 = 
_mm256_permutevar8x32_epi32(s3, shu1); + u0 = Avx2.ShiftRightLogical(s0, 1); u1 = Avx2.ShiftRightLogical(s1, 3); + u2 = Avx2.ShiftRightLogical(s2, 1); u3 = Avx2.ShiftRightLogical(s3, 3); + t0 = Avx2.PermuteVar8x32(s0.AsUInt32(), shu0).AsUInt64(); t1 = Avx2.PermuteVar8x32(s1.AsUInt32(), shu1).AsUInt64(); + t2 = Avx2.PermuteVar8x32(s2.AsUInt32(), shu0).AsUInt64(); t3 = Avx2.PermuteVar8x32(s3.AsUInt32(), shu1).AsUInt64(); - s0 = _mm256_add_epi64(t0, u0); s1 = _mm256_add_epi64(t1, u1); - s2 = _mm256_add_epi64(t2, u2); s3 = _mm256_add_epi64(t3, u3); + s0 = Avx2.Add(t0, u0); s1 = Avx2.Add(t1, u1); + s2 = Avx2.Add(t2, u2); s3 = Avx2.Add(t3, u3); - o0 = _mm256_xor_si256(u0, t1); - o1 = _mm256_xor_si256(u2, t3); - o2 = _mm256_xor_si256(s0, s3); - o3 = _mm256_xor_si256(s2, s1); + o0 = Avx2.Xor(u0, t1); + o1 = Avx2.Xor(u2, t3); + o2 = Avx2.Xor(s0, s3); + o3 = Avx2.Xor(s2, s1); } state.Output[0] = o0; state.Output[1] = o1; state.Output[2] = o2; state.Output[3] = o3; @@ -230,20 +266,20 @@ unsafe private struct BufferedState [StructLayout(LayoutKind.Sequential)] private struct RawState { - private __m256i _state00; - private __m256i _state01; - private __m256i _state02; - private __m256i _state03; - public Span<__m256i> State => MemoryMarshal.CreateSpan(ref _state00, 4); + private Vector256 _state00; + private Vector256 _state01; + private Vector256 _state02; + private Vector256 _state03; + public Span> State => MemoryMarshal.CreateSpan(ref _state00, 4); - private __m256i _output00; - private __m256i _output01; - private __m256i _output02; - private __m256i _output03; - public Span<__m256i> Output => MemoryMarshal.CreateSpan(ref _output00, 4); + private Vector256 _output00; + private Vector256 _output01; + private Vector256 _output02; + private Vector256 _output03; + public Span> Output => MemoryMarshal.CreateSpan(ref _output00, 4); - public __m256i Counter; + public Vector256 Counter; } [StructLayout(LayoutKind.Sequential)] diff --git a/src/Fast.PRNGs/Splitmix64.cs 
b/src/Fast.PRNGs/Splitmix64.cs index 73c8fba..a30c50b 100644 --- a/src/Fast.PRNGs/Splitmix64.cs +++ b/src/Fast.PRNGs/Splitmix64.cs @@ -28,14 +28,8 @@ public ulong Next() } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public double NextDouble() - { - return (Next() & DoubleMask) * Norm53; - } + public double NextDouble() => ExtractDouble(Next()); [MethodImpl(MethodImplOptions.AggressiveInlining)] - public float NextFloat() - { - return (Next() & FloatMask) * Norm24; - } + public float NextFloat() => ExtractSingle(Next()); } diff --git a/src/Fast.PRNGs/Xoroshiro128Plus.cs b/src/Fast.PRNGs/Xoroshiro128Plus.cs index 2537628..b957cf3 100644 --- a/src/Fast.PRNGs/Xoroshiro128Plus.cs +++ b/src/Fast.PRNGs/Xoroshiro128Plus.cs @@ -1,4 +1,3 @@ -using System; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using static Fast.PRNGs.Common; @@ -50,14 +49,8 @@ public int Next() } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public double NextDouble() - { - return (NextInternal() & DoubleMask) * Norm53; - } + public double NextDouble() => ExtractDouble(NextInternal()); [MethodImpl(MethodImplOptions.AggressiveInlining)] - public float NextFloat() - { - return (NextInternal() & FloatMask) * Norm24; - } + public float NextFloat() => ExtractSingle(NextInternal()); } diff --git a/src/Fast.PRNGs/Xoshiro256Plus.cs b/src/Fast.PRNGs/Xoshiro256Plus.cs index ebe390e..c508a5e 100644 --- a/src/Fast.PRNGs/Xoshiro256Plus.cs +++ b/src/Fast.PRNGs/Xoshiro256Plus.cs @@ -80,14 +80,8 @@ public int Next() } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public double NextDouble() - { - return (NextInternal() & DoubleMask) * Norm53; - } + public double NextDouble() => ExtractDouble(NextInternal()); [MethodImpl(MethodImplOptions.AggressiveInlining)] - public float NextFloat() - { - return (NextInternal() & FloatMask) * Norm24; - } + public float NextFloat() => ExtractSingle(NextInternal()); } diff --git a/test/Fast.PRNGs.Tests/Fast.PRNGs.Tests.csproj 
b/test/Fast.PRNGs.Tests/Fast.PRNGs.Tests.csproj index dc3bd16..a35bfaa 100644 --- a/test/Fast.PRNGs.Tests/Fast.PRNGs.Tests.csproj +++ b/test/Fast.PRNGs.Tests/Fast.PRNGs.Tests.csproj @@ -1,7 +1,7 @@ - net7.0 + net8.0 diff --git a/test/Fast.PRNGs.Tests/ShishuaTests.cs b/test/Fast.PRNGs.Tests/ShishuaTests.cs index 6e7d552..a921bcb 100644 --- a/test/Fast.PRNGs.Tests/ShishuaTests.cs +++ b/test/Fast.PRNGs.Tests/ShishuaTests.cs @@ -1,7 +1,8 @@ -using Accord.Statistics.Distributions.Univariate; +using Accord.Statistics.Distributions.Univariate; using Accord.Statistics.Testing; using Plotly.NET.CSharp; using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; namespace Fast.PRNGs.Tests; @@ -68,6 +69,24 @@ public void InitFromNew() using var _ = Shishua.Create(new Random()); } + public void Compare() + { + if (!Shishua.IsSupported) + return; + + using var rng1 = Shishua.Create(new Random(0)); + using var rng2 = Shishua.Create(new Random(0)); + using var rng3 = Shishua.Create(new Random(0)); + + Vector256 vec256 = default; + + var val1 = rng1.NextDouble(); + rng2.NextDoubles256(ref vec256); + //var val3 = rng3.NextDoubles512()[0]; + Assert.True(Math.Abs(vec256[0] - val1) < 0.0001d); + //Assert.True(Math.Abs(val3 - val1) < 0.0001d); + } + public void InitFromBytes() { if (!Shishua.IsSupported)