From 57449a31374ac2ddf63aafbf3b21e2feea8458ca Mon Sep 17 00:00:00 2001 From: Martin Othamar Date: Sun, 9 Jul 2023 01:38:23 +0200 Subject: [PATCH] .NET 8, more vectorization --- .../Fast.PRNGs.Benchmarks.csproj | 2 +- .../Fast.PRNGs.Benchmarks/PRNGsScaling.cs | 39 +++++++++++-- global.json | 7 ++- .../RawIntrinsics/RawIntrinsics.csproj | 2 +- .../RawIntrinsicsGenerator/Generator.cs | 55 ++++++++++++++----- .../RawIntrinsicsGenerator.csproj | 2 +- src/Fast.PRNGs/Common.cs | 8 +++ src/Fast.PRNGs/Fast.PRNGs.csproj | 2 +- src/Fast.PRNGs/Shishua.cs | 46 +++++++++++++++- test/Fast.PRNGs.Tests/Fast.PRNGs.Tests.csproj | 2 +- test/Fast.PRNGs.Tests/ShishuaTests.cs | 21 ++++++- 11 files changed, 155 insertions(+), 31 deletions(-) diff --git a/benchmark/Fast.PRNGs.Benchmarks/Fast.PRNGs.Benchmarks.csproj b/benchmark/Fast.PRNGs.Benchmarks/Fast.PRNGs.Benchmarks.csproj index 03ae3ed..deed779 100644 --- a/benchmark/Fast.PRNGs.Benchmarks/Fast.PRNGs.Benchmarks.csproj +++ b/benchmark/Fast.PRNGs.Benchmarks/Fast.PRNGs.Benchmarks.csproj @@ -2,7 +2,7 @@ Exe latest - net7.0 + net8.0 disable pdbonly true diff --git a/benchmark/Fast.PRNGs.Benchmarks/PRNGsScaling.cs b/benchmark/Fast.PRNGs.Benchmarks/PRNGsScaling.cs index 1ce48d5..ed61290 100644 --- a/benchmark/Fast.PRNGs.Benchmarks/PRNGsScaling.cs +++ b/benchmark/Fast.PRNGs.Benchmarks/PRNGsScaling.cs @@ -1,22 +1,28 @@ +using System.Runtime.Intrinsics; + namespace Fast.PRNGs.Benchmarks; [Config(typeof(Config))] public class PRNGsScaling { private Random _random; - private Shishua _shishua; + private Shishua _shishuaSeq; + private Shishua _shishuaVec256; + private Shishua _shishuaVec512; private Xoroshiro128Plus _xoroshiro128plus; private Xoshiro256Plus _xoshiro256plus; private MWC256 _mwc256; - [Params(100_000, 1_000_000)] + [Params(1 << 17/*, 1 << 20*/)] public int Iterations { get; set; } [GlobalSetup] public void Setup() { _random = new Random(); - _shishua = Shishua.Create(); + _shishuaSeq = Shishua.Create(); + _shishuaVec256 = Shishua.Create(); + _shishuaVec512 = Shishua.Create(); _xoroshiro128plus = Xoroshiro128Plus.Create(); _xoshiro256plus = Xoshiro256Plus.Create(); _mwc256 = MWC256.Create(); @@ -25,7 +31,9 @@ public void Setup() [GlobalCleanup] public void Cleanup() { - _shishua.Dispose(); + _shishuaSeq.Dispose(); + _shishuaVec256.Dispose(); + _shishuaVec512.Dispose(); } [Benchmark(Baseline = true)] @@ -38,10 +46,29 @@ public double SystemRandomGen() } [Benchmark] - public double ShishuaGen() + public double ShishuaSeqGen() { for (int i = 0; i < Iterations; i++) - _ = _shishua.NextDouble(); + _ = _shishuaSeq.NextDouble(); + + return default; + } + + [Benchmark] + public double ShishuaVec256Gen() + { + Vector256 result = default; + for (int i = 0; i < Iterations; i += 4) + _shishuaVec256.NextDoubles256(ref result); + + return default; + } + + [Benchmark] + public double ShishuaVec512Gen() + { + for (int i = 0; i < Iterations; i += 8) + _ = _shishuaVec512.NextDoubles512(); return default; } diff --git a/global.json b/global.json index 3672f82..717bcdd 100644 --- a/global.json +++ b/global.json @@ -1,6 +1,7 @@ { "sdk": { - "version": "7.0.201", - "rollForward": "latestFeature" + "version": "8.0.100-preview.5.23303.2", + "rollForward": "latestFeature", + "allowPrerelease": true } -} \ No newline at end of file +} diff --git a/src/External/RawIntrinsics/RawIntrinsics.csproj b/src/External/RawIntrinsics/RawIntrinsics.csproj index 2e66377..fe10553 100644 --- a/src/External/RawIntrinsics/RawIntrinsics.csproj +++ b/src/External/RawIntrinsics/RawIntrinsics.csproj @@ -1,7 +1,7 @@ - net7.0 + net8.0 true diff --git a/src/External/RawIntrinsicsGenerator/Generator.cs b/src/External/RawIntrinsicsGenerator/Generator.cs index 7eeb8b8..d4cc477 100644 --- a/src/External/RawIntrinsicsGenerator/Generator.cs +++ b/src/External/RawIntrinsicsGenerator/Generator.cs @@ -13,8 +13,8 @@ namespace RawIntrinsicsGenerator { public static class Generator { - private const string SriDataUrl1 = @"https://raw.githubusercontent.com/dotnet/runtime/release/7.0/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/"; - private const string SriDataUrl2 = @"https://raw.githubusercontent.com/dotnet/runtime/release/7.0/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/"; + private const string SriDataUrl1 = @"https://raw.githubusercontent.com/dotnet/runtime/main/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/"; + private const string SriDataUrl2 = @"https://raw.githubusercontent.com/dotnet/runtime/main/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/"; private const string IntelDataUrl = @"https://www.intel.com/content/dam/develop/public/us/en/include/intrinsics-guide/data-3-6-5.xml"; private static readonly Regex IntelMethodSignature = new(@"///\s+?(?[\w_]+)\s+?(?_mm[\w_]+)\s*?\((?[\w\s,*]+)\)", RegexOptions.Compiled); @@ -203,8 +203,14 @@ private static async Task Generate(string sriUrl, Regex cppIntrinsicNameMatcher, var parameter = methodDeclaration.ParameterList.Parameters[j]; var parameterSymbol = methodSymbol.Parameters[j]; - var csParameter = new CsMethodParam {Name = parameterSymbol.Name}; - if (parameterSymbol.Type is not INamedTypeSymbol {IsGenericType: true} || !IsCsIntrinsicType(parameterSymbol.Type.Name)) + + var csParameter = new CsMethodParam + { + Name = parameterSymbol.Name, + Attrs = parameterSymbol.GetAttributes().Select(a => $"[{a}]").ToArray(), + }; + + if (parameterSymbol.Type is not INamedTypeSymbol {IsGenericType: true} || !IsCsIntrinsicType(parameterSymbol.Type.Name)) { if (parameter.Type is PointerTypeSyntax) { @@ -260,25 +266,42 @@ private static async Task Generate(string sriUrl, Regex cppIntrinsicNameMatcher, Return = new IntelMethodParam { Name = intelDataNodeReturn?.Attributes?.GetNamedItem("varname")?.Value, - Type = ParseIntelType(intelDataNodeReturn?.Attributes?.GetNamedItem("type")?.Value, intelDataNodeReturn?.Attributes?.GetNamedItem("etype")?.Value) + Type = ParseIntelType(intelDataNodeReturn?.Attributes?.GetNamedItem("type")?.Value, intelDataNodeReturn?.Attributes?.GetNamedItem("etype")?.Value), + Attrs = Array.Empty(), }, Description = intelDataNode?.SelectNodes(@"description")?.Cast().Select(n => n.InnerText.Replace(Environment.NewLine, "")).FirstOrDefault(), Instructions = intelDataNode?.SelectNodes(@"instruction")?.Cast().Select(n => $"{n?.Attributes?.GetNamedItem("name")?.Value} {n?.Attributes?.GetNamedItem("form")?.Value}").FirstOrDefault(), }; - var intelMethodParameters = intelDataNode?.SelectNodes(@"parameter")?.Cast().Select(x => new IntelMethodParam + var intelMethodParameters = intelDataNode?.SelectNodes(@"parameter")?.Cast().Select(x => new IntelMethodParam { Name = x.Attributes?.GetNamedItem("varname")?.Value, - Type = ParseIntelType(x.Attributes?.GetNamedItem("type")?.Value, x.Attributes?.GetNamedItem("etype")?.Value) - }).ToArray(); + Type = ParseIntelType(x.Attributes?.GetNamedItem("type")?.Value, x.Attributes?.GetNamedItem("etype")?.Value), + Attrs = Array.Empty(), + }).ToArray(); intelMethod.Parameters = intelMethodParameters.Where(x => x.Type.Name != "void" || x.Type.IsPointer).ToArray(); - if (csMethods.Count == 0) - { - throw new InvalidOperationException($"No method matching Intel's {intelMethodName} found in SR.Intrinsics namespace"); - } + if (csMethods.Count == 0) + { + throw new InvalidOperationException($"No method matching Intel's {intelMethodName} found in SR.Intrinsics namespace"); + } + + var csMethod = FindMostSuited(intelMethod, csMethods); + + var paramsWithAttrs = csMethod.Parameters.Select((p, i) => (p, i)).Where(t => t.p.Attrs.Length > 0).ToArray(); + foreach (var (csParam, i) in paramsWithAttrs) + { + var intelParamCand = intelMethod.Parameters + .Select((p, j) => (p, j, s: 10)) + .Select(t => (t.p, t.j, s: t.s - Math.Abs(t.j - i))) + .Select(t => (t.p, t.j, s: t.s - (t.p.Type.CsType.Name == csParam.Type.Name && t.p.Type.CsType.TypeParameter == csParam.Type.TypeParameter ? 0 : 1))) + .OrderByDescending(t => t.s) + .ToArray(); + + var intelParam = intelParamCand[0]; + intelMethod.Parameters[intelParam.j].Attrs = csParam.Attrs; + } - var csMethod = FindMostSuited(intelMethod, csMethods); if (!csMethod.ReturnType.IsPointer && csMethod.ReturnType.Name == "bool" && intelMethod.Return.Type.Name == "int") { intelMethod.Return.Type = new IntelType @@ -521,6 +544,7 @@ private struct CsMethodParam { public string Name; public CsType Type; + public string[] Attrs; public override string ToString() => $"{Type} {Name}"; } @@ -549,8 +573,9 @@ private struct IntelMethodParam { public string Name; public IntelType Type; - public string ToRenderString() => $"{Type.ToRenderString()} {Name}"; + public string[] Attrs; + public string ToRenderString() => $"{string.Join("", Attrs)} {Type.ToRenderString()} {Name}"; public override string ToString() => $"{Type} {Name}"; } } -} \ No newline at end of file +} diff --git a/src/External/RawIntrinsicsGenerator/RawIntrinsicsGenerator.csproj b/src/External/RawIntrinsicsGenerator/RawIntrinsicsGenerator.csproj index 3fb0ffa..a28d819 100644 --- a/src/External/RawIntrinsicsGenerator/RawIntrinsicsGenerator.csproj +++ b/src/External/RawIntrinsicsGenerator/RawIntrinsicsGenerator.csproj @@ -1,7 +1,7 @@ - net7.0 + net8.0 Exe diff --git a/src/Fast.PRNGs/Common.cs b/src/Fast.PRNGs/Common.cs index d6c56c7..34f9a7f 100644 --- a/src/Fast.PRNGs/Common.cs +++ b/src/Fast.PRNGs/Common.cs @@ -1,4 +1,5 @@ using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; namespace Fast.PRNGs; @@ -9,6 +10,13 @@ internal static class Common internal const ulong FloatMask = (1L << 24) - 1; internal const float Norm24 = 1.0f / (1L << 24); + + internal static readonly Vector256 DoubleMaskVec256 = Vector256.Create(DoubleMask); + internal static readonly Vector256 Norm53Vec256 = Vector256.Create(Norm53); + + internal static readonly Vector512 DoubleMaskVec512 = Vector512.Create(DoubleMask); + internal static readonly Vector512 Norm53Vec512 = Vector512.Create(Norm53); + [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static ulong Rotl(ulong x, int k) { diff --git a/src/Fast.PRNGs/Fast.PRNGs.csproj b/src/Fast.PRNGs/Fast.PRNGs.csproj index dfd20ad..9a21ce9 100644 --- a/src/Fast.PRNGs/Fast.PRNGs.csproj +++ b/src/Fast.PRNGs/Fast.PRNGs.csproj @@ -1,7 +1,7 @@ - net7.0 + net8.0 true true diff --git a/src/Fast.PRNGs/Shishua.cs b/src/Fast.PRNGs/Shishua.cs index f2e71fd..3d3d171 100644 --- a/src/Fast.PRNGs/Shishua.cs +++ b/src/Fast.PRNGs/Shishua.cs @@ -1,6 +1,8 @@ using System.Diagnostics; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using RawIntrinsics; @@ -26,7 +28,7 @@ namespace Fast.PRNGs; 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, }; - private const int BufferSize = 1 << 17; + private const int BufferSize = 1 << 18; private readonly nuint _state; @@ -100,6 +102,36 @@ private ulong NextInternal() return value; } + private ref Vector256 NextInternalVec256() + { + const int size = sizeof(ulong) * 4; + + ref var bufferedState = ref this.State; + if (bufferedState.BufferIndex >= BufferSize || BufferSize - bufferedState.BufferIndex < size) + { + FillBuffer(ref bufferedState); + } + + ref var value = ref Unsafe.As>(ref bufferedState.Buffer[bufferedState.BufferIndex]); + bufferedState.BufferIndex += size; + return ref value; + } + + private Vector512 NextInternalVec512() + { + const int size = sizeof(ulong) * 8; + + ref var bufferedState = ref this.State; + if (bufferedState.BufferIndex >= BufferSize || BufferSize - bufferedState.BufferIndex < size) + { + FillBuffer(ref bufferedState); + } + + var value = Unsafe.As>(ref bufferedState.Buffer[bufferedState.BufferIndex]); + bufferedState.BufferIndex += size; + return value; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public int Next() { @@ -112,6 +144,18 @@ public double NextDouble() return (NextInternal() & DoubleMask) * Norm53; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void NextDoubles256(ref Vector256 result) + { + result = Avx2.Multiply(Vector256.ConvertToDouble(Avx2.And(NextInternalVec256(), DoubleMaskVec256)), Norm53Vec256); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Vector512 NextDoubles512() + { + return Avx512F.Multiply(Vector512.ConvertToDouble(Avx512F.And(NextInternalVec512(), DoubleMaskVec512)), Norm53Vec512); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public float NextFloat() { diff --git a/test/Fast.PRNGs.Tests/Fast.PRNGs.Tests.csproj b/test/Fast.PRNGs.Tests/Fast.PRNGs.Tests.csproj index dc3bd16..a35bfaa 100644 --- a/test/Fast.PRNGs.Tests/Fast.PRNGs.Tests.csproj +++ b/test/Fast.PRNGs.Tests/Fast.PRNGs.Tests.csproj @@ -1,7 +1,7 @@ - net7.0 + net8.0 diff --git a/test/Fast.PRNGs.Tests/ShishuaTests.cs b/test/Fast.PRNGs.Tests/ShishuaTests.cs index 6e7d552..a921bcb 100644 --- a/test/Fast.PRNGs.Tests/ShishuaTests.cs +++ b/test/Fast.PRNGs.Tests/ShishuaTests.cs @@ -1,7 +1,8 @@ -using Accord.Statistics.Distributions.Univariate; +using Accord.Statistics.Distributions.Univariate; using Accord.Statistics.Testing; using Plotly.NET.CSharp; using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; namespace Fast.PRNGs.Tests; @@ -68,6 +69,24 @@ public void InitFromNew() using var _ = Shishua.Create(new Random()); } + public void Compare() + { + if (!Shishua.IsSupported) + return; + + using var rng1 = Shishua.Create(new Random(0)); + using var rng2 = Shishua.Create(new Random(0)); + using var rng3 = Shishua.Create(new Random(0)); + + Vector256 vec256 = default; + + var val1 = rng1.NextDouble(); + rng2.NextDoubles256(ref vec256); + //var val3 = rng3.NextDoubles512()[0]; + Assert.True(Math.Abs(vec256[0] - val1) < 0.0001d); + //Assert.True(Math.Abs(val3 - val1) < 0.0001d); + } + public void InitFromBytes() { if (!Shishua.IsSupported)