Skip to content

Commit

Permalink
More vectorization of Shishua, .NET 8, use standard intrinsics API (#1)
Browse files Browse the repository at this point in the history
  • Loading branch information
martinothamar committed Jul 9, 2023
1 parent 1be51d3 commit ffd97e9
Show file tree
Hide file tree
Showing 42 changed files with 298 additions and 7,825 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
- name: Setup .NET Core
uses: actions/setup-dotnet@v1
with:
dotnet-version: '7.0.x'
dotnet-version: '8.0.100-preview.5.23303.2'

- name: Install dependencies
run: dotnet restore
Expand Down
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"dotnet.defaultSolution": "Fast.PRNGs.sln"
}
50 changes: 15 additions & 35 deletions Fast.PRNGs.sln
Original file line number Diff line number Diff line change
Expand Up @@ -7,58 +7,30 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{B56AF188-D99
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{82A9760F-251B-4220-9263-153755FA2EC3}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "External", "External", "{12A7C294-6EF5-4FDF-A2BA-A01E320B9C36}"
ProjectSection(SolutionItems) = preProject
src\External\Directory.Build.props = src\External\Directory.Build.props
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RawIntrinsics", "src\External\RawIntrinsics\RawIntrinsics.csproj", "{BA5145CD-6180-4BA3-817F-197158280327}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RawIntrinsicsGenerator", "src\External\RawIntrinsicsGenerator\RawIntrinsicsGenerator.csproj", "{A161A378-55BF-48D2-84FF-DA3F09EA5258}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "_files", "_files", "{3D9E2A5B-D3F0-49AB-BEC3-647C5063537C}"
ProjectSection(SolutionItems) = preProject
Directory.Build.props = Directory.Build.props
global.json = global.json
Fast.PRNGs.sln = Fast.PRNGs.sln
.editorconfig = .editorconfig
.gitignore = .gitignore
.gitattributes = .gitattributes
.gitignore = .gitignore
Directory.Build.props = Directory.Build.props
Fast.PRNGs.sln = Fast.PRNGs.sln
global.json = global.json
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Fast.PRNGs", "src\Fast.PRNGs\Fast.PRNGs.csproj", "{AE271FFA-B5D2-40D8-92E4-71D970142F6D}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Fast.PRNGs", "src\Fast.PRNGs\Fast.PRNGs.csproj", "{AE271FFA-B5D2-40D8-92E4-71D970142F6D}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Fast.PRNGs.Tests", "test\Fast.PRNGs.Tests\Fast.PRNGs.Tests.csproj", "{732E59B8-C209-495B-8608-77E746A68F22}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Fast.PRNGs.Tests", "test\Fast.PRNGs.Tests\Fast.PRNGs.Tests.csproj", "{732E59B8-C209-495B-8608-77E746A68F22}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "benchmark", "benchmark", "{089CE6DA-C860-48D3-95D2-353C7A71C9CD}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Fast.PRNGs.Benchmarks", "benchmark\Fast.PRNGs.Benchmarks\Fast.PRNGs.Benchmarks.csproj", "{2A875B02-B84C-43A3-BF16-593F5E6276BC}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Fast.PRNGs.Benchmarks", "benchmark\Fast.PRNGs.Benchmarks\Fast.PRNGs.Benchmarks.csproj", "{2A875B02-B84C-43A3-BF16-593F5E6276BC}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{12A7C294-6EF5-4FDF-A2BA-A01E320B9C36} = {B56AF188-D999-4444-AE68-4971A573FAA4}
{BA5145CD-6180-4BA3-817F-197158280327} = {12A7C294-6EF5-4FDF-A2BA-A01E320B9C36}
{A161A378-55BF-48D2-84FF-DA3F09EA5258} = {12A7C294-6EF5-4FDF-A2BA-A01E320B9C36}
{AE271FFA-B5D2-40D8-92E4-71D970142F6D} = {B56AF188-D999-4444-AE68-4971A573FAA4}
{732E59B8-C209-495B-8608-77E746A68F22} = {82A9760F-251B-4220-9263-153755FA2EC3}
{2A875B02-B84C-43A3-BF16-593F5E6276BC} = {089CE6DA-C860-48D3-95D2-353C7A71C9CD}
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{BA5145CD-6180-4BA3-817F-197158280327}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{BA5145CD-6180-4BA3-817F-197158280327}.Debug|Any CPU.Build.0 = Debug|Any CPU
{BA5145CD-6180-4BA3-817F-197158280327}.Release|Any CPU.ActiveCfg = Release|Any CPU
{BA5145CD-6180-4BA3-817F-197158280327}.Release|Any CPU.Build.0 = Release|Any CPU
{A161A378-55BF-48D2-84FF-DA3F09EA5258}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{A161A378-55BF-48D2-84FF-DA3F09EA5258}.Debug|Any CPU.Build.0 = Debug|Any CPU
{A161A378-55BF-48D2-84FF-DA3F09EA5258}.Release|Any CPU.ActiveCfg = Release|Any CPU
{A161A378-55BF-48D2-84FF-DA3F09EA5258}.Release|Any CPU.Build.0 = Release|Any CPU
{AE271FFA-B5D2-40D8-92E4-71D970142F6D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{AE271FFA-B5D2-40D8-92E4-71D970142F6D}.Debug|Any CPU.Build.0 = Debug|Any CPU
{AE271FFA-B5D2-40D8-92E4-71D970142F6D}.Release|Any CPU.ActiveCfg = Release|Any CPU
Expand All @@ -72,4 +44,12 @@ Global
{2A875B02-B84C-43A3-BF16-593F5E6276BC}.Release|Any CPU.ActiveCfg = Release|Any CPU
{2A875B02-B84C-43A3-BF16-593F5E6276BC}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{AE271FFA-B5D2-40D8-92E4-71D970142F6D} = {B56AF188-D999-4444-AE68-4971A573FAA4}
{732E59B8-C209-495B-8608-77E746A68F22} = {82A9760F-251B-4220-9263-153755FA2EC3}
{2A875B02-B84C-43A3-BF16-593F5E6276BC} = {089CE6DA-C860-48D3-95D2-353C7A71C9CD}
EndGlobalSection
EndGlobal
8 changes: 1 addition & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,5 @@ NOTE - MWC256 is likely poorly implemented (it is supposed to be faster). As see
This is clear from the currently generated assembly, but I'm not sure why those branching instructions are generated. `UInt128` support is pretty new,
so there may be some inefficiencies there.

![Scaling iterations](/img/perf-scaling.png "Scaling iterations")

#### With hardware counters

Instrumented with more diagnostics, including hardware counters

![With hardware counters](/img/perf-hardwarecounters.png "With hardware counters")
![Scaling iterations](/img/perf-scaling-2.png "Scaling iterations")

Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<LangVersion>latest</LangVersion>
<TargetFramework>net7.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<Nullable>disable</Nullable>
<DebugType>pdbonly</DebugType>
<DebugSymbols>true</DebugSymbols>
Expand Down
30 changes: 30 additions & 0 deletions benchmark/Fast.PRNGs.Benchmarks/Internals/ToDoubleBenchmarks.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
namespace Fast.PRNGs.Benchmarks.Internals;

/// <summary>
/// Micro-benchmark comparing two expressions that turn a 64-bit unsigned integer
/// into a <see cref="double"/> by scaling a 53-bit integer with 2^-53.
/// </summary>
[ConfigSource]
public class ToDoublesBenchmark
{
    /// <summary>Mask that keeps the low 53 bits of a 64-bit value.</summary>
    internal const ulong DoubleMask = (1L << 53) - 1;

    /// <summary>Scale factor 2^-53 applied to the masked value.</summary>
    internal const double Norm53 = 1.0d / (1L << 53);

    /// <summary>Input value fed to both benchmark methods.</summary>
    [Params(31512512431231UL)]
    public ulong Value { get; set; }

    /// <summary>
    /// Masks <see cref="Value"/> down to its low 53 bits and scales by <see cref="Norm53"/>.
    /// </summary>
    [Benchmark]
    public double Original() => (Value & DoubleMask) * Norm53;

    /// <summary>
    /// Shifts <see cref="Value"/> right by 11 (keeping its high 53 bits) and scales by 2^-53.
    /// Note that this selects different bits than <see cref="Original"/>.
    /// </summary>
    [Benchmark]
    public double New() => (Value >> 11) * (1.0 / (1ul << 53));

    /// <summary>
    /// Attribute that supplies the BenchmarkDotNet configuration for this class.
    /// </summary>
    private class ConfigSourceAttribute : Attribute, IConfigSource
    {
        public ConfigSourceAttribute()
        {
            // 8 is passed as the byte size per iteration (sizeof(double) == 8).
            Config = new SimpleBenchConfig(8);
        }

        public IConfig Config { get; }
    }
}
56 changes: 42 additions & 14 deletions benchmark/Fast.PRNGs.Benchmarks/PRNGsScaling.cs
Original file line number Diff line number Diff line change
@@ -1,22 +1,31 @@
using BenchmarkDotNet.Environments;
using System.Runtime.Intrinsics;

namespace Fast.PRNGs.Benchmarks;

[Config(typeof(Config))]
[ConfigSource]
public class PRNGsScaling
{
private const int _iterations = 1 << 17;

private Random _random;
private Shishua _shishua;
private Shishua _shishuaSeq;
private Shishua _shishuaVec256;
private Shishua _shishuaVec512;
private Xoroshiro128Plus _xoroshiro128plus;
private Xoshiro256Plus _xoshiro256plus;
private MWC256 _mwc256;

[Params(100_000, 1_000_000)]
[Params(_iterations)]
public int Iterations { get; set; }

[GlobalSetup]
public void Setup()
{
_random = new Random();
_shishua = Shishua.Create();
_shishuaSeq = Shishua.Create();
_shishuaVec256 = Shishua.Create();
_shishuaVec512 = Shishua.Create();
_xoroshiro128plus = Xoroshiro128Plus.Create();
_xoshiro256plus = Xoshiro256Plus.Create();
_mwc256 = MWC256.Create();
Expand All @@ -25,7 +34,9 @@ public void Setup()
[GlobalCleanup]
public void Cleanup()
{
_shishua.Dispose();
_shishuaSeq.Dispose();
_shishuaVec256.Dispose();
_shishuaVec512.Dispose();
}

[Benchmark(Baseline = true)]
Expand All @@ -38,10 +49,30 @@ public double SystemRandomGen()
}

[Benchmark]
public double ShishuaGen()
public double ShishuaSeqGen()
{
for (int i = 0; i < Iterations; i++)
_ = _shishua.NextDouble();
_ = _shishuaSeq.NextDouble();

return default;
}

[Benchmark]
public double ShishuaVec256Gen()
{
Vector256<double> result = default;
for (int i = 0; i < Iterations; i += 4)
_shishuaVec256.NextDoubles256(ref result);

return default;
}

[Benchmark]
public double ShishuaVec512Gen()
{
Vector512<double> result = default;
for (int i = 0; i < Iterations; i += 8)
_shishuaVec512.NextDoubles512(ref result);

return default;
}
Expand Down Expand Up @@ -73,13 +104,10 @@ public double MWC256Gen()
return default;
}

private sealed class Config : ManualConfig
private class ConfigSourceAttribute : Attribute, IConfigSource
{
public Config()
{
this.SummaryStyle = SummaryStyle.Default.WithRatioStyle(RatioStyle.Trend);
this.AddColumn(RankColumn.Arabic);
this.Orderer = new DefaultOrderer(SummaryOrderPolicy.SlowestToFastest, MethodOrderPolicy.Declared);
}
public IConfig Config { get; }

public ConfigSourceAttribute() => Config = new SimpleBenchConfig(_iterations * sizeof(double));
}
}
13 changes: 13 additions & 0 deletions benchmark/Fast.PRNGs.Benchmarks/SimpleBenchConfig.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
namespace Fast.PRNGs.Benchmarks;

/// <summary>
/// Shared benchmark configuration: trend-style ratios, a rank column,
/// slowest-to-fastest ordering, and an optional throughput column.
/// </summary>
internal sealed class SimpleBenchConfig : ManualConfig
{
    /// <summary>
    /// Creates the configuration.
    /// </summary>
    /// <param name="byteSizePerIteration">
    /// Byte size handed to <see cref="ThroughputColumn"/>. When <c>null</c>,
    /// no throughput column is added.
    /// </param>
    public SimpleBenchConfig(ulong? byteSizePerIteration = null)
    {
        Orderer = new DefaultOrderer(SummaryOrderPolicy.SlowestToFastest, MethodOrderPolicy.Declared);
        SummaryStyle = SummaryStyle.Default.WithRatioStyle(RatioStyle.Trend);
        AddColumn(RankColumn.Arabic);

        if (byteSizePerIteration is ulong bytesPerIteration)
        {
            AddColumn(new ThroughputColumn(bytesPerIteration));
        }
    }
}
41 changes: 41 additions & 0 deletions benchmark/Fast.PRNGs.Benchmarks/ThroughputColumn.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
using BenchmarkDotNet.Running;

namespace Fast.PRNGs.Benchmarks;

/// <summary>
/// BenchmarkDotNet summary column that reports throughput in GB/s, computed from a
/// fixed byte size per benchmark invocation and the measured mean time.
/// </summary>
public class ThroughputColumn : IColumn
{
    private readonly ulong _byteSizePerIteration;

    /// <summary>
    /// Creates the column.
    /// </summary>
    /// <param name="byteSizePerIteration">Byte size used as the numerator of the throughput calculation.</param>
    public ThroughputColumn(ulong byteSizePerIteration)
    {
        ColumnName = "Throughput";
        // Build the id from this column's own type name (previously used nameof(TagColumn),
        // which made the id look like it belonged to a different column type).
        Id = nameof(ThroughputColumn) + "." + ColumnName;

        _byteSizePerIteration = byteSizePerIteration;
    }

    public string Id { get; }

    public string ColumnName { get; }

    public bool IsDefault(Summary summary, BenchmarkCase benchmarkCase) => false;

    /// <summary>
    /// Formats the throughput for <paramref name="benchmarkCase"/> using the default summary style.
    /// </summary>
    public string GetValue(Summary summary, BenchmarkCase benchmarkCase) =>
        GetValue(summary, benchmarkCase, SummaryStyle.Default);

    /// <summary>
    /// Formats the throughput for <paramref name="benchmarkCase"/> as <c>"0.00 GB/s"</c>,
    /// using the culture of <paramref name="style"/> so the decimal separator matches the
    /// rest of the summary table.
    /// </summary>
    /// <returns>
    /// The formatted throughput, or <c>"?"</c> when no usable mean is available
    /// (missing report/statistics, zero mean, or NaN mean).
    /// </returns>
    public string GetValue(Summary summary, BenchmarkCase benchmarkCase, SummaryStyle style)
    {
        // The report can be absent for a benchmark case; treat that like missing statistics.
        var stats = summary[benchmarkCase]?.ResultStatistics;
        if (stats is null || stats.Mean == default || double.IsNaN(stats.Mean))
            return "?";

        // bytes -> GB, and mean -> seconds (the / 1e9 treats Mean as nanoseconds).
        var gbs = (_byteSizePerIteration / 1e9d) / (stats.Mean / 1e9d);
        var culture = (style ?? SummaryStyle.Default).CultureInfo;
        return gbs.ToString("0.00", culture) + " GB/s";
    }

    public bool IsAvailable(Summary summary) => true;
    public bool AlwaysShow => true;
    public ColumnCategory Category => ColumnCategory.Metric;
    public int PriorityInCategory => 0;
    public bool IsNumeric => true;
    // NOTE(review): the value is a rate (GB/s), not a size — confirm UnitType.Size is intended.
    public UnitType UnitType => UnitType.Size;
    public string Legend => "Throughput in GB/s";
    public override string ToString() => ColumnName;
}
7 changes: 4 additions & 3 deletions global.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"sdk": {
"version": "7.0.201",
"rollForward": "latestFeature"
"version": "8.0.100-preview.5.23303.2",
"rollForward": "latestFeature",
"allowPrerelease": true
}
}
}
Binary file added img/perf-scaling-2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 0 additions & 8 deletions src/External/Directory.Build.props

This file was deleted.

25 changes: 0 additions & 25 deletions src/External/RawIntrinsics/AVX.ManuallyAdded.cs

This file was deleted.

Loading

0 comments on commit ffd97e9

Please sign in to comment.