Skip to content

Commit

Permalink
Merge pull request #139 from dpanfilyonok/profiling
Browse files Browse the repository at this point in the history
Add benchmarks and fix performance
  • Loading branch information
gsvgit authored Jun 3, 2022
2 parents 4494d3f + f2729cb commit 4e17f32
Show file tree
Hide file tree
Showing 37 changed files with 1,050 additions and 308 deletions.
19 changes: 18 additions & 1 deletion Brahma.FSharp.sln
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26124.0
MinimumVisualStudioVersion = 15.0.26124.0
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "docsTool", "docsTool\docsTool.fsproj", "{8855EC73-F6A1-43D3-AFBC-04A3E09F9BD9}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{569DF79C-F1A0-4AE9-BE73-17628649518C}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "YC.OpenCL.NET", "src\YC.OpenCL.NET\YC.OpenCL.NET.csproj", "{455B3C80-98CD-484A-8AD0-6EB93E504D84}"
Expand All @@ -21,7 +23,9 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{4E140F34
EndProject
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "Brahma.FSharp.Tests", "tests\Brahma.FSharp.Tests\Brahma.FSharp.Tests.fsproj", "{D607C727-7FFB-494B-B481-01B1A2569EE8}"
EndProject
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "docsTool", "docsTool\docsTool.fsproj", "{8855EC73-F6A1-43D3-AFBC-04A3E09F9BD9}"
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "benchmarks", "benchmarks", "{ABA8692D-16D1-4258-94E2-D0D73E86F8A1}"
EndProject
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "Brahma.FSharp.Benchmarks", "benchmarks\Brahma.FSharp.Benchmarks\Brahma.FSharp.Benchmarks.fsproj", "{3CB90A8F-D372-426E-930A-65833F46E796}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Expand Down Expand Up @@ -132,6 +136,18 @@ Global
{4179A51B-86A4-4426-9923-4621506E2490}.Release|x64.Build.0 = Release|Any CPU
{4179A51B-86A4-4426-9923-4621506E2490}.Release|x86.ActiveCfg = Release|Any CPU
{4179A51B-86A4-4426-9923-4621506E2490}.Release|x86.Build.0 = Release|Any CPU
{3CB90A8F-D372-426E-930A-65833F46E796}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{3CB90A8F-D372-426E-930A-65833F46E796}.Debug|Any CPU.Build.0 = Debug|Any CPU
{3CB90A8F-D372-426E-930A-65833F46E796}.Debug|x64.ActiveCfg = Debug|Any CPU
{3CB90A8F-D372-426E-930A-65833F46E796}.Debug|x64.Build.0 = Debug|Any CPU
{3CB90A8F-D372-426E-930A-65833F46E796}.Debug|x86.ActiveCfg = Debug|Any CPU
{3CB90A8F-D372-426E-930A-65833F46E796}.Debug|x86.Build.0 = Debug|Any CPU
{3CB90A8F-D372-426E-930A-65833F46E796}.Release|Any CPU.ActiveCfg = Release|Any CPU
{3CB90A8F-D372-426E-930A-65833F46E796}.Release|Any CPU.Build.0 = Release|Any CPU
{3CB90A8F-D372-426E-930A-65833F46E796}.Release|x64.ActiveCfg = Release|Any CPU
{3CB90A8F-D372-426E-930A-65833F46E796}.Release|x64.Build.0 = Release|Any CPU
{3CB90A8F-D372-426E-930A-65833F46E796}.Release|x86.ActiveCfg = Release|Any CPU
{3CB90A8F-D372-426E-930A-65833F46E796}.Release|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{455B3C80-98CD-484A-8AD0-6EB93E504D84} = {569DF79C-F1A0-4AE9-BE73-17628649518C}
Expand All @@ -141,5 +157,6 @@ Global
{25D0A829-1B1D-4918-B559-A6339930E9C5} = {569DF79C-F1A0-4AE9-BE73-17628649518C}
{D607C727-7FFB-494B-B481-01B1A2569EE8} = {4E140F34-4036-4BCF-9FC9-968AD475FC85}
{4179A51B-86A4-4426-9923-4621506E2490} = {569DF79C-F1A0-4AE9-BE73-17628649518C}
{3CB90A8F-D372-426E-930A-65833F46E796} = {ABA8692D-16D1-4258-94E2-D0D73E86F8A1}
EndGlobalSection
EndGlobal
68 changes: 51 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,43 +5,77 @@
[![NuGet Badge](https://buildstats.info/nuget/Brahma.FSharp?includePreReleases=true)](https://www.nuget.org/packages/Brahma.FSharp/)
[![License](https://img.shields.io/badge/License-EPL_1.0-red.svg)](https://opensource.org/licenses/EPL-1.0)

**Brahma.FSharp** provides a way to utilize GPGPUs in your F# programms. It is based on F# quotations to OpenCL translation.
**Brahma.FSharp** provides a way to utilize GPGPU in your F# programs. It is based on F# quotations to OpenCL translation.

Features of Brahma.FSharp:
## Features
* Utilization of OpenCL for communication with GPU. So, you can work not only with NVIDIA devices but with any device which supports OpenCL (e.g. with AMD or Intel devices).
* Not only primitive types, but olso discriminated unions, structs, records are supported.
* Not only primitive types, but also discriminated unions, structs, records are supported.
* Pattern matching, mutable and immutable bindings, nested bindings are supported.
* Fine-grained memory managenent and kernels compilation porcess.
* Mailbox prcessor based interface for communication with devices.
* Fine-grained memory management and kernels compilation process.
* Mailbox processor based interface for communication with devices.

More detailes are available [here](https://yaccconstructor.github.io/Brahma.FSharp/).
More details are available [here](https://yaccconstructor.github.io/Brahma.FSharp/).
Examples of usage are available [here](https://github.com/YaccConstructor/Brahma.FSharp.Examples).

---
## Installation
Install Brahma.FSharp by running:
```shell
dotnet add package Brahma.FSharp
```

### Developing
## Quick Start
```f# script
open Brahma.FSharp
We use [MiniScaffold](https://github.com/TheAngryByrd/MiniScaffold).
let device = ClDevice.GetFirstAppropriateDevice()
let context = RuntimeContext(device)
Make sure the following **requirements** are installed on your system:
let kernel =
<@
fun (range: Range1D) (buffer: int clarray) ->
let gid = range.GlobalID0
buffer.[gid] <- buffer.[gid] + 1
@>
- [dotnet SDK](https://dotnet.microsoft.com/en-us/download/dotnet/5.0) 5.0 or higher
- OpenCL-compatible device and respective OpenCL driver.
opencl {
use! buffer = ClArray.alloc<int> 1024
do! runCommand kernel <| fun kernel ->
kernel
<| Range1D(1024, 256)
<| buffer
return! ClArray.toHost buffer
}
|> ClTask.runSync context
```

---
## Contributing
Contributions, issues and feature requests are welcome.
Feel free to check [issues](https://github.com/YaccConstructor/Brahma.FSharp/issues) page if you want to contribute.

[//]: # (We use [MiniScaffold]&#40;https://github.com/TheAngryByrd/MiniScaffold&#41; template for this library.)

### Build
Make sure the following **requirements** are installed on your system:
- [dotnet SDK](https://dotnet.microsoft.com/en-us/download/dotnet/5.0) 5.0 or higher
- OpenCL-compatible device and respective OpenCL driver

To build and run all tests:

On Windows
- on Windows
```cmd
build.cmd
```

On Linux/macOS
```sh
- on Linux/macOS
```shell
./build.sh
```
To find more options look at [MiniScaffold](https://github.com/TheAngryByrd/MiniScaffold).

## License
This project is licensed under the EPL-1.0 License. The license text can be found in the [license file](https://github.com/YaccConstructor/Brahma.FSharp/blob/master/LICENSE.md).

To find more options look at [MiniScaffold](https://github.com/TheAngryByrd/MiniScaffold).
## FAQ
Set up the `BRAHMA_OCL_PATH` environment variable.

Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
namespace Brahma.FSharp.Benchmarks

open BenchmarkDotNet.Attributes

/// Common base type for the atomic-operation benchmarks.
/// Exposes a fixed `WgSize` and a `GlobalWorkSize` that BenchmarkDotNet
/// fills in from `GlobalWorkSizeProvider`.
[<AbstractClass>]
type AtomicBenchamrks() =
    /// Always 256 (presumably the work-group size, judging by the name).
    member self.WgSize = 256

    /// Benchmark parameter; starts at 0 and is assigned from the values
    /// yielded by `GlobalWorkSizeProvider`.
    [<ParamsSource("GlobalWorkSizeProvider")>]
    member val GlobalWorkSize = 0 with get, set

    /// Candidate values for `GlobalWorkSize`: 1000 and 100_000.
    static member GlobalWorkSizeProvider =
        seq {
            yield 1000
            yield 100_000
        }
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
namespace Brahma.FSharp.Benchmarks

open BenchmarkDotNet.Attributes
open Brahma.FSharp
open Brahma.FSharp.OpenCL.Shared
open FSharp.Quotations

/// Shared scaffolding for the Brahma.FSharp atomic benchmarks.
/// Derived types supply the quoted kernel through `Command`; this type
/// compiles it, allocates a fresh `int clcell` for every iteration,
/// launches the program and disposes the cell afterwards.
[<AbstractClass>]
type BrahmaAtomicBenchamrks() =
    inherit AtomicBenchamrks()

    /// Compiled program; assigned by `CompileProgram`.
    member val Program = Unchecked.defaultof<ClProgram<Range1D, int clcell -> unit>> with get, set

    /// Cell handed to the kernel; assigned by `AllocCellOnDevice`.
    member val Cell = Unchecked.defaultof<int clcell> with get, set

    /// Benchmark parameter taken from `AvaliableContextsProvider`.
    [<ParamsSource("AvaliableContextsProvider")>]
    member val Context = Unchecked.defaultof<RuntimeContext> with get, set

    /// Quoted kernel to benchmark.
    abstract Command: Expr<Range1D -> int clcell -> unit>

    /// Compiles `Command` with the current context and stores the result in `Program`.
    [<GlobalSetup>]
    member self.CompileProgram() =
        let compiled = self.Context.ClContext.Compile(self.Command)
        self.Program <- compiled

    /// Allocates a new cell through the `opencl` workflow and stores it in `Cell`.
    [<IterationSetup>]
    member self.AllocCellOnDevice() =
        let allocation = opencl { return! ClCell.alloc<int> () }
        self.Cell <- ClTask.runSync self.Context allocation

    /// Runs `Program` once over a range built from `GlobalWorkSize` and `WgSize`,
    /// passing `Cell` as the kernel argument.
    abstract RunProgram : unit -> unit
    default self.RunProgram() =
        let launch =
            opencl {
                do! runProgram self.Program <| fun kernel ->
                    kernel (Range1D.CreateValid(self.GlobalWorkSize, self.WgSize)) self.Cell
            }

        ClTask.runSync self.Context launch

    /// Disposes the cell allocated for the iteration that just finished.
    [<IterationCleanup>]
    member self.CleanCell() =
        self.Cell.Dispose()

    /// One `RuntimeContext` per device returned for `Platform.Nvidia`.
    static member AvaliableContextsProvider =
        let devices = ClDevice.GetAvailableDevices(Platform.Nvidia)
        devices |> Seq.map RuntimeContext

/// Brahma benchmark case whose kernel uses `atomic (+)`; registered as the
/// BenchmarkDotNet baseline.
type BrahmaNativeAtomicBenchmarks() =
inherit BrahmaAtomicBenchamrks()

/// Quoted kernel: applies `atomic (+)` to `acc.Value` with operand 1 and
/// discards the returned value. `range` is not used in the body.
override this.Command =
<@
fun (range: Range1D) (acc: int clcell) ->
atomic (+) acc.Value 1 |> ignore
@>

/// Overridden only to attach the `Benchmark` attribute; the work is done by
/// the base implementation.
[<Benchmark(Baseline = true)>]
override this.RunProgram() = base.RunProgram()

/// Brahma benchmark case whose kernel passes a lambda to `atomic`.
/// NOTE(review): the type name suggests this form is lowered to a
/// spinlock-based implementation - confirm against the translator.
type BrahmaSpinlockAtomicBenchmarks() =
inherit BrahmaAtomicBenchamrks()

/// Quoted kernel: applies `atomic` with the lambda `fun x -> x + 1` to
/// `acc.Value` and discards the returned value. `range` is not used in the body.
override this.Command =
<@
fun (range: Range1D) (acc: int clcell) ->
atomic (fun x -> x + 1) acc.Value |> ignore
@>

/// Overridden only to attach the `Benchmark` attribute; the work is done by
/// the base implementation.
[<Benchmark>]
override this.RunProgram() = base.RunProgram()
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
namespace Brahma.FSharp.Benchmarks.Ilgpu

open System
open Brahma.FSharp.Benchmarks
open BenchmarkDotNet.Attributes
open ILGPU
open ILGPU.AtomicOperations
open ILGPU.Runtime
open ILGPU.Runtime.Cuda

/// Shared scaffolding for the ILGPU atomic benchmarks.
/// Derived types supply the kernel delegate through `Command`; this type
/// loads it on a CUDA accelerator, allocates a zeroed one-element buffer
/// for every iteration, launches the kernel and releases the resources.
[<AbstractClass>]
type IlgpuAtomicBenchamrks() =
    inherit AtomicBenchamrks()

    /// Loaded kernel launcher; assigned by `CompileProgram`.
    member val Program = Unchecked.defaultof<Action<Index1D, VariableView<int>>> with get, set

    /// One-element device buffer; assigned by `AllocCellOnDevice`.
    member val Cell = Unchecked.defaultof<MemoryBuffer1D<int, Stride1D.Dense>> with get, set

    /// CUDA accelerator created with index 0 on a default ILGPU context.
    /// Both the accelerator and its context are released by `DisposeAccelerator`.
    member val Accelerator =
        let context = Context.CreateDefault()
        context.CreateCudaAccelerator(0)

    /// Kernel delegate to benchmark.
    abstract Command: Action<Index1D, VariableView<int>>

    /// Loads `Command` as an auto-grouped stream kernel and stores the launcher in `Program`.
    [<GlobalSetup>]
    member this.CompileProgram() =
        this.Program <- this.Accelerator.LoadAutoGroupedStreamKernel<Index1D, VariableView<int>>(this.Command)

    /// Allocates a fresh one-element buffer and zeroes it.
    [<IterationSetup>]
    member this.AllocCellOnDevice() =
        this.Cell <- this.Accelerator.Allocate1D<int>(1L)
        this.Cell.MemSetToZero()

    /// Launches the kernel over `GlobalWorkSize` indices against element 0 of `Cell`,
    /// then synchronizes the accelerator so the launch is complete on return.
    abstract RunProgram : unit -> unit
    default this.RunProgram() =
        this.Program.Invoke(Index1D this.GlobalWorkSize, this.Cell.View.VariableView(Index1D 0))
        this.Accelerator.Synchronize()

    /// Disposes the buffer allocated for the iteration that just finished.
    [<IterationCleanup>]
    member this.CleanCell() =
        this.Cell.Dispose()

    /// Releases the accelerator and the context it was created from.
    /// Without this the native CUDA resources created by the `Accelerator`
    /// initializer are never freed. The accelerator is disposed first because
    /// it depends on the context.
    [<GlobalCleanup>]
    member this.DisposeAccelerator() =
        let accelerator = this.Accelerator
        let context = accelerator.Context
        accelerator.Dispose()
        context.Dispose()

/// ILGPU benchmark case whose kernel calls `Atomic.Add`; registered as the
/// BenchmarkDotNet baseline.
type IlgpuNativeAtomicBenchmarks() =
inherit IlgpuAtomicBenchamrks()

/// Kernel delegate: calls `Atomic.Add` on the viewed value with operand 1 and
/// discards the returned value. `index` is not used in the body.
override this.Command =
Action<Index1D, VariableView<int>>(fun index dataView ->
Atomic.Add(&dataView.Value, 1) |> ignore
)

/// Overridden only to attach the `Benchmark` attribute; the work is done by
/// the base implementation.
[<Benchmark(Baseline = true)>]
override this.RunProgram() = base.RunProgram()

/// Integer addition packaged as an ILGPU atomic operation; an instance is
/// passed to `Atomic.MakeAtomic` by `IlgpuSpinlockAtomicBenchmarks`.
[<Struct>]
type AddOp =
    interface IAtomicOperation<int> with
        /// Returns `current + value`.
        member op.Operation(current, value) = current + value

/// Integer compare-exchange packaged for ILGPU; an instance is passed to
/// `Atomic.MakeAtomic` by `IlgpuSpinlockAtomicBenchmarks`.
[<Struct>]
type CmpXchOp =
    interface ICompareExchangeOperation<int> with
        /// Forwards to `Atomic.CompareExchange` on the address of `target`.
        member op.CompareExchange(target, compare, value) =
            Atomic.CompareExchange(&target, compare, value)

        /// Two values are the same when they are equal.
        member op.IsSame(left, right) = left = right

/// ILGPU benchmark case whose kernel performs the increment through
/// `Atomic.MakeAtomic` with the `AddOp` and `CmpXchOp` operation structs.
type IlgpuSpinlockAtomicBenchmarks() =
inherit IlgpuAtomicBenchamrks()

/// Kernel delegate: calls `Atomic.MakeAtomic` on the viewed value with
/// operand 1, using `AddOp` as the operation and `CmpXchOp` as the
/// compare-exchange; the returned value is discarded. `index` is not used.
override this.Command =
Action<Index1D, VariableView<int>>(fun index dataView ->
Atomic.MakeAtomic(
&dataView.Value,
1,
AddOp(),
CmpXchOp()
) |> ignore
)

/// Overridden only to attach the `Benchmark` attribute; the work is done by
/// the base implementation.
[<Benchmark>]
override this.RunProgram() = base.RunProgram()
73 changes: 73 additions & 0 deletions benchmarks/Brahma.FSharp.Benchmarks/BenchmarksSimple.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
namespace Brahma.FSharp.Benchmarks

open BenchmarkDotNet.Attributes
open Brahma.FSharp
open Brahma.FSharp.OpenCL.Shared
open FSharp.Quotations

/// Base type for benchmarks that talk to the command queue directly.
/// It compares posting "set arguments + run" against posting "set arguments"
/// alone, for a kernel supplied by the derived type through `Command`.
[<AbstractClass>]
type SimpleBenchamrks() =
    /// Compiled program; assigned by `CompileProgram`.
    member val Program = Unchecked.defaultof<ClProgram<Range1D, int clcell -> unit>> with get, set

    /// Kernel obtained from `Program`; re-assigned before every iteration.
    member val Kernel = Unchecked.defaultof<ClKernel<Range1D, int clcell -> unit>> with get, set

    /// Cell handed to the kernel; re-allocated before every iteration.
    member val Cell = Unchecked.defaultof<int clcell> with get, set

    /// Benchmark parameter taken from `AvaliableContextsProvider`.
    [<ParamsSource("AvaliableContextsProvider")>]
    member val Context = Unchecked.defaultof<RuntimeContext> with get, set

    /// Always 100.
    member self.GlobalWorkSize = 100

    /// Always 256.
    member self.WgSize = 256

    /// Quoted kernel to benchmark.
    abstract Command: Expr<Range1D -> int clcell -> unit>

    /// Compiles `Command` with the current context and stores the result in `Program`.
    [<GlobalSetup>]
    member self.CompileProgram() =
        let compiled = self.Context.ClContext.Compile(self.Command)
        self.Program <- compiled

    /// Allocates a new cell, then fetches a kernel from the compiled program.
    [<IterationSetup>]
    member self.AllocCellOnDevice() =
        let allocation = opencl { return! ClCell.alloc<int> () }
        self.Cell <- ClTask.runSync self.Context allocation

        self.Kernel <- self.Program.GetKernel()

    /// Posts a set-arguments message, then a run message, then waits for the
    /// queue's reply to `MsgNotifyMe`.
    [<Benchmark>]
    member self.SetAndRunKernel() =
        let queue = self.Context.CommandQueue

        let setArguments =
            Msg.MsgSetArguments(fun () ->
                self.Kernel.KernelFunc
                    (Range1D.CreateValid(self.GlobalWorkSize, self.WgSize))
                    self.Cell)

        queue.Post(setArguments)
        queue.Post(Msg.CreateRunMsg self.Kernel)
        queue.PostAndReply(MsgNotifyMe)

    /// Posts only the set-arguments message, then waits for the queue's reply
    /// to `MsgNotifyMe`; no run message is sent.
    [<Benchmark>]
    member self.SetWithoutRunningKernel() =
        let queue = self.Context.CommandQueue

        let setArguments =
            Msg.MsgSetArguments(fun () ->
                self.Kernel.KernelFunc
                    (Range1D.CreateValid(self.GlobalWorkSize, self.WgSize))
                    self.Cell)

        queue.Post(setArguments)
        queue.PostAndReply(MsgNotifyMe)

    /// Disposes the cell allocated for the iteration that just finished.
    [<IterationCleanup>]
    member self.CleanCell() =
        self.Cell.Dispose()

    /// One `RuntimeContext` per device returned for `Platform.Nvidia`.
    static member AvaliableContextsProvider =
        let devices = ClDevice.GetAvailableDevices(Platform.Nvidia)
        devices |> Seq.map RuntimeContext

/// `SimpleBenchamrks` case whose kernel performs a plain (non-atomic) store.
type SimpleBenchamrks1() =
inherit SimpleBenchamrks()

/// Quoted kernel: assigns 1 to `acc.Value`. `range` is not used in the body.
override this.Command =
<@
fun (range: Range1D) (acc: int clcell) ->
acc.Value <- 1
@>

/// `SimpleBenchamrks` case whose kernel passes a lambda to `atomic`.
type SimpleBenchamrks2() =
inherit SimpleBenchamrks()

/// Quoted kernel: applies `atomic` with the lambda `fun x -> x + 1` to
/// `acc.Value` and discards the returned value. `range` is not used in the body.
override this.Command =
<@
fun (range: Range1D) (acc: int clcell) ->
atomic (fun x -> x + 1) acc.Value |> ignore
@>
Loading

0 comments on commit 4e17f32

Please sign in to comment.