Add Distributed module (#41)
Closes #21. This PR implements the functionality to run FastIce on multiple GPUs and multiple nodes, using MPI for communication. Features include overlapping MPI communication with computation on the GPU, and compatibility with Fields and BoundaryConditions.
utkinis authored Nov 24, 2023
1 parent 5cc6c3e commit 8da15f4
Showing 74 changed files with 2,495 additions and 1,342 deletions.
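At a glance, the new Distributed module composes with the existing building blocks as in the example scripts added below: an `Architecture` couples a KernelAbstractions backend with an MPI process topology, fields are allocated on each rank's local portion of a global `CartesianGrid`, and results can be gathered on rank 0. A condensed sketch, distilled from `scripts_future_API/benchmark_dbc.jl` further down (the launcher and rank count are assumptions; run with 8 MPI ranks for the `(2, 2, 2)` topology):

```julia
using FastIce.Architectures, FastIce.Distributed, FastIce.Fields, FastIce.Grids
using KernelAbstractions, MPI

MPI.Init()
# 2×2×2 MPI process topology on the CPU backend (swap in a GPU backend via the extensions)
arch  = Architecture(CPU(), (2, 2, 2))
grid  = CartesianGrid(; origin=(0.0, 0.0, 0.0), extent=(1.0, 1.0, 1.0), size=(5, 7, 5))
field = Field(backend(arch), grid, (Center(), Center(), Center()); halo=1)
fill!(parent(field), global_rank(details(arch)))  # each rank tags its local block
# ... launch kernels with `launch!`, exchanging halos through boundary conditions ...
MPI.Finalize()
```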
7 changes: 7 additions & 0 deletions .JuliaFormatter.toml
@@ -0,0 +1,7 @@
style = "yas"
margin = 140
align_assignment = true
whitespace_ops_in_indices = false
import_to_using = false
pipe_to_function_call = false
always_use_return = false
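The new `.JuliaFormatter.toml` pins the YAS style with a 140-character margin for the whole repository. A minimal sketch of applying it locally, assuming JuliaFormatter.jl is installed in your environment:

```julia
using JuliaFormatter

# Format the repository in place; the settings above are picked up automatically
# from .JuliaFormatter.toml at the project root.
format(".")
```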
5 changes: 3 additions & 2 deletions .buildkite/pipeline.yml
@@ -28,7 +28,7 @@ steps:
matrix:
setup:
version:
- "1.9"
- "1.10"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.version}}"
@@ -45,10 +45,11 @@ steps:
agents:
queue: "juliagpu"
rocm: "*"
rocmgpu: "gfx1101"
timeout_in_minutes: 120
soft_fail:
- exit_status: 3
env:
JULIA_NUM_THREADS: 4
env:
SECRET_CODECOV_TOKEN: "0IoqMRJlTdzvkxpJfv/d4uQBzH0u5Odph6JiQLEASjdh7OPCxmy8ADN7tRPYECguthAFTVnsKeIWpgCyvaJcLY6+sFqlYraL0XOGGX/BCrBQfRvMNKfY8WRf6Hc3NFCyHqFkONFYbxYnFbpXYtdZKbfWDkRHB0bu2JqCbzhN2Yk29dmj2PZPAtUkM+0Uab7cDEzfM/FDwOEssm8bnR/HQRe02DASAyxQGVxcnSZJGZr9IWiPLq6a5qyvN7tkk6FnkMbobwkA48L2fffZQCQF/jlIxc4/yOk9r7P9RVTjWIoSxA59mfuUbKlVHokvXwlVvNS9gXbGOf9gqabfyjcqUA==;U2FsdGVkX19S+m5lHSaFCpYeyDqSxPrqJ9OGWCWUTNDao2X1lzTtCEYQG7YI4abf+9pMnp2msk8JAuw2W7ugQQ=="
16 changes: 12 additions & 4 deletions Project.toml
@@ -1,7 +1,7 @@
name = "FastIce"
uuid = "e0de9f13-a007-490e-b696-b07d031015ca"
authors = ["Ludovic Raess <ludovic.rass@gmail.com>, Ivan Utkin <utkin@hey.com> and contributors"]
version = "0.1.0"
version = "0.2.0"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -13,19 +13,27 @@ LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"

[compat]
Adapt = "3"
AMDGPU = "0.7"
CUDA = "5"
ElasticArrays = "1"
GeometryBasics = "0.4"
HDF5 = "0.17"
KernelAbstractions = "0.9"
LightXML = "0.9"
MPI = "0.20"
MPIPreferences = "0.1"
OffsetArrays = "1"
Preferences = "1"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"

[extensions]
FastIceCUDAExt = "CUDA"
FastIceAMDGPUExt = "AMDGPU"
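`MPI` and `MPIPreferences` become regular dependencies so the Distributed module can be pointed at a cluster's own (possibly GPU-aware) MPI. A hedged sketch of the one-off setup; the library name below is only an example and must match the target system:

```julia
using MPIPreferences

# Record the system MPI in LocalPreferences.toml so that subsequent `using MPI`
# loads it instead of the bundled MPICH_jll binary.
MPIPreferences.use_system_binary(; library_names=["libmpi_cray"])
```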
2 changes: 1 addition & 1 deletion README.md
@@ -8,4 +8,4 @@ Parallel multi-xPU iterative **FastIce** flow solvers

FastIce is currently under active development in order to run at scale on the LUMI AMD-powered GPU supercomputer within the EuroHPC [**STREAM** project](https://ptsolvers.github.io/GPU4GEO/stream/).

Checkout the non-existing [documentation](https://PTsolvers.github.io/FastIce.jl/dev).
Check out the [documentation](https://PTsolvers.github.io/FastIce.jl/dev) for API reference and usage.
4 changes: 3 additions & 1 deletion docs/make.jl
@@ -5,11 +5,13 @@ push!(LOAD_PATH,"../src/")

makedocs(
sitename = "FastIce",
authors="Ludovic Räss, Ivan utkin and contributors",
authors="Ludovic Räss, Ivan Utkin and contributors",
format = Documenter.HTML(; prettyurls=get(ENV, "CI", nothing) == "true"), # easier local build
modules = [FastIce],
pages=[
"Home" => "index.md",
"Usage" => "usage.md",
"Library" => "library.md"
]
)

10 changes: 10 additions & 0 deletions docs/src/library.md
@@ -0,0 +1,10 @@
# Library

## Modules

### Grids

```@autodocs
Modules = [FastIce.Grids, FastIce.Distributed]
Order = [:type, :function]
```
29 changes: 29 additions & 0 deletions docs/src/usage.md
@@ -0,0 +1,29 @@
# Usage

## Running tests

### CPU tests

To run the FastIce test suite on the CPU, simply run `test` from the package mode or use `Pkg`:
```julia-repl
using Pkg
Pkg.test("FastIce")
```

### GPU tests

To run the FastIce test suite on the CUDA or ROC backend (Nvidia or AMD GPUs, respectively), run the tests using `Pkg` with the following `test_args`:

#### For CUDA backend (Nvidia GPU):

```julia-repl
using Pkg
Pkg.test("FastIce"; test_args=["--backend=CUDA"])
```

#### For ROC backend (AMD GPU):

```julia-repl
using Pkg
Pkg.test("FastIce"; test_args=["--backend=AMDGPU"])
```
14 changes: 0 additions & 14 deletions ext/AMDGPUExt/AMDGPUExt.jl

This file was deleted.

14 changes: 0 additions & 14 deletions ext/CUDAExt/CUDAExt.jl

This file was deleted.

14 changes: 14 additions & 0 deletions ext/FastIceAMDGPUExt/FastIceAMDGPUExt.jl
@@ -0,0 +1,14 @@
module FastIceAMDGPUExt

using FastIce, AMDGPU, AMDGPU.ROCKernels
import FastIce.Architectures: heuristic_groupsize, set_device!, get_device

set_device!(dev::HIPDevice) = AMDGPU.device!(dev)

get_device(::ROCBackend, id::Integer) = HIPDevice(id)

heuristic_groupsize(::HIPDevice, ::Val{1}) = (256, )
heuristic_groupsize(::HIPDevice, ::Val{2}) = (128, 2, )
heuristic_groupsize(::HIPDevice, ::Val{3}) = (128, 2, 1, )

end
15 changes: 15 additions & 0 deletions ext/FastIceCUDAExt/FastIceCUDAExt.jl
@@ -0,0 +1,15 @@
module FastIceCUDAExt

using FastIce, CUDA, CUDA.CUDAKernels

import FastIce.Architectures: heuristic_groupsize, set_device!, get_device

set_device!(dev::CuDevice) = CUDA.device!(dev)

get_device(::CUDABackend, id::Integer) = CuDevice(id - 1)

heuristic_groupsize(::CuDevice, ::Val{1}) = (256,)
heuristic_groupsize(::CuDevice, ::Val{2}) = (32, 8)
heuristic_groupsize(::CuDevice, ::Val{3}) = (32, 8, 1)

end
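The deleted `ext/CUDAExt` and `ext/AMDGPUExt` are replaced by the package extensions declared under `[weakdeps]`/`[extensions]` in `Project.toml`, so the GPU hooks now load automatically when the corresponding GPU package is imported next to FastIce. A sketch of the effect (whether these helpers are re-exported is an assumption, so they are qualified by module here):

```julia
using FastIce, CUDA, CUDA.CUDAKernels     # importing CUDA activates FastIceCUDAExt
import FastIce.Architectures as Arch

dev = Arch.get_device(CUDABackend(), 1)   # 1-based id, mapped to the zero-based CuDevice(0)
Arch.set_device!(dev)
Arch.heuristic_groupsize(dev, Val(3))     # -> (32, 8, 1)
```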
3 changes: 2 additions & 1 deletion scripts_future_API/Project.toml
@@ -1,5 +1,6 @@
[deps]
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
FastIce = "e0de9f13-a007-490e-b696-b07d031015ca"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
56 changes: 56 additions & 0 deletions scripts_future_API/benchmark_dbc.jl
@@ -0,0 +1,56 @@
using FastIce.Architectures
using FastIce.Distributed
using FastIce.Fields
using FastIce.Grids
using FastIce.BoundaryConditions
using FastIce.KernelLaunch

using KernelAbstractions
using MPI

@kernel function fill_field!(f, val, offset=nothing)
I = @index(Global, Cartesian)
if !isnothing(offset)
I += offset
end
f[I] = val
end

MPI.Init()

arch = Architecture(CPU(), (2, 2, 2))
grid = CartesianGrid(; origin=(0.0, 0.0, 0.0), extent=(1.0, 1.0, 1.0), size=(5, 7, 5))
field = Field(backend(arch), grid, (Center(), Center(), Center()); halo=1)

me = global_rank(details(arch))

fill!(parent(field), Inf)

bc = BoundaryConditionsBatch((field,), (DirichletBC{FullCell}(-me-10),))

boundary_conditions = override_boundary_conditions(arch, ((bc, bc), (bc, bc), (bc, bc)); exchange=true)

hide_boundaries = HideBoundaries{3}(arch)

outer_width = (2, 2, 2)

launch!(arch, grid, fill_field! => (field, me); location=location(field), hide_boundaries, boundary_conditions, outer_width)

# sleep(0.25me)
# @show coordinates(details(arch))
# display(parent(field))

field_g = if global_rank(details(arch)) == 0
KernelAbstractions.allocate(Architectures.backend(arch), eltype(field), dimensions(details(arch)) .* size(field))
else
nothing
end

gather!(arch, field_g, field)

if global_rank(details(arch)) == 0
println("global matrix:")
display(field_g)
end

MPI.Finalize()
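The script above assumes one process per position in the `(2, 2, 2)` topology, i.e. 8 MPI ranks. A sketch of launching it from Julia (launcher flags and project path are assumptions; any equivalent `mpiexec`/`srun` invocation works):

```julia
using MPI

# Use the launcher matching the configured MPI binary (see the MPIPreferences note above).
MPI.mpiexec() do mpirun
    run(`$mpirun -n 8 $(Base.julia_cmd()) --project=. benchmark_dbc.jl`)
end
```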
109 changes: 109 additions & 0 deletions scripts_future_API/benchmark_diffusion_2D.jl
@@ -0,0 +1,109 @@
using FastIce.Grids
using FastIce.GridOperators
using FastIce.Fields
using FastIce.Architectures
using FastIce.BoundaryConditions
using FastIce.Distributed
using FastIce.KernelLaunch

using KernelAbstractions
using MPI

using Plots

@kernel function update_C!(C, qC, dt, Δ, offset=nothing)
I = @index(Global, Cartesian)
isnothing(offset) || (I += offset)
@inbounds if checkbounds(Bool, C, I)
C[I] -= dt * (∂ᶜx(qC.x, I) / Δ.x +
∂ᶜy(qC.y, I) / Δ.y)
end
end

@kernel function update_qC!(qC, C, dc, Δ, offset=nothing)
I = @index(Global, Cartesian)
isnothing(offset) || (I += offset)
@inbounds if checkbounds(Bool, qC.x, I)
qC.x[I] = -dc * ∂ᵛx(C, I) / Δ.x
end
@inbounds if checkbounds(Bool, qC.y, I)
qC.y[I] = -dc * ∂ᵛy(C, I) / Δ.y
end
end

function diffusion_2D(ka_backend=CPU())
# setup arch
arch = Architecture(ka_backend, (0, 0))
topo = details(arch)
# physics
lx, ly = 10.0, 10.0
dc = 1
# numerics
size_g = (32, 32)
nt = 1000
# preprocessing
size_g = global_grid_size(topo, size_g)
global_grid = CartesianGrid(; origin=(-0.5lx, -0.5ly),
extent=(lx, ly),
size=size_g)
grid = local_grid(global_grid, topo)
Δ = NamedTuple{(:x, :y)}(spacing(global_grid))
dt = minimum(Δ)^2 / dc / ndims(grid) / 2.1
hide_boundaries = HideBoundaries{ndims(grid)}(arch)
outer_width = (4, 4)
# fields
C = Field(arch, grid, Center(); halo=1)
qC = (x = Field(arch, grid, (Vertex(), Center()); halo=1),
y = Field(arch, grid, (Center(), Vertex()); halo=1))
C_g = if global_rank(topo) == 0
KernelAbstractions.allocate(Architectures.backend(arch), eltype(C), size_g)
else
nothing
end
# initial condition
foreach(comp -> fill!(parent(comp), 0.0), qC)
# fill!(parent(C), me)
set!(C, grid, (x, y) -> exp(-x^2 - y^2))
# set!(C, me)
# boundary conditions
zero_flux_bc = DirichletBC{FullCell}(0.0)
bc_q = (x = BoundaryConditionsBatch((qC.x, qC.y), (zero_flux_bc, nothing)),
y = BoundaryConditionsBatch((qC.x, qC.y), (nothing, zero_flux_bc)))
# zero flux at physical boundaries and nothing at MPI boundaries
bc_q = override_boundary_conditions(arch, ((bc_q.x, bc_q.x), (bc_q.y, bc_q.y)); exchange=true)
# nothing at physical boundaries and communication at MPI boundaries
bc_c = BoundaryConditionsBatch((C,), nothing)
bc_c = override_boundary_conditions(arch, ((bc_c, bc_c), (bc_c, bc_c)); exchange=true)
for D in ndims(grid):-1:1
apply_boundary_conditions!(Val(1), Val(D), arch, grid, bc_c[D][1])
apply_boundary_conditions!(Val(2), Val(D), arch, grid, bc_c[D][2])
apply_boundary_conditions!(Val(1), Val(D), arch, grid, bc_q[D][1])
apply_boundary_conditions!(Val(2), Val(D), arch, grid, bc_q[D][2])
end
# time loop
if global_rank(topo) == 0
anim = Animation()
end
for it in 1:nt
(global_rank(topo) == 0) && println("it = $it")
launch!(arch, grid, update_qC! => (qC, C, dc, Δ); location=Vertex(), hide_boundaries, boundary_conditions=bc_q, outer_width)
launch!(arch, grid, update_C! => (C, qC, dt, Δ); location=Center(), expand=1)
synchronize(Architectures.backend(arch))
if it % 5 == 0
gather!(arch, C_g, C)
if global_rank(topo) == 0
heatmap(C_g; aspect_ratio=1, size=(600, 600), clims=(0, 1))
frame(anim)
end
end
end
if global_rank(topo) == 0
gif(anim, "C.gif")
end

return
end

MPI.Init()
diffusion_2D()
MPI.Finalize()