Add Distributed module (#41)
Closes #21. This PR implements the functionality to run FastIce on multiple GPUs and multiple nodes, using MPI for communication. Features include overlapping MPI communication with computation on the GPU, and compatibility with Fields and BoundaryConditions.
utkinis authored Nov 24, 2023
1 parent 5cc6c3e commit 8da15f4
Showing 74 changed files with 2,495 additions and 1,342 deletions.
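At a glance, the new Distributed module composes with the existing building blocks as in the example scripts added below: an `Architecture` couples a KernelAbstractions backend with an MPI process topology, fields are allocated on each rank's local portion of a global `CartesianGrid`, and results can be gathered on rank 0. A condensed sketch, distilled from `scripts_future_API/benchmark_dbc.jl` further down (the launcher and rank count are assumptions; run with 8 MPI ranks for the `(2, 2, 2)` topology):

```julia
using FastIce.Architectures, FastIce.Distributed, FastIce.Fields, FastIce.Grids
using KernelAbstractions, MPI

MPI.Init()
# 2×2×2 MPI process topology on the CPU backend (swap in a GPU backend via the extensions)
arch  = Architecture(CPU(), (2, 2, 2))
grid  = CartesianGrid(; origin=(0.0, 0.0, 0.0), extent=(1.0, 1.0, 1.0), size=(5, 7, 5))
field = Field(backend(arch), grid, (Center(), Center(), Center()); halo=1)
fill!(parent(field), global_rank(details(arch)))  # each rank tags its local block
# ... launch kernels with `launch!`, exchanging halos through boundary conditions ...
MPI.Finalize()
```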
7 changes: 7 additions & 0 deletions .JuliaFormatter.toml
@@ -0,0 +1,7 @@
style = "yas"
margin = 140
align_assignment = true
whitespace_ops_in_indices = false
import_to_using = false
pipe_to_function_call = false
always_use_return = false
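The new `.JuliaFormatter.toml` pins the YAS style with a 140-character margin for the whole repository. A minimal sketch of applying it locally, assuming JuliaFormatter.jl is installed in your environment:

```julia
using JuliaFormatter

# Format the repository in place; the settings above are picked up automatically
# from .JuliaFormatter.toml at the project root.
format(".")
```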
5 changes: 3 additions & 2 deletions .buildkite/pipeline.yml
@@ -28,7 +28,7 @@ steps:
matrix:
setup:
version:
- "1.9"
- "1.10"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.version}}"
@@ -45,10 +45,11 @@ steps:
agents:
queue: "juliagpu"
rocm: "*"
rocmgpu: "gfx1101"
timeout_in_minutes: 120
soft_fail:
- exit_status: 3
env:
JULIA_NUM_THREADS: 4
env:
SECRET_CODECOV_TOKEN: "0IoqMRJlTdzvkxpJfv/d4uQBzH0u5Odph6JiQLEASjdh7OPCxmy8ADN7tRPYECguthAFTVnsKeIWpgCyvaJcLY6+sFqlYraL0XOGGX/BCrBQfRvMNKfY8WRf6Hc3NFCyHqFkONFYbxYnFbpXYtdZKbfWDkRHB0bu2JqCbzhN2Yk29dmj2PZPAtUkM+0Uab7cDEzfM/FDwOEssm8bnR/HQRe02DASAyxQGVxcnSZJGZr9IWiPLq6a5qyvN7tkk6FnkMbobwkA48L2fffZQCQF/jlIxc4/yOk9r7P9RVTjWIoSxA59mfuUbKlVHokvXwlVvNS9gXbGOf9gqabfyjcqUA==;U2FsdGVkX19S+m5lHSaFCpYeyDqSxPrqJ9OGWCWUTNDao2X1lzTtCEYQG7YI4abf+9pMnp2msk8JAuw2W7ugQQ=="
16 changes: 12 additions & 4 deletions Project.toml
@@ -1,7 +1,7 @@
name = "FastIce"
uuid = "e0de9f13-a007-490e-b696-b07d031015ca"
authors = ["Ludovic Raess <ludovic.rass@gmail.com>, Ivan Utkin <utkin@hey.com> and contributors"]
version = "0.1.0"
version = "0.2.0"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -13,19 +13,27 @@ LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"

[compat]
Adapt = "3"
AMDGPU = "0.7"
CUDA = "5"
ElasticArrays = "1"
GeometryBasics = "0.4"
HDF5 = "0.17"
KernelAbstractions = "0.9"
LightXML = "0.9"
MPI = "0.20"
MPIPreferences = "0.1"
OffsetArrays = "1"
Preferences = "1"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"

[extensions]
FastIceCUDAExt = "CUDA"
FastIceAMDGPUExt = "AMDGPU"
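`MPI` and `MPIPreferences` become regular dependencies so the Distributed module can be pointed at a cluster's own (possibly GPU-aware) MPI. A hedged sketch of the one-off setup; the library name below is only an example and must match the target system:

```julia
using MPIPreferences

# Record the system MPI in LocalPreferences.toml so that subsequent `using MPI`
# loads it instead of the bundled MPICH_jll binary.
MPIPreferences.use_system_binary(; library_names=["libmpi_cray"])
```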
2 changes: 1 addition & 1 deletion README.md
@@ -8,4 +8,4 @@ Parallel multi-xPU iterative **FastIce** flow solvers

FastIce is currently under active development in order to run at scale on the LUMI AMD-powered GPU supercomputer within the EuroHPC [**STREAM** project](https://ptsolvers.github.io/GPU4GEO/stream/).

Checkout the non-existing [documentation](https://PTsolvers.github.io/FastIce.jl/dev).
Check out the [documentation](https://PTsolvers.github.io/FastIce.jl/dev) for API reference and usage.
4 changes: 3 additions & 1 deletion docs/make.jl
@@ -5,11 +5,13 @@ push!(LOAD_PATH,"../src/")

makedocs(
sitename = "FastIce",
authors="Ludovic Räss, Ivan utkin and contributors",
authors="Ludovic Räss, Ivan Utkin and contributors",
format = Documenter.HTML(; prettyurls=get(ENV, "CI", nothing) == "true"), # easier local build
modules = [FastIce],
pages=[
"Home" => "index.md",
"Usage" => "usage.md",
"Library" => "library.md"
]
)

10 changes: 10 additions & 0 deletions docs/src/library.md
@@ -0,0 +1,10 @@
# Library

## Modules

### Grids

```@autodocs
Modules = [FastIce.Grids, FastIce.Distributed]
Order = [:type, :function]
```
29 changes: 29 additions & 0 deletions docs/src/usage.md
@@ -0,0 +1,29 @@
# Usage

## Running tests

### CPU tests

To run the FastIce test suite on the CPU, simply run `test` from the package mode or use `Pkg`:
```julia-repl
using Pkg
Pkg.test("FastIce")
```

### GPU tests

To run the FastIce test suite on the CUDA or ROC backend (Nvidia or AMD GPUs, respectively), run the tests using `Pkg` with the following `test_args`:

#### For CUDA backend (Nvidia GPU):

```julia-repl
using Pkg
Pkg.test("FastIce"; test_args=["--backend=CUDA"])
```

#### For ROC backend (AMD GPU):

```julia-repl
using Pkg
Pkg.test("FastIce"; test_args=["--backend=AMDGPU"])
```
14 changes: 0 additions & 14 deletions ext/AMDGPUExt/AMDGPUExt.jl

This file was deleted.

14 changes: 0 additions & 14 deletions ext/CUDAExt/CUDAExt.jl

This file was deleted.

14 changes: 14 additions & 0 deletions ext/FastIceAMDGPUExt/FastIceAMDGPUExt.jl
@@ -0,0 +1,14 @@
module FastIceAMDGPUExt

using FastIce, AMDGPU, AMDGPU.ROCKernels
import FastIce.Architectures: heuristic_groupsize, set_device!, get_device

set_device!(dev::HIPDevice) = AMDGPU.device!(dev)

get_device(::ROCBackend, id::Integer) = HIPDevice(id)

heuristic_groupsize(::HIPDevice, ::Val{1}) = (256, )
heuristic_groupsize(::HIPDevice, ::Val{2}) = (128, 2, )
heuristic_groupsize(::HIPDevice, ::Val{3}) = (128, 2, 1, )

end
15 changes: 15 additions & 0 deletions ext/FastIceCUDAExt/FastIceCUDAExt.jl
@@ -0,0 +1,15 @@
module FastIceCUDAExt

using FastIce, CUDA, CUDA.CUDAKernels

import FastIce.Architectures: heuristic_groupsize, set_device!, get_device

set_device!(dev::CuDevice) = CUDA.device!(dev)

get_device(::CUDABackend, id::Integer) = CuDevice(id - 1)

heuristic_groupsize(::CuDevice, ::Val{1}) = (256,)
heuristic_groupsize(::CuDevice, ::Val{2}) = (32, 8)
heuristic_groupsize(::CuDevice, ::Val{3}) = (32, 8, 1)

end
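The deleted `ext/CUDAExt` and `ext/AMDGPUExt` are replaced by the package extensions declared under `[weakdeps]`/`[extensions]` in `Project.toml`, so the GPU hooks now load automatically when the corresponding GPU package is imported next to FastIce. A sketch of the effect (whether these helpers are re-exported is an assumption, so they are qualified by module here):

```julia
using FastIce, CUDA, CUDA.CUDAKernels     # importing CUDA activates FastIceCUDAExt
import FastIce.Architectures as Arch

dev = Arch.get_device(CUDABackend(), 1)   # 1-based id, mapped to the zero-based CuDevice(0)
Arch.set_device!(dev)
Arch.heuristic_groupsize(dev, Val(3))     # -> (32, 8, 1)
```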
3 changes: 2 additions & 1 deletion scripts_future_API/Project.toml
@@ -1,5 +1,6 @@
[deps]
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
FastIce = "e0de9f13-a007-490e-b696-b07d031015ca"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
56 changes: 56 additions & 0 deletions scripts_future_API/benchmark_dbc.jl
@@ -0,0 +1,56 @@
using FastIce.Architectures
using FastIce.Distributed
using FastIce.Fields
using FastIce.Grids
using FastIce.BoundaryConditions
using FastIce.KernelLaunch

using KernelAbstractions
using MPI

@kernel function fill_field!(f, val, offset=nothing)
I = @index(Global, Cartesian)
if !isnothing(offset)
I += offset
end
f[I] = val
end

MPI.Init()

arch = Architecture(CPU(), (2, 2, 2))
grid = CartesianGrid(; origin=(0.0, 0.0, 0.0), extent=(1.0, 1.0, 1.0), size=(5, 7, 5))
field = Field(backend(arch), grid, (Center(), Center(), Center()); halo=1)

me = global_rank(details(arch))

fill!(parent(field), Inf)

bc = BoundaryConditionsBatch((field,), (DirichletBC{FullCell}(-me-10),))

boundary_conditions = override_boundary_conditions(arch, ((bc, bc), (bc, bc), (bc, bc)); exchange=true)

hide_boundaries = HideBoundaries{3}(arch)

outer_width = (2, 2, 2)

launch!(arch, grid, fill_field! => (field, me); location=location(field), hide_boundaries, boundary_conditions, outer_width)

# sleep(0.25me)
# @show coordinates(details(arch))
# display(parent(field))

field_g = if global_rank(details(arch)) == 0
KernelAbstractions.allocate(Architectures.backend(arch), eltype(field), dimensions(details(arch)) .* size(field))
else
nothing
end

gather!(arch, field_g, field)

if global_rank(details(arch)) == 0
println("global matrix:")
display(field_g)
end

MPI.Finalize()
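The script above assumes one process per position in the `(2, 2, 2)` topology, i.e. 8 MPI ranks. A sketch of launching it from Julia (launcher flags and project path are assumptions; any equivalent `mpiexec`/`srun` invocation works):

```julia
using MPI

# Use the launcher matching the configured MPI binary (see the MPIPreferences note above).
MPI.mpiexec() do mpirun
    run(`$mpirun -n 8 $(Base.julia_cmd()) --project=. benchmark_dbc.jl`)
end
```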
109 changes: 109 additions & 0 deletions scripts_future_API/benchmark_diffusion_2D.jl
@@ -0,0 +1,109 @@
using FastIce.Grids
using FastIce.GridOperators
using FastIce.Fields
using FastIce.Architectures
using FastIce.BoundaryConditions
using FastIce.Distributed
using FastIce.KernelLaunch

using KernelAbstractions
using MPI

using Plots

@kernel function update_C!(C, qC, dt, Δ, offset=nothing)
I = @index(Global, Cartesian)
isnothing(offset) || (I += offset)
@inbounds if checkbounds(Bool, C, I)
C[I] -= dt * (∂ᶜx(qC.x, I) / Δ.x +
∂ᶜy(qC.y, I) / Δ.y)
end
end

@kernel function update_qC!(qC, C, dc, Δ, offset=nothing)
I = @index(Global, Cartesian)
isnothing(offset) || (I += offset)
@inbounds if checkbounds(Bool, qC.x, I)
qC.x[I] = -dc * ∂ᵛx(C, I) / Δ.x
end
@inbounds if checkbounds(Bool, qC.y, I)
qC.y[I] = -dc * ∂ᵛy(C, I) / Δ.y
end
end

function diffusion_2D(ka_backend=CPU())
# setup arch
arch = Architecture(ka_backend, (0, 0))
topo = details(arch)
# physics
lx, ly = 10.0, 10.0
dc = 1
# numerics
size_g = (32, 32)
nt = 1000
# preprocessing
size_g = global_grid_size(topo, size_g)
global_grid = CartesianGrid(; origin=(-0.5lx, -0.5ly),
extent=(lx, ly),
size=size_g)
grid = local_grid(global_grid, topo)
Δ = NamedTuple{(:x, :y)}(spacing(global_grid))
dt = minimum(Δ)^2 / dc / ndims(grid) / 2.1
hide_boundaries = HideBoundaries{ndims(grid)}(arch)
outer_width = (4, 4)
# fields
C = Field(arch, grid, Center(); halo=1)
qC = (x = Field(arch, grid, (Vertex(), Center()); halo=1),
y = Field(arch, grid, (Center(), Vertex()); halo=1))
C_g = if global_rank(topo) == 0
KernelAbstractions.allocate(Architectures.backend(arch), eltype(C), size_g)
else
nothing
end
# initial condition
foreach(comp -> fill!(parent(comp), 0.0), qC)
# fill!(parent(C), me)
set!(C, grid, (x, y) -> exp(-x^2 - y^2))
# set!(C, me)
# boundary conditions
zero_flux_bc = DirichletBC{FullCell}(0.0)
bc_q = (x = BoundaryConditionsBatch((qC.x, qC.y), (zero_flux_bc, nothing)),
y = BoundaryConditionsBatch((qC.x, qC.y), (nothing, zero_flux_bc)))
# zero flux at physical boundaries and nothing at MPI boundaries
bc_q = override_boundary_conditions(arch, ((bc_q.x, bc_q.x), (bc_q.y, bc_q.y)); exchange=true)
# nothing at physical boundaries and communication at MPI boundaries
bc_c = BoundaryConditionsBatch((C,), nothing)
bc_c = override_boundary_conditions(arch, ((bc_c, bc_c), (bc_c, bc_c)); exchange=true)
for D in ndims(grid):-1:1
apply_boundary_conditions!(Val(1), Val(D), arch, grid, bc_c[D][1])
apply_boundary_conditions!(Val(2), Val(D), arch, grid, bc_c[D][2])
apply_boundary_conditions!(Val(1), Val(D), arch, grid, bc_q[D][1])
apply_boundary_conditions!(Val(2), Val(D), arch, grid, bc_q[D][2])
end
# time loop
if global_rank(topo) == 0
anim = Animation()
end
for it in 1:nt
(global_rank(topo) == 0) && println("it = $it")
launch!(arch, grid, update_qC! => (qC, C, dc, Δ); location=Vertex(), hide_boundaries, boundary_conditions=bc_q, outer_width)
launch!(arch, grid, update_C! => (C, qC, dt, Δ); location=Center(), expand=1)
synchronize(Architectures.backend(arch))
if it % 5 == 0
gather!(arch, C_g, C)
if global_rank(topo) == 0
heatmap(C_g; aspect_ratio=1, size=(600, 600), clims=(0, 1))
frame(anim)
end
end
end
if global_rank(topo) == 0
gif(anim, "C.gif")
end

return
end

MPI.Init()
diffusion_2D()
MPI.Finalize()