diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 00000000..85a6f88b --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,7 @@ +style = "yas" +margin = 140 +align_assignment = true +whitespace_ops_in_indices = false +import_to_using = false +pipe_to_function_call = false +always_use_return = false diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 79e07aba..92ee9d2a 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -28,7 +28,7 @@ steps: matrix: setup: version: - - "1.9" + - "1.10" plugins: - JuliaCI/julia#v1: version: "{{matrix.version}}" @@ -45,10 +45,11 @@ steps: agents: queue: "juliagpu" rocm: "*" + rocmgpu: "gfx1101" timeout_in_minutes: 120 soft_fail: - exit_status: 3 env: JULIA_NUM_THREADS: 4 env: - SECRET_CODECOV_TOKEN: "0IoqMRJlTdzvkxpJfv/d4uQBzH0u5Odph6JiQLEASjdh7OPCxmy8ADN7tRPYECguthAFTVnsKeIWpgCyvaJcLY6+sFqlYraL0XOGGX/BCrBQfRvMNKfY8WRf6Hc3NFCyHqFkONFYbxYnFbpXYtdZKbfWDkRHB0bu2JqCbzhN2Yk29dmj2PZPAtUkM+0Uab7cDEzfM/FDwOEssm8bnR/HQRe02DASAyxQGVxcnSZJGZr9IWiPLq6a5qyvN7tkk6FnkMbobwkA48L2fffZQCQF/jlIxc4/yOk9r7P9RVTjWIoSxA59mfuUbKlVHokvXwlVvNS9gXbGOf9gqabfyjcqUA==;U2FsdGVkX19S+m5lHSaFCpYeyDqSxPrqJ9OGWCWUTNDao2X1lzTtCEYQG7YI4abf+9pMnp2msk8JAuw2W7ugQQ==" \ No newline at end of file + SECRET_CODECOV_TOKEN: "0IoqMRJlTdzvkxpJfv/d4uQBzH0u5Odph6JiQLEASjdh7OPCxmy8ADN7tRPYECguthAFTVnsKeIWpgCyvaJcLY6+sFqlYraL0XOGGX/BCrBQfRvMNKfY8WRf6Hc3NFCyHqFkONFYbxYnFbpXYtdZKbfWDkRHB0bu2JqCbzhN2Yk29dmj2PZPAtUkM+0Uab7cDEzfM/FDwOEssm8bnR/HQRe02DASAyxQGVxcnSZJGZr9IWiPLq6a5qyvN7tkk6FnkMbobwkA48L2fffZQCQF/jlIxc4/yOk9r7P9RVTjWIoSxA59mfuUbKlVHokvXwlVvNS9gXbGOf9gqabfyjcqUA==;U2FsdGVkX19S+m5lHSaFCpYeyDqSxPrqJ9OGWCWUTNDao2X1lzTtCEYQG7YI4abf+9pMnp2msk8JAuw2W7ugQQ==" diff --git a/Project.toml b/Project.toml index 618e7a90..1977a8c5 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "FastIce" uuid = "e0de9f13-a007-490e-b696-b07d031015ca" authors = ["Ludovic Raess , Ivan Utkin and contributors"] -version = "0.1.0" +version = "0.2.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" @@ -13,19 +13,27 @@ LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" -MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" [compat] Adapt = "3" +AMDGPU = "0.7" +CUDA = "5" ElasticArrays = "1" GeometryBasics = "0.4" HDF5 = "0.17" KernelAbstractions = "0.9" LightXML = "0.9" MPI = "0.20" -MPIPreferences = "0.1" OffsetArrays = "1" -Preferences = "1" \ No newline at end of file +Preferences = "1" + +[weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" + +[extensions] +FastIceCUDAExt = "CUDA" +FastIceAMDGPUExt = "AMDGPU" diff --git a/README.md b/README.md index eec2c53a..b92d5422 100644 --- a/README.md +++ b/README.md @@ -8,4 +8,4 @@ Parallel multi-xPU iterative **FastIce** flow solvers FastIce is currently under active development in order to run at scale on the LUMI AMD-powered GPU supercomputer within the EuroHPC [**STREAM** project](https://ptsolvers.github.io/GPU4GEO/stream/). -Checkout the non-existing [documentation](https://PTsolvers.github.io/FastIce.jl/dev). +Checkout the [documentation](https://PTsolvers.github.io/FastIce.jl/dev) for API reference and usage. 
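A note on the `[weakdeps]`/`[extensions]` entries added to `Project.toml` above: CUDA and AMDGPU are no longer unconditional dependencies. On Julia 1.9+ the extension modules `FastIceCUDAExt`/`FastIceAMDGPUExt` (see `ext/` below) are compiled and loaded automatically as soon as the corresponding GPU package is loaded next to FastIce. A minimal sketch of the resulting workflow, assuming only the method definitions visible in the extension files of this diff (`get_device` and `heuristic_groupsize` are called fully qualified because their export status is not shown here; `set_device!` is used unqualified in the scripts below):

```julia
using FastIce                  # core package; pulls in no GPU stack
using CUDA                     # loading CUDA next to FastIce activates FastIceCUDAExt
using FastIce.Architectures

# get_device maps a 1-based device id to the zero-based CuDevice,
# per the definition in ext/FastIceCUDAExt
dev = FastIce.Architectures.get_device(CUDABackend(), 1)  # CuDevice(0)
set_device!(dev)                                          # forwards to CUDA.device!
FastIce.Architectures.heuristic_groupsize(dev, Val(3))    # (32, 8, 1)
```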
diff --git a/docs/make.jl b/docs/make.jl
index ff0dfb53..a6eb60dd 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -5,11 +5,13 @@ push!(LOAD_PATH,"../src/")
 makedocs(
     sitename = "FastIce",
-    authors="Ludovic Räss, Ivan utkin and contributors",
+    authors="Ludovic Räss, Ivan Utkin and contributors",
     format = Documenter.HTML(; prettyurls=get(ENV, "CI", nothing) == "true"), # easier local build
     modules = [FastIce],
     pages=[
         "Home" => "index.md",
+        "Usage" => "usage.md",
+        "Library" => "library.md"
     ]
 )
diff --git a/docs/src/library.md b/docs/src/library.md
new file mode 100644
index 00000000..73037e89
--- /dev/null
+++ b/docs/src/library.md
@@ -0,0 +1,10 @@
+# Library
+
+## Modules
+
+### Grids
+
+```@autodocs
+Modules = [FastIce.Grids, FastIce.Distributed]
+Order = [:type, :function]
+```
diff --git a/docs/src/usage.md b/docs/src/usage.md
new file mode 100644
index 00000000..7890ed66
--- /dev/null
+++ b/docs/src/usage.md
@@ -0,0 +1,29 @@
+# Usage
+
+## Running tests
+
+### CPU tests
+
+To run the FastIce test suite on the CPU, simply run `test` from the package mode or use `Pkg`:
+```julia-repl
+using Pkg
+Pkg.test("FastIce")
+```
+
+### GPU tests
+
+To run the FastIce test suite on the CUDA or ROC backend (Nvidia or AMD GPUs, respectively), run the tests using `Pkg`, adding the following `test_args`:
+
+#### For the CUDA backend (Nvidia GPU):
+
+```julia-repl
+using Pkg
+Pkg.test("FastIce"; test_args=["--backend=CUDA"])
+```
+
+#### For the ROC backend (AMD GPU):
+
+```julia-repl
+using Pkg
+Pkg.test("FastIce"; test_args=["--backend=AMDGPU"])
+```
diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl
deleted file mode 100644
index 5bb5f5a7..00000000
--- a/ext/AMDGPUExt/AMDGPUExt.jl
+++ /dev/null
@@ -1,14 +0,0 @@
-module AMDGPUExt
-
-using AMDGPU
-using KernelAbstractions
-
-using FastIce.Architecture
-
-set_device!(dev::HIPDevice) = device!(dev)
-
-heuristic_groupsize(::ROCBackend, ::Val{1}) = (256, )
-heuristic_groupsize(::ROCBackend, ::Val{2}) = (128, 2, )
-heuristic_groupsize(::ROCBackend, ::Val{3}) = (128, 2, 1, )
-
-end
\ No newline at end of file
diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl
deleted file mode 100644
index 27f8841f..00000000
--- a/ext/CUDAExt/CUDAExt.jl
+++ /dev/null
@@ -1,14 +0,0 @@
-module CUDAExt
-
-using CUDA
-using KernelAbstractions
-
-using FastIce.Architecture
-
-set_device!(dev::CuDevice) = device!(dev)
-
-heuristic_groupsize(::CUDABackend, ::Val{1}) = (256, )
-heuristic_groupsize(::CUDABackend, ::Val{2}) = (32, 8, )
-heuristic_groupsize(::CUDABackend, ::Val{3}) = (32, 8, 1, )
-
-end
\ No newline at end of file
diff --git a/ext/FastIceAMDGPUExt/FastIceAMDGPUExt.jl b/ext/FastIceAMDGPUExt/FastIceAMDGPUExt.jl
new file mode 100644
index 00000000..30e5a159
--- /dev/null
+++ b/ext/FastIceAMDGPUExt/FastIceAMDGPUExt.jl
@@ -0,0 +1,14 @@
+module FastIceAMDGPUExt
+
+using FastIce, AMDGPU, AMDGPU.ROCKernels
+import FastIce.Architectures: heuristic_groupsize, set_device!, get_device
+
+set_device!(dev::HIPDevice) = AMDGPU.device!(dev)
+
+get_device(::ROCBackend, id::Integer) = HIPDevice(id)
+
+heuristic_groupsize(::HIPDevice, ::Val{1}) = (256, )
+heuristic_groupsize(::HIPDevice, ::Val{2}) = (128, 2, )
+heuristic_groupsize(::HIPDevice, ::Val{3}) = (128, 2, 1, )
+
+end
diff --git a/ext/FastIceCUDAExt/FastIceCUDAExt.jl b/ext/FastIceCUDAExt/FastIceCUDAExt.jl
new file mode 100644
index 00000000..9918672e
--- /dev/null
+++ b/ext/FastIceCUDAExt/FastIceCUDAExt.jl
@@ -0,0 +1,15 @@
+module FastIceCUDAExt
+
+using FastIce, CUDA, CUDA.CUDAKernels
+
+import 
FastIce.Architectures: heuristic_groupsize, set_device!, get_device + +set_device!(dev::CuDevice) = CUDA.device!(dev) + +get_device(::CUDABackend, id::Integer) = CuDevice(id - 1) + +heuristic_groupsize(::CuDevice, ::Val{1}) = (256,) +heuristic_groupsize(::CuDevice, ::Val{2}) = (32, 8) +heuristic_groupsize(::CuDevice, ::Val{3}) = (32, 8, 1) + +end diff --git a/scripts_future_API/Project.toml b/scripts_future_API/Project.toml index 54f6d3a4..6d8438c2 100644 --- a/scripts_future_API/Project.toml +++ b/scripts_future_API/Project.toml @@ -1,5 +1,6 @@ [deps] +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" FastIce = "e0de9f13-a007-490e-b696-b07d031015ca" +GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" -MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" diff --git a/scripts_future_API/benchmark_dbc.jl b/scripts_future_API/benchmark_dbc.jl new file mode 100644 index 00000000..ba40cadd --- /dev/null +++ b/scripts_future_API/benchmark_dbc.jl @@ -0,0 +1,56 @@ +using FastIce.Architectures +using FastIce.Distributed +using FastIce.Fields +using FastIce.Grids +using FastIce.BoundaryConditions +using FastIce.KernelLaunch + +using KernelAbstractions +using MPI + +@kernel function fill_field!(f, val, offset=nothing) + I = @index(Global, Cartesian) + if !isnothing(offset) + I += offset + end + f[I] = val +end + +MPI.Init() + +arch = Architecture(CPU(), (2, 2, 2)) +grid = CartesianGrid(; origin=(0.0, 0.0, 0.0), extent=(1.0, 1.0, 1.0), size=(5, 7, 5)) +field = Field(backend(arch), grid, (Center(), Center(), Center()); halo=1) + +me = global_rank(details(arch)) + +fill!(parent(field), Inf) + +bc = BoundaryConditionsBatch((field,), (DirichletBC{FullCell}(-me-10),)) + +boundary_conditions = override_boundary_conditions(arch, ((bc, bc), (bc, bc), (bc, bc)); exchange=true) + +hide_boundaries = HideBoundaries{3}(arch) + +outer_width = (2, 2, 2) + +launch!(arch, grid, fill_field! 
=> (field, me); location=location(field), hide_boundaries, boundary_conditions, outer_width) + +# sleep(0.25me) +# @show coordinates(details(arch)) +# display(parent(field)) + +field_g = if global_rank(details(arch)) == 0 + KernelAbstractions.allocate(Architectures.backend(arch), eltype(field), dimensions(details(arch)) .* size(field)) +else + nothing +end + +gather!(arch, field_g, field) + +if global_rank(details(arch)) == 0 + println("global matrix:") + display(field_g) +end + +MPI.Finalize() diff --git a/scripts_future_API/benchmark_diffusion_2D.jl b/scripts_future_API/benchmark_diffusion_2D.jl new file mode 100644 index 00000000..8f920a29 --- /dev/null +++ b/scripts_future_API/benchmark_diffusion_2D.jl @@ -0,0 +1,109 @@ +using FastIce.Grids +using FastIce.GridOperators +using FastIce.Fields +using FastIce.Architectures +using FastIce.BoundaryConditions +using FastIce.Distributed +using FastIce.KernelLaunch + +using KernelAbstractions +using MPI + +using Plots + +@kernel function update_C!(C, qC, dt, Δ, offset=nothing) + I = @index(Global, Cartesian) + isnothing(offset) || (I += offset) + @inbounds if checkbounds(Bool, C, I) + C[I] -= dt * (∂ᶜx(qC.x, I) / Δ.x + + ∂ᶜy(qC.y, I) / Δ.y) + end +end + +@kernel function update_qC!(qC, C, dc, Δ, offset=nothing) + I = @index(Global, Cartesian) + isnothing(offset) || (I += offset) + @inbounds if checkbounds(Bool, qC.x, I) + qC.x[I] = -dc * ∂ᵛx(C, I) / Δ.x + end + @inbounds if checkbounds(Bool, qC.y, I) + qC.y[I] = -dc * ∂ᵛy(C, I) / Δ.y + end +end + +function diffusion_2D(ka_backend=CPU()) + # setup arch + arch = Architecture(ka_backend, (0, 0)) + topo = details(arch) + # physics + lx, ly = 10.0, 10.0 + dc = 1 + # numerics + size_g = (32, 32) + nt = 1000 + # preprocessing + size_g = global_grid_size(topo, size_g) + global_grid = CartesianGrid(; origin=(-0.5lx, -0.5ly), + extent=(lx, ly), + size=size_g) + grid = local_grid(global_grid, topo) + Δ = NamedTuple{(:x, :y)}(spacing(global_grid)) + dt = minimum(Δ)^2 / dc / ndims(grid) / 2.1 + hide_boundaries = HideBoundaries{ndims(grid)}(arch) + outer_width = (4, 4) + # fields + C = Field(arch, grid, Center(); halo=1) + qC = (x = Field(arch, grid, (Vertex(), Center()); halo=1), + y = Field(arch, grid, (Center(), Vertex()); halo=1)) + C_g = if global_rank(topo) == 0 + KernelAbstractions.allocate(Architectures.backend(arch), eltype(C), size_g) + else + nothing + end + # initial condition + foreach(comp -> fill!(parent(comp), 0.0), qC) + # fill!(parent(C), me) + set!(C, grid, (x, y) -> exp(-x^2 - y^2)) + # set!(C, me) + # boundary conditions + zero_flux_bc = DirichletBC{FullCell}(0.0) + bc_q = (x = BoundaryConditionsBatch((qC.x, qC.y), (zero_flux_bc, nothing)), + y = BoundaryConditionsBatch((qC.x, qC.y), (nothing, zero_flux_bc))) + # zero flux at physical boundaries and nothing at MPI boundaries + bc_q = override_boundary_conditions(arch, ((bc_q.x, bc_q.x), (bc_q.y, bc_q.y)); exchange=true) + # nothing at physical boundaries and communication at MPI boundaries + bc_c = BoundaryConditionsBatch((C,), nothing) + bc_c = override_boundary_conditions(arch, ((bc_c, bc_c), (bc_c, bc_c)); exchange=true) + for D in ndims(grid):-1:1 + apply_boundary_conditions!(Val(1), Val(D), arch, grid, bc_c[D][1]) + apply_boundary_conditions!(Val(2), Val(D), arch, grid, bc_c[D][2]) + apply_boundary_conditions!(Val(1), Val(D), arch, grid, bc_q[D][1]) + apply_boundary_conditions!(Val(2), Val(D), arch, grid, bc_q[D][2]) + end + # time loop + if global_rank(topo) == 0 + anim = Animation() + end + for it in 1:nt + (global_rank(topo) == 
0) && println("it = $it") + launch!(arch, grid, update_qC! => (qC, C, dc, Δ); location=Vertex(), hide_boundaries, boundary_conditions=bc_q, outer_width) + launch!(arch, grid, update_C! => (C, qC, dt, Δ); location=Center(), expand=1) + synchronize(Architectures.backend(arch)) + if it % 5 == 0 + gather!(arch, C_g, C) + if global_rank(topo) == 0 + heatmap(C_g; aspect_ratio=1, size=(600, 600), clims=(0, 1)) + frame(anim) + end + end + end + if global_rank(topo) == 0 + gif(anim, "C.gif") + end + + return +end + +MPI.Init() +diffusion_2D() +MPI.Finalize() diff --git a/scripts_future_API/benchmark_diffusion_3D.jl b/scripts_future_API/benchmark_diffusion_3D.jl new file mode 100644 index 00000000..ebcb661e --- /dev/null +++ b/scripts_future_API/benchmark_diffusion_3D.jl @@ -0,0 +1,147 @@ +using FastIce.Grids +using FastIce.GridOperators +using FastIce.Fields +using FastIce.Architectures +using FastIce.BoundaryConditions +using FastIce.KernelLaunch + +using KernelAbstractions +using AMDGPU + +using FastIce.Distributed +using MPI + +using Printf +using Plots + +# @inline isin(Field) = (I <= CartesianIndex(size(grid, location(Field)))) + +@kernel function update_C!(C, qC, dt, Δ, offset=nothing) + I = @index(Global, Cartesian) + isnothing(offset) || (I += offset) + @inbounds if checkbounds(Bool, C, I) + C[I] -= dt * (∂ᶜx(qC.x, I) / Δ.x + + ∂ᶜy(qC.y, I) / Δ.y + + ∂ᶜz(qC.z, I) / Δ.z) + end +end + +@kernel function update_qC!(qC, C, dc, Δ, offset=nothing) + I = @index(Global, Cartesian) + isnothing(offset) || (I += offset) + @inbounds if checkbounds(Bool, qC.x, I) + qC.x[I] = -dc * ∂ᵛx(C, I) / Δ.x + end + @inbounds if checkbounds(Bool, qC.y, I) + qC.y[I] = -dc * ∂ᵛy(C, I) / Δ.y + end + @inbounds if checkbounds(Bool, qC.z, I) + qC.z[I] = -dc * ∂ᵛz(C, I) / Δ.z + end +end + +function compute(arch, grid, hide_boundaries, bc_q, bc_c, outer_width, qC, C, dc, dt, Δ, iters) + tic = time_ns() + for _ in 1:iters + launch!(arch, grid, update_qC! => (qC, C, dc, Δ); location=Vertex(), hide_boundaries, boundary_conditions=bc_q, outer_width) + launch!(arch, grid, update_C! 
=> (C, qC, dt, Δ); location=Center(), hide_boundaries, boundary_conditions=bc_c, outer_width) + synchronize(Architectures.backend(arch)) + end + wtime = (time_ns() - tic) * 1e-9 + return wtime +end + +function diffusion_3D(ka_backend=CPU(), dTyp::DataType=Float64, dims=(0, 0, 0); do_visu=true) + # setup arch + arch = Architecture(ka_backend, dims) + topo = details(arch) + set_device!(arch) + me = global_rank(topo) + comm = cartesian_communicator(topo) + # physics + lx, ly, lz = 10.0, 10.0, 10.0 + dc = 1 + # numerics + size = (1023, 1023, 1023) + nt = 100 + iters, warmup = 20, 5 + # preprocessing + size_g = global_grid_size(topo, size) + global_grid = CartesianGrid(; origin=(-0.5lx, -0.5ly, -0.5lz), + extent=(lx, ly, lz), + size=size_g) + grid = local_grid(global_grid, topo) + Δ = NamedTuple{(:x, :y, :z)}(spacing(global_grid)) + dt = minimum(Δ)^2 / dc / ndims(grid) / 2.1 + hide_boundaries = HideBoundaries{ndims(grid)}(arch) + outer_width = (128, 32, 4) + # fields + C = Field(arch, grid, Center(), dTyp; halo=1) + qC = (x = Field(arch, grid, (Vertex(), Center(), Center()), dTyp), + y = Field(arch, grid, (Center(), Vertex(), Center()), dTyp), + z = Field(arch, grid, (Center(), Center(), Vertex()), dTyp)) + # initial condition + foreach(comp -> set!(comp, 0.0), qC) + set!(C, grid, (x, y, z) -> exp(-x^2 - y^2 - z^2)) + # boundary conditions + zero_flux_bc = DirichletBC{FullCell}(0.0) + # zero flux at physical boundaries and nothing at MPI boundaries + bc_q = ntuple(Val(3)) do D + ntuple(Val(2)) do S + has_neighbor(topo, D, S) ? nothing : BoundaryConditionsBatch((qC[D],), (zero_flux_bc,)) + end + end + # nothing at physical boundaries and communication at MPI boundaries + bc_c = ntuple(Val(3)) do D + ntuple(Val(2)) do S + has_neighbor(topo, D, S) ? BoundaryConditionsBatch((C,), (ExchangeInfo(Val(S), Val(D), C),)) : nothing + end + end + KernelLaunch.apply_all_boundary_conditions!(arch, grid, bc_c) + # # time loop + # for it in 1:nt + # (me == 0) && println("it = $it") + # launch!(arch, grid, update_qC! => (qC, C, dc, Δ); location=Vertex(), hide_boundaries, boundary_conditions=bc_q, outer_width) + # launch!(arch, grid, update_C! => (C, qC, dt, Δ); location=Center(), hide_boundaries, boundary_conditions=bc_c, outer_width) + # synchronize(Architectures.backend(arch)) + # end + + # measure perf + # warmup + compute(arch, grid, hide_boundaries, bc_q, bc_c, outer_width, qC, C, dc, dt, Δ, warmup) + # time + MPI.Barrier(comm) + for ex in 1:5 + (me == 0) && (sleep(2); println("Experiment = $ex")) + MPI.Barrier(comm) + wtime = compute(arch, grid, hide_boundaries, bc_q, bc_c, outer_width, qC, C, dc, dt, Δ, (iters - warmup)) + # report + A_eff = 8 / 1e9 * prod(size) * sizeof(dTyp) + wtime_it = wtime ./ (iters - warmup) + T_eff = A_eff ./ wtime_it + @printf(" Executed %d steps in = %1.3e sec (@ T_eff = %1.2f GB/s - device %s) \n", (iters - warmup), wtime, + round(T_eff, sigdigits=3), AMDGPU.device_id(AMDGPU.device())) + end + + if do_visu + ENV["GKSwstype"] = "nul" + C_g = (me == 0) ? 
KernelAbstractions.allocate(CPU(), eltype(C), size_g) : nothing + C_v = Array(C) + gather!(arch, C_g, C_v) + if me == 0 + p1 = heatmap(xcenters(global_grid), ycenters(global_grid), C_g[:, :, size_g[3]÷2]; + aspect_ratio=1, xlims=extrema(xcenters(global_grid)), ylims=extrema(ycenters(global_grid))) + png(p1, "C.png") + end + end + + return +end + +ka_backend = ROCBackend() +T::DataType = Float64 +dims = (0, 0, 0) + +MPI.Init() +diffusion_3D(ka_backend, T, dims; do_visu=false) +MPI.Finalize() diff --git a/scripts_future_API/exchanger.jl b/scripts_future_API/exchanger.jl deleted file mode 100644 index 2080d268..00000000 --- a/scripts_future_API/exchanger.jl +++ /dev/null @@ -1,160 +0,0 @@ -using KernelAbstractions -using MPI -using CUDA - -include("mpi_utils.jl") - -mutable struct Exchanger - @atomic done::Bool - top::Base.Event - bottom::Base.Event - @atomic err - task::Task - - function Exchanger(f::F, backend::Backend) where F - top = Base.Event(#=autoreset=# true) - bottom = Base.Event(#=autoreset=# true) - this = new(false, top, bottom, nothing) - - this.task = Threads.@spawn begin - KernelAbstractions.priority!(backend, :high) - try - while !(@atomic this.done) - wait(top) - f() - notify(bottom) - end - catch err - @atomic this.done = true - @atomic this.err = err - end - end - errormonitor(this.task) - return this - end -end - -setdone!(exc::Exchanger) = @atomic exc.done = true - -Base.isdone(exc::Exchanger) = @atomic exc.done -function Base.notify(exc::Exchanger) - if !(@atomic exc.done) - notify(exc.top) - else - error("notify: Exchanger is not running") - end -end -function Base.wait(exc::Exchanger) - if !(@atomic exc.done) - wait(exc.bottom) - else - error("wait: Exchanger is not running") - end -end - -# TODO: Implement in MPI.jl -function cooperative_test!(req) - done = false - while !done - done, _ = MPI.Test(req, MPI.Status) - yield() - end -end - -@kernel function do_work!(A, me, offset) - ix, iy, iz = @index(Global, NTuple) - ix += offset[1] - 1 - iy += offset[2] - 1 - iz += offset[3] - 1 - for _ in 1:10 - # if (ix > 1 && ix < size(A, 1)) && - # (iy > 1 && iy < size(A, 2)) && - # (iz > 1 && iz < size(A, 3)) - A[ix, iy, iz] = me - # end - end -end - -backend = CUDABackend() -T::DataType = Int -dims = (0, 0, 0) - -# function main(backend = CPU(), T::DataType = Float64, dims = (0, 0, 0)) -MPI.Init() - -nprocs = MPI.Comm_size(MPI.COMM_WORLD) - -dims = Tuple(MPI.Dims_create(nprocs, dims)) - -# create MPI communicator -comm = MPI.Cart_create(MPI.COMM_WORLD, dims) -me = MPI.Comm_rank(comm) -neighbors = ntuple(Val(length(dims))) do idim - MPI.Cart_shift(comm, idim-1, 1) -end -coords = Tuple(MPI.Cart_coords(comm)) -# create communicator for the node and select device -comm_node = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, me) -pid = MPI.Comm_rank(comm_node) -CUDA.device!(pid) - -nx, ny, nz = 6, 6, 6 -bx, by, bz = 2, 2, 2 -A = KernelAbstractions.allocate(backend, T, nx, ny, nz) -fill!(A, -1) - -ranges = split_ndrange(A, (bx, by, bz)) - -get_recv_view(::Val{1}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? 1 : Colon(), Val(ndims(A)))...) -get_recv_view(::Val{2}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? size(A, D) : Colon(), Val(ndims(A)))...) - -get_send_view(::Val{1}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? 2 : Colon(), Val(ndims(A)))...) -get_send_view(::Val{2}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? size(A, D) - 1 : Colon(), Val(ndims(A)))...) 
- -exchangers = ntuple(Val(length(neighbors))) do dim - ntuple(2) do side - Exchanger(backend) do - rank = neighbors[dim][side] - if rank != -1 - recv_buf = get_recv_view(Val(side), Val(dim), A) - recv = MPI.Irecv!(recv_buf,comm;source=rank) - end - - I = 2*(dim-1) + side - - do_work!(backend, 256)(A, me, first(ranges[I]); ndrange=size(ranges[I])) - KernelAbstractions.synchronize(backend) - - if rank != -1 - send_buf = get_send_view(Val(side), Val(dim), A) - send = MPI.Isend(send_buf,comm;dest=rank) - cooperative_test!(recv) - cooperative_test!(send) - end - end - end -end - -do_work!(backend, 256)(A, me, first(ranges[end]); ndrange=size(ranges[end])) - -for dim in reverse(eachindex(neighbors)) - notify.(exchangers[dim]) - wait.(exchangers[dim]) -end - -KernelAbstractions.synchronize(backend) - -# for dim in eachindex(neighbors) -# setdone!.(exchangers[dim]) -# end - -sleep(me) -@info "me == $me" -display(A) - -MPI.Finalize() - -# return -# end - -# main() \ No newline at end of file diff --git a/scripts_future_API/exchanger2.jl b/scripts_future_API/exchanger2.jl deleted file mode 100644 index 775dec21..00000000 --- a/scripts_future_API/exchanger2.jl +++ /dev/null @@ -1,81 +0,0 @@ -using KernelAbstractions -using MPI - -using AMDGPU - -# using CUDA -# using NVTX - -include("mpi_utils.jl") -include("mpi_utils2.jl") - -@kernel function do_work!(A, me, offset) - ix, iy, iz = @index(Global, NTuple) - ix += offset[1] - 1 - iy += offset[2] - 1 - iz += offset[3] - 1 - for _ in 1:10 - # if (ix > 1 && ix < size(A, 1)) && - # (iy > 1 && iy < size(A, 2)) && - # (iz > 1 && iz < size(A, 3)) - A[ix, iy, iz] = me - # end - end -end - -function main(backend = CPU(), T::DataType = Float64, dims = (0, 0, 0)) - - # numerics - dims, comm, me, neighbors, coords, device = init_distributed(dims; init_MPI=true) - - nx, ny, nz = 6, 6, 6 - b_width = (2, 2, 2) - - # init arrays - A = KernelAbstractions.allocate(backend, T, nx, ny, nz) - fill!(A, -1) - - ranges = split_ndrange(A, b_width) - - exchangers = ntuple(Val(length(neighbors))) do _ - ntuple(_ -> Exchanger(backend, device), Val(2)) - end - - do_work!(backend, 256)(A, me, first(ranges[end]); ndrange=size(ranges[end])) - - for dim in reverse(eachindex(neighbors)) - ntuple(Val(2)) do side - rank = neighbors[dim][side] - halo = get_recv_view(Val(side), Val(dim), A) - border = get_send_view(Val(side), Val(dim), A) - range = ranges[2*(dim-1) + side] - offset, ndrange = first(range), size(range) - start_exchange(exchangers[dim][side], comm, rank, halo, border) do compute_bc - do_work!(backend, 256)(A, me, offset; ndrange) - if compute_bc - # apply_bcs!(Val(dim), fields, bcs.velocity) - end - KernelAbstractions.synchronize(backend) - end - end - wait.(exchangers[dim]) - end - KernelAbstractions.synchronize(backend) - - # for dim in eachindex(neighbors) - # setdone!.(exchangers[dim]) - # end - - sleep(2me) - @info "me == $me" - display(A) - - finalize_distributed(; finalize_MPI=true) - return -end - -backend = ROCBackend() -T::DataType = Int -dims = (0, 0, 1) - -main(backend, T, dims) \ No newline at end of file diff --git a/scripts_future_API/laplacian_3d.jl b/scripts_future_API/laplacian_3d.jl deleted file mode 100644 index e2491ead..00000000 --- a/scripts_future_API/laplacian_3d.jl +++ /dev/null @@ -1,103 +0,0 @@ -using AMDGPU -using KernelAbstractions -using MPI -using Printf -# using BenchmarkTools - -# node_id = parse(Int, ENV["SLURM_NODEID"]) -# task_id = parse(Int, ENV["SLURM_LOCALID"]) - -# println("node $node_id, gpu $task_id") - -# AMDGPU.device_id!(task_id 
+ 1) -# dev_id = AMDGPU.device_id(AMDGPU.device()) -# @show AMDGPU.device() - -@kernel function update_H!(H2, @Const(H), dc, dt, _dx2, _dy2, _dz2) - ix, iy, iz = @index(Global, NTuple) - @inbounds H2[ix+1, iy+1, iz+1] = H[ix+1, iy+1, iz+1] + dt #= * dc * ((H[ix, iy+1, iz+1] - 2H[ix+1, iy+1, iz+1] + H[ix+2, iy+1, iz+1]) * _dx2 + - (H[ix+1, iy, iz+1] - 2H[ix+1, iy+1, iz+1] + H[ix+1, iy+2, iz+1]) * _dy2 + - (H[ix+1, iy+1, iz] - 2H[ix+1, iy+1, iz+1] + H[ix+1, iy+1, iz+2]) * _dz2) =# -end - -function update_H_roc!(H2, H, dc, dt, _dx2, _dy2, _dz2) - ix = (workgroupIdx().x - UInt32(1)) * workgroupDim().x + workitemIdx().x - iy = (workgroupIdx().y - UInt32(1)) * workgroupDim().y + workitemIdx().y - iz = (workgroupIdx().z - UInt32(1)) * workgroupDim().z + workitemIdx().z - if (ix < size(H, 1) - 2 && iy < size(H, 2) - 2 && iz < size(H, 3) - 2) - @inbounds H2[ix+1, iy+1, iz+1] = H[ix+1, iy+1, iz+1] + dt #= * dc * ((H[ix, iy+1, iz+1] - 2H[ix+1, iy+1, iz+1] + H[ix+2, iy+1, iz+1]) * _dx2 + - (H[ix+1, iy, iz+1] - 2H[ix+1, iy+1, iz+1] + H[ix+1, iy+2, iz+1]) * _dy2 + - (H[ix+1, iy+1, iz] - 2H[ix+1, iy+1, iz+1] + H[ix+1, iy+1, iz+2]) * _dz2) =# - end - return -end - -function compute_ka(backend, nx, ny, nz, H2, H, dc, dt, _dx2, _dy2, _dz2, iters) - tic = time_ns() - for _ = 1:iters - update_H!(backend, 256, (nx - 2, ny - 2, nz - 2))(H2, H, dc, dt, _dx2, _dy2, _dz2) - KernelAbstractions.synchronize(backend) - end - wtime = (time_ns() - tic) * 1e-9 - return wtime -end - -function compute_roc(nblocks, nthreads, H2, H, dc, dt, _dx2, _dy2, _dz2, iters) - tic = time_ns() - for _ = 1:iters - AMDGPU.@sync @roc gridsize=nblocks groupsize=nthreads update_H_roc!(H2, H, dc, dt, _dx2, _dy2, _dz2) - end - wtime = (time_ns() - tic) * 1e-9 - return wtime -end - -function run(backend=CPU(), dims=(0, 0, 0); nx=128, ny=128, nz=128, dtype=Float64) - MPI.Init() - nprocs = MPI.Comm_size(MPI.COMM_WORLD) - dims = Tuple(MPI.Dims_create(nprocs, dims)) - # create MPI communicator - comm = MPI.Cart_create(MPI.COMM_WORLD, dims) - me = MPI.Comm_rank(comm) - # create communicator for the node and select device - comm_node = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, me) - dev_id = MPI.Comm_rank(comm_node) - @show device = AMDGPU.device_id!(dev_id + 1) - dev_id = AMDGPU.device_id(AMDGPU.device()) - # physics - dc = 1.0 - _dx2, _dy2, _dz2 = 1.0, 1.0, 1.0 - dt = 1.0 / (_dx2 * dc * 6.1) - # allocate - H = KernelAbstractions.zeros(backend, dtype, nx, ny, nz) - # benchmark - iters, warmup = 35, 5 - nthreads = (256, 1, 1) - nblocks = cld.((nx, ny, nz), nthreads) - H2 = copy(H) - KernelAbstractions.synchronize(backend) - println("Let's fucking gooooooooo!") - - # GC.gc() - # GC.enable(false) - - # compute_ka(backend, nx, ny, nz, H2, H, dc, dt, _dx2, _dy2, _dz2, warmup) - # wtime = compute_ka(backend, nx, ny, nz, H2, H, dc, dt, _dx2, _dy2, _dz2, (iters - warmup)) - - compute_roc(nblocks, nthreads, H2, H, dc, dt, _dx2, _dy2, _dz2, warmup) - wtime = compute_roc(nblocks, nthreads, H2, H, dc, dt, _dx2, _dy2, _dz2, (iters - warmup)) - - # GC.enable(true) - # GC.gc() - - # perf - A_eff = 2 / 2^30 * (nx - 2) * (ny - 2) * (nz - 2) * sizeof(Float64) - # A_eff = 2 / 2^30 * nx * ny * nz * sizeof(Float64) - wtime_it = wtime / (iters - warmup) - T_eff = A_eff / wtime_it - @printf("Executed %d steps in = %1.3e sec (@ T_eff = %1.2f GB/s - device %s) \n", (iters - warmup), wtime, round(T_eff, sigdigits=3), dev_id) - println("Done") - MPI.Finalize() - return -end - -run(ROCBackend(), (0, 0, 0); nx=1024, ny=1024, nz=1024) \ No newline at end of file diff 
--git a/scripts_future_API/mpi_utils.jl b/scripts_future_API/mpi_utils.jl deleted file mode 100644 index 37c20128..00000000 --- a/scripts_future_API/mpi_utils.jl +++ /dev/null @@ -1,27 +0,0 @@ -@inline subrange(nr,bw,I,::Val{1}) = 1:bw[I] -@inline subrange(nr,bw,I,::Val{2}) = (size(nr,I)-bw[I]+1):size(nr,I) -@inline subrange(nr,bw,I,::Val{3}) = (bw[I]+1):(size(nr,I)-bw[I]) - -@inline split_ndrange(ndrange,ndwidth) = split_ndrange(CartesianIndices(ndrange),ndwidth) - -function split_ndrange(ndrange::CartesianIndices{N},ndwidth::NTuple{N,<:Integer}) where N - @assert all(size(ndrange) .> ndwidth.*2) - @inline ndsubrange(I,::Val{J}) where J = ntuple(Val(N)) do idim - if idim < I - 1:size(ndrange,idim) - elseif idim == I - subrange(ndrange,ndwidth,idim,Val(J)) - else - subrange(ndrange,ndwidth,idim,Val(3)) - end - end - ndinner = ntuple(idim -> subrange(ndrange,ndwidth,idim,Val(3)), Val(N)) - return ntuple(Val(2N+1)) do i - if i == 2N+1 - ndrange[ndinner...] - else - idim,idir = divrem(i-1,2) .+ 1 - ndrange[ndsubrange(idim,Val(idir))...] - end - end -end \ No newline at end of file diff --git a/scripts_future_API/mpi_utils2.jl b/scripts_future_API/mpi_utils2.jl deleted file mode 100644 index 10121c9a..00000000 --- a/scripts_future_API/mpi_utils2.jl +++ /dev/null @@ -1,117 +0,0 @@ -# MPI -function init_distributed(dims::Tuple=(0, 0, 0); init_MPI=true) - init_MPI && MPI.Init() - nprocs = MPI.Comm_size(MPI.COMM_WORLD) - dims = Tuple(MPI.Dims_create(nprocs, dims)) - # create MPI communicator - comm = MPI.Cart_create(MPI.COMM_WORLD, dims) - me = MPI.Comm_rank(comm) - neighbors = ntuple(Val(length(dims))) do idim - MPI.Cart_shift(comm, idim-1, 1) - end - coords = Tuple(MPI.Cart_coords(comm)) - # create communicator for the node and select device - comm_node = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, me) - dev_id = MPI.Comm_rank(comm_node) - # @show device = CUDA.device!(dev_id) - # @show device = AMDGPU.device_id!(dev_id + 1) - @show device = AMDGPU.device_id!(dev_id*2 + 1) - return (dims, comm, me, neighbors, coords, device) -end - -function finalize_distributed(; finalize_MPI=true) - finalize_MPI && MPI.Finalize() - return -end - -# exchanger -mutable struct Exchanger - @atomic done::Bool - channel::Channel - bottom::Base.Event - @atomic err - task::Task - - function Exchanger(backend::Backend, device) - channel = Channel() - bottom = Base.Event(true) - - this = new(false, channel, bottom, nothing) - - recv_buf = nothing - send_buf = nothing - - this.task = Threads.@spawn begin - # CUDA.device!(device) - AMDGPU.device!(device) - KernelAbstractions.priority!(backend, :high) - try - while !(@atomic this.done) - f, comm, rank, halo, border = take!(channel) - - has_neighbor = rank != MPI.PROC_NULL - compute_bc = !has_neighbor - - if isnothing(recv_buf) - recv_buf = similar(halo) - send_buf = similar(border) - end - if has_neighbor - recv = MPI.Irecv!(recv_buf, comm; source=rank) - end - f(compute_bc) - if has_neighbor - copyto!(send_buf, border) - KernelAbstractions.synchronize(backend) - send = MPI.Isend(send_buf, comm; dest=rank) - flag = false - while true - test_recv = MPI.Test(recv) - test_send = MPI.Test(send) - if test_recv && !flag - copyto!(halo, recv_buf) - flag = true - end - if test_recv && test_send break end - yield() - end - end - KernelAbstractions.synchronize(backend) - notify(bottom) - end - catch err - @show err - @atomic this.done = true - @atomic this.err = err - end - end - errormonitor(this.task) - return this - end -end - -setdone!(exc::Exchanger) = @atomic exc.done = 
true - -Base.isdone(exc::Exchanger) = @atomic exc.done - -function start_exchange(f, exc::Exchanger, comm, rank, halo, border) - if !(@atomic exc.done) - put!(exc.channel, (f, comm, rank, halo, border)) - else - error("notify: Exchanger is not running") - end -end - -function Base.wait(exc::Exchanger) - if !(@atomic exc.done) - wait(exc.bottom) - else - error("wait: Exchanger is not running") - end -end - -get_recv_view(::Val{1}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? 1 : Colon(), Val(ndims(A)))...) -get_recv_view(::Val{2}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? size(A, D) : Colon(), Val(ndims(A)))...) - -get_send_view(::Val{1}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? 2 : Colon(), Val(ndims(A)))...) -get_send_view(::Val{2}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? size(A, D) - 1 : Colon(), Val(ndims(A)))...) diff --git a/scripts_future_API/profileme.sh b/scripts_future_API/profileme.sh deleted file mode 100755 index dced62b6..00000000 --- a/scripts_future_API/profileme.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -source /users/lurass/scratch/setenv_lumi.sh - -# srun -N1 -n2 --ntasks-per-node=2 --gpus-per-node=2 --gpu-bind=closest profileme.sh - -# srun --cpu-bind=map_cpu:49,57,17,25,1,9,33,41 -N1 -n8 --gpus-per-node=8 profileme.sh - -# julia --project laplacian_3d.jl -julia --project bench3d.jl -# julia --project exchanger2.jl -# julia --project rocmaware.jl - -# ENABLE_JITPROFILING=1 ../../myrocprof --hsa-trace --hip-trace julia --project ./exchanger2.jl - -# export HIP_VISIBLE_DEVICES=$SLURM_LOCALID -# export HIP_VISIBLE_DEVICES="0,2,4,6" - -# ENABLE_JITPROFILING=1 rocprof --hsa-trace --hip-trace -d ./prof_out${SLURM_PROCID} -o ./prof_out${SLURM_PROCID}/results${SLURM_PROCID}.csv julia --project bench3d.jl \ No newline at end of file diff --git a/scripts_future_API/rocmaware.jl b/scripts_future_API/rocmaware.jl deleted file mode 100644 index b2387ab1..00000000 --- a/scripts_future_API/rocmaware.jl +++ /dev/null @@ -1,25 +0,0 @@ -using MPI -using AMDGPU -MPI.Init() -comm = MPI.COMM_WORLD -rank = MPI.Comm_rank(comm) -# select device -comm_l = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, rank) -rank_l = MPI.Comm_rank(comm_l) -device = AMDGPU.device_id!(rank_l+1) -gpu_id = AMDGPU.device_id(AMDGPU.device()) -# select device -size = MPI.Comm_size(comm) -dst = mod(rank+1, size) -src = mod(rank-1, size) -println("rank=$rank rank_loc=$rank_l (gpu_id=$gpu_id - $device), size=$size, dst=$dst, src=$src") -N = 4 -send_mesg = ROCArray{Float64}(undef, N) -recv_mesg = ROCArray{Float64}(undef, N) -fill!(send_mesg, Float64(rank)) -fill!(recv_mesg, Float64(0.0)) -AMDGPU.synchronize() -rank==0 && println("start sending...") -MPI.Sendrecv!(send_mesg, dst, 0, recv_mesg, src, 0, comm) -println("recv_mesg on proc $rank: $recv_mesg") -rank==0 && println("done.") \ No newline at end of file diff --git a/scripts_future_API/run_stokes3D.sh b/scripts_future_API/run_stokes3D.sh new file mode 100755 index 00000000..d7ae7850 --- /dev/null +++ b/scripts_future_API/run_stokes3D.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +module load LUMI/22.08 +module load partition/G +module load rocm/5.3.3 + +export MPICH_GPU_SUPPORT_ENABLED=1 + +julia --project -O3 tm_stokes_mpi_wip.jl diff --git a/scripts_future_API/runme.sh b/scripts_future_API/runme.sh index 45ed56ab..f7c803f1 100755 --- a/scripts_future_API/runme.sh +++ b/scripts_future_API/runme.sh @@ -1,14 +1,23 @@ #!/bin/bash -module load LUMI/22.08 -module load partition/G -module load rocm/5.3.3 +source 
/users/lurass/scratch/setenv_lumi.sh
+# module load LUMI/22.08
+# module load partition/G
+# module load rocm/5.3.3
 
 # ROCm-aware MPI set to 1, else 0
 export MPICH_GPU_SUPPORT_ENABLED=1
-export IGG_ROCMAWARE_MPI=1
-# Needs to know about location of GTL lib
-export LD_PRELOAD=${CRAY_MPICH_ROOTDIR}/gtl/lib/libmpi_gtl_hsa.so
+## basic
+# srun --cpu-bind=map_cpu:49,57,17,25,1,9,33,41 -N1 -n8 --gpus-per-node=8 profileme.sh
-julia --project rocmaware.jl
\ No newline at end of file
+## optimal using only single GCD per MI250x Module
+# srun --cpu-bind=map_cpu:49,17,1,33 -N1 -n1 --gpus-per-node=8 profileme.sh
+# srun --cpu-bind=map_cpu:49,17,1,33 -N4 -n16 --gpus-per-node=8 profileme.sh
+export ROCR_VISIBLE_DEVICES=0,2,4,6
+
+# julia --project benchmark_diffusion_3D.jl
+julia --project --color=yes tm_stokes_mpi_wip.jl
+
+# Profiling
+# ENABLE_JITPROFILING=1 rocprof --hip-trace --hsa-trace -d ./prof_out${SLURM_PROCID} -o ./prof_out${SLURM_PROCID}/results${SLURM_PROCID}.csv julia --project bench3d.jl
diff --git a/scripts_future_API/stokes_ms.jl b/scripts_future_API/stokes_ms.jl
new file mode 100644
index 00000000..1cdf48ed
--- /dev/null
+++ b/scripts_future_API/stokes_ms.jl
@@ -0,0 +1,152 @@
+using Printf
+
+using FastIce
+using FastIce.Architectures
+using FastIce.Grids
+using FastIce.Fields
+using FastIce.Utils
+using FastIce.BoundaryConditions
+using FastIce.Models.FullStokes.Isothermal
+using FastIce.Physics
+using FastIce.KernelLaunch
+
+const VBC = BoundaryCondition{Velocity}
+const TBC = BoundaryCondition{Traction}
+const SBC = BoundaryCondition{Slip}
+
+using LinearAlgebra
+using KernelAbstractions
+using CUDA # required by CUDABackend() in main() below
+
+# manufactured solution for the confined Stokes flow with free-slip boundaries
+# helper functions
+f(ξ, η) = cos(π * ξ) * (η^2 - 1)^2 -
+          cos(π * η) * (ξ^2 - 1)^2
+g(ξ, η) = sin(π * η) * (ξ^2 - 1) * ξ -
+          sin(π * ξ) * (η^2 - 1) * η
+p(ξ, η) = cos(π * η) * (1 - 3 * ξ^2) * 2 -
+          cos(π * ξ) * (1 - 3 * η^2) * 2
+# velocity
+vx(x, y, z) = sin(π * x) * f(y, z)
+vy(x, y, z) = sin(π * y) * f(z, x)
+vz(x, y, z) = sin(π * z) * f(x, y)
+# diagonal deviatoric stress
+τxx(x, y, z, η) = 2 * η * π * cos(π * x) * f(y, z)
+τyy(x, y, z, η) = 2 * η * π * cos(π * y) * f(z, x)
+τzz(x, y, z, η) = 2 * η * π * cos(π * z) * f(x, y)
+# off-diagonal deviatoric stress
+τxy(x, y, z, η) = 4 * η * cos(π * z) * g(x, y)
+τxz(x, y, z, η) = 4 * η * cos(π * y) * g(z, x)
+τyz(x, y, z, η) = 4 * η * cos(π * x) * g(y, z)
+# forcing terms
+ρgx(x, y, z, η) = -2 * η * sin(π * x) * (f(y, z) * π^2 - p(y, z))
+ρgy(x, y, z, η) = -2 * η * sin(π * y) * (f(z, x) * π^2 - p(z, x))
+ρgz(x, y, z, η) = -2 * η * sin(π * z) * (f(x, y) * π^2 - p(x, y))
+
+@views function main()
+    backend = CUDABackend()
+    arch = Architecture(backend, 2)
+    set_device!(arch)
+
+    outer_width = (4, 4, 4)
+
+    # physics
+    η0 = 1.0
+    A0 = 0.5
+
+    # geometry
+    grid = CartesianGrid(; origin=(-1.0, -1.0, -1.0),
+                         extent=(2.0, 2.0, 2.0),
+                         size=(256, 256, 256))
+
+    free_slip = SBC(0.0, 0.0, 0.0)
+    xface = (Vertex(), Center(), Center())
+    yface = (Center(), Vertex(), Center())
+    zface = (Center(), Center(), Vertex())
+
+    boundary_conditions = (x = (free_slip, free_slip),
+                           y = (free_slip, free_slip),
+                           z = (free_slip, free_slip))
+
+    gravity = (x=FunctionField(ρgx, grid, xface; parameters=η0),
+               y=FunctionField(ρgy, grid, yface; parameters=η0),
+               z=FunctionField(ρgz, grid, zface; parameters=η0))
+
+    # numerics
+    niter = 10maximum(size(grid))
+    ncheck = maximum(size(grid))
+
+    # PT params
+    r = 0.7
+    re_mech = 8π
+    lτ_re_m = minimum(extent(grid)) / re_mech
+    vdτ = 
minimum(spacing(grid)) / sqrt(ndims(grid) * 1.1) + θ_dτ = lτ_re_m * (r + 4 / 3) / vdτ + dτ_r = 1.0 / (θ_dτ + 1.0) + nudτ = vdτ * lτ_re_m + + iter_params = (η_rel=1e-1, + Δτ=(Pr=r / θ_dτ, τ=(xx=dτ_r, yy=dτ_r, zz=dτ_r, xy=dτ_r, xz=dτ_r, yz=dτ_r), V=(x=nudτ, y=nudτ, z=nudτ))) + + physics = (rheology=GlensLawRheology(1),) + other_fields = (A=Field(backend, grid, Center()),) + + model = IsothermalFullStokesModel(; + arch, + grid, + physics, + gravity, + boundary_conditions, + iter_params, + outer_width, + other_fields) + + set!(model.fields.Pr, 0.0) + foreach(x -> set!(x, 0.0), model.fields.τ) + foreach(x -> set!(x, 0.0), model.fields.V) + + set!(other_fields.A, A0) + set!(model.fields.η, grid, (grid, loc, I, fields) -> physics.rheology(grid, I, fields); discrete=true, parameters=(model.fields,)) + + KernelLaunch.apply_all_boundary_conditions!(arch, grid, model.boundary_conditions.stress) + KernelLaunch.apply_all_boundary_conditions!(arch, grid, model.boundary_conditions.velocity) + KernelLaunch.apply_all_boundary_conditions!(arch, grid, model.boundary_conditions.rheology) + + for iter in 1:niter + advance_iteration!(model, 0.0, 1.0) + if iter % ncheck == 0 + compute_residuals!(model) + err = (Pr = norm(model.fields.r_Pr, Inf), + Vx = norm(model.fields.r_V.x, Inf), + Vy = norm(model.fields.r_V.y, Inf), + Vz = norm(model.fields.r_V.z, Inf)) + if any(.!isfinite.(values(err))) + error("simulation failed, err = $err") + end + iter_nx = iter / maximum(size(grid)) + @printf(" iter/nx = %.1f, err = [Pr = %1.3e, Vx = %1.3e, Vy = %1.3e, Vz = %1.3e]\n", iter_nx, err...) + end + end + + Vx_m = Field(backend, grid, location(model.fields.V.x)) + Vy_m = Field(backend, grid, location(model.fields.V.y)) + Vz_m = Field(backend, grid, location(model.fields.V.z)) + + set!(Vx_m, grid, vx) + set!(Vy_m, grid, vy) + set!(Vz_m, grid, vz) + + dVx = interior(Vx_m) .- interior(model.fields.V.x) + dVy = interior(Vy_m) .- interior(model.fields.V.y) + dVz = interior(Vz_m) .- interior(model.fields.V.z) + + err = (norm(dVx, Inf) / norm(Vx_m, Inf), + norm(dVy, Inf) / norm(Vy_m, Inf), + norm(dVz, Inf) / norm(Vz_m, Inf)) + + @printf("err = [Vx: %1.3e, Vy: %1.3e, Vz: %1.3e]\n", err...) + + return +end + +main() diff --git a/scripts_future_API/submit.sh b/scripts_future_API/submit.sh new file mode 100644 index 00000000..77ff13fe --- /dev/null +++ b/scripts_future_API/submit.sh @@ -0,0 +1,18 @@ +#!/bin/bash -l +#SBATCH --job-name="FastIce3D" +#SBATCH --output=FastIce3D.%j.o +#SBATCH --error=FastIce3D.%j.e +#SBATCH --time=00:10:00 +#SBATCH --nodes=4 +#SBATCH --ntasks=16 +# #SBATCH --ntasks-per-node=8 # this somehow fails... 
+#SBATCH --gpus-per-node=8 +#SBATCH --partition=standard-g +#SBATCH --account project_465000557 + +# CPU_BIND="map_cpu:49,57,17,25,1,9,33,41" + +# export ROCR_VISIBLE_DEVICES=0,2,4,6 # -> done in runme.sh +CPU_BIND="map_cpu:49,17,1,33" + +srun --cpu-bind=${CPU_BIND} ./runme.sh diff --git a/scripts_future_API/submit_stokes3D.sh b/scripts_future_API/submit_stokes3D.sh new file mode 100755 index 00000000..04615068 --- /dev/null +++ b/scripts_future_API/submit_stokes3D.sh @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH --job-name="FastIce3D" +#SBATCH --output=FastIce3D.%j.o +#SBATCH --error=FastIce3D.%j.e +#SBATCH --time=01:00:00 +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=8 +#SBATCH --gpus-per-node=8 +#SBATCH --partition=standard-g +#SBATCH --account project_465000557 + +# export ROCR_VISIBLE_DEVICES=0,2,4,6 +# srun --cpu-bind=map_cpu:49,17,1,33 ./run_stokes3D.sh + +srun --cpu-bind=map_cpu:49,57,17,25,1,9,33,41 ./run_stokes3D.sh diff --git a/scripts_future_API/test_stokes_ms.jl b/scripts_future_API/test_stokes_ms.jl new file mode 100644 index 00000000..2dea8982 --- /dev/null +++ b/scripts_future_API/test_stokes_ms.jl @@ -0,0 +1,125 @@ +using FastIce +using FastIce.Architectures +using FastIce.Grids +using FastIce.Fields +using FastIce.Utils +using FastIce.BoundaryConditions +using FastIce.Models.FullStokes.Isothermal +using FastIce.Physics +using FastIce.KernelLaunch + +const VBC = BoundaryCondition{Velocity} +const TBC = BoundaryCondition{Traction} +const SBC = BoundaryCondition{Slip} + +using KernelAbstractions +# using AMDGPU + +using FastIce.Distributed +using MPI + +# manufactured solution for the confined Stokes flow with free-slip boundaries +# helper functions +f(ξ, η) = cos(π * ξ) * (η^2 - 1)^2 - + cos(π * η) * (ξ^2 - 1)^2 +g(ξ, η) = sin(π * η) * (ξ^2 - 1) * ξ - + sin(π * ξ) * (η^2 - 1) * η +p(ξ, η) = cos(π * η) * (1 - 3 * ξ^2) * 2 - + cos(π * ξ) * (1 - 3 * η^2) * 2 +# velocity +vx(x, y, z) = sin(π * x) * f(y, z) +vy(x, y, z) = sin(π * y) * f(z, x) +vz(x, y, z) = sin(π * z) * f(x, y) +# diagonal deviatoric stress +τxx(x, y, z, η) = 2 * η * π * cos(π * x) * f(y, z) +τyy(x, y, z, η) = 2 * η * π * cos(π * y) * f(z, x) +τzz(x, y, z, η) = 2 * η * π * cos(π * z) * f(x, y) +# off-diagonal deviatoric stress +τxy(x, y, z, η) = 4 * η * cos(π * z) * g(x, y) +τxz(x, y, z, η) = 4 * η * cos(π * y) * g(z, x) +τyz(x, y, z, η) = 4 * η * cos(π * x) * g(y, z) +# forcing terms +ρgx(x, y, z, η) = -2 * η * sin(π * x) * (f(y, z) * π^2 - p(y, z)) +ρgy(x, y, z, η) = -2 * η * sin(π * y) * (f(z, x) * π^2 - p(z, x)) +ρgz(x, y, z, η) = -2 * η * sin(π * z) * (f(x, y) * π^2 - p(x, y)) + +function main() + MPI.Init() + # architecture + backend = CPU() + dims = (1, 1, 1) + arch = Architecture(backend, dims, MPI.COMM_WORLD) + set_device!(arch) + topo = details(arch) + # geometry + size_l = (64, 64, 64) # local grid size + size_g = global_grid_size(topo, size_l) + if global_rank(topo) == 0 + @show dimensions(topo) + @show size_g + end + grid_g = CartesianGrid(; origin=(-1.0, -1.0, -1.0), + extent=(2.0, 2.0, 2.0), + size=size_g) + + grid_l = local_grid(grid_g, topo) + # physics + η0 = 1.0 + A0 = 0.5 + # boundary conditions + free_slip = SBC(0.0, 0.0, 0.0) + boundary_conditions = (x = (free_slip, free_slip), + y = (free_slip, free_slip), + z = (free_slip, free_slip)) + gravity = (x=0.0, y=0.0, z=0.0) + physics = (rheology=GlensLawRheology(1),) + other_fields = (A=Field(backend, grid_l, Center()),) + # numerics + niter = 20maximum(size(grid_g)) + ncheck = 1maximum(size(grid_g)) + # PT params + r = 0.7 + re_mech = 5π + 
lτ_re_m = minimum(extent(grid_g)) / re_mech + vdτ = minimum(spacing(grid_g)) / sqrt(ndims(grid_g) * 10.1) + θ_dτ = lτ_re_m * (r + 4 / 3) / vdτ + dτ_r = 1.0 / (θ_dτ + 1.0) + nudτ = vdτ * lτ_re_m + # pack PT params + iter_params = (η_rel=1e-1, Δτ=(Pr=r / θ_dτ, τ=(xx=dτ_r, yy=dτ_r, zz=dτ_r, xy=dτ_r, xz=dτ_r, yz=dτ_r), V=(x=nudτ, y=nudτ, z=nudτ))) + # model + model = IsothermalFullStokesModel(; arch, + grid=grid_l, + physics, + gravity, + boundary_conditions, + iter_params, + other_fields) + # init + fill!(parent(model.fields.Pr), 0.0) + foreach(x -> fill!(parent(x), 0.0), model.fields.τ) + foreach(x -> fill!(parent(x), 0.0), model.fields.V) + fill!(parent(other_fields.A), A0) + set!(model.fields.η, grid_l, (grid, loc, I, fields) -> physics.rheology(grid, I, fields); discrete=true, parameters=(model.fields,)) + + KernelLaunch.apply_all_boundary_conditions!(arch, grid_l, model.boundary_conditions.stress) + KernelLaunch.apply_all_boundary_conditions!(arch, grid_l, model.boundary_conditions.velocity) + KernelLaunch.apply_all_boundary_conditions!(arch, grid_l, model.boundary_conditions.rheology) + + if global_rank(topo) == 0 + println("action") + end + + for iter in 1:niter + advance_iteration!(model, 0.0, 1.0) + if (iter % ncheck == 0) && (global_rank(topo) == 0) + println("iter/nx = $(iter/maximum(size(grid_g)))") + end + end + + KernelAbstractions.synchronize(backend) + MPI.Finalize() + return +end + +main() diff --git a/scripts_future_API/tm_stokes_mpi_wip.jl b/scripts_future_API/tm_stokes_mpi_wip.jl index 5baf8087..b2cda295 100644 --- a/scripts_future_API/tm_stokes_mpi_wip.jl +++ b/scripts_future_API/tm_stokes_mpi_wip.jl @@ -1,129 +1,273 @@ using FastIce +using FastIce.Architectures using FastIce.Grids using FastIce.Fields using FastIce.Utils using FastIce.BoundaryConditions using FastIce.Models.FullStokes.Isothermal using FastIce.Physics -using FastIce.Distributed +using FastIce.KernelLaunch -using MPI +const VBC = BoundaryCondition{Velocity} +const TBC = BoundaryCondition{Traction} +const SBC = BoundaryCondition{Slip} +using LinearAlgebra, Printf using KernelAbstractions +# using AMDGPU + +using FastIce.Distributed +using MPI + +using CairoMakie + +norm_g(A) = (sum2_l = sum(interior(A) .^ 2); sqrt(MPI.Allreduce(sum2_l, MPI.SUM, MPI.COMM_WORLD))) +max_abs_g(A) = (max_l = maximum(abs.(interior(A))); MPI.Allreduce(max_l, MPI.MAX, MPI.COMM_WORLD)) + +@views avx(A) = 0.5 .* (A[1:end-1, :, :] .+ A[2:end, :, :]) +@views avy(A) = 0.5 .* (A[:, 1:end-1, :] .+ A[:, 2:end, :]) +@views avz(A) = 0.5 .* (A[:, :, 1:end-1] .+ A[:, :, 2:end]) + +@views av_xy(A) = 0.25 .* (A[1:end-1, 1:end-1, :] .+ A[2:end, 1:end-1, :] .+ A[2:end, 2:end, :] .+ A[1:end-1, 2:end, :]) +@views av_xz(A) = 0.25 .* (A[1:end-1, :, 1:end-1] .+ A[2:end, :, 1:end-1, :] .+ A[2:end, :, 2:end, :] .+ A[1:end-1, :, 2:end]) +@views av_yz(A) = 0.25 .* (A[:, 1:end-1, 1:end-1] .+ A[:, 2:end, 1:end-1] .+ A[:, 2:end, 2:end] .+ A[:, 1:end-1, 2:end]) + +function fastice_intro(; kwargs...) + intro = raw""" + ┌──────────────────────────────────────────────────────────┐ + │ ______ __ ____ _ __ │ + │ / ____/____ _ _____ / /_ / _/_____ ___ (_)/ / │ + │ / /_ / __ `// ___// __/ / / / ___// _ \ / // / │ + │ / __/ / /_/ /(__ )/ /_ _/ / / /__ / __/_ / // / │ + │ /_/ \__,_//____/ \__//___/ \___/ \___/(_)__/ //_/ │ + │ /___/ │ + └──────────────────────────────────────────────────────────┘ + """ + printstyled(intro; kwargs...) 
+end + +function main(; do_visu=false, do_save=false) + MPI.Init() + + backend = CPU() + # dims = (4, 2, 2) + # dims = (4, 2, 2) + dims = (1, 1, 1) + topo = CartesianTopology(dims) + arch = Architecture(backend, topo) + set_device!(arch) + + comm = cartesian_communicator(topo) + me = global_rank(topo) # rank + + size_l = (30, 30, 30) + size_g = global_grid_size(topo, size_l) + + outer_width = (3, 3, 3) #(128, 32, 4)# + + grid_g = CartesianGrid(; origin=(-2.0, -1.0, 0.0), + extent=(4.0, 2.0, 2.0), + size=size_g) + + grid_l = local_grid(grid_g, topo) + + if me == 0 + fastice_intro(bold=true, color=:blue) + printstyled("Running FastIce.jl 🧊 \n"; bold=true, color=:blue) + printstyled(grid_g; bold=true) + end + + no_slip = VBC(0.0, 0.0, 0.0) + free_slip = SBC(0.0, 0.0, 0.0) + free_surface = TBC(0.0, 0.0, 0.0) + + boundary_conditions = (x=(free_slip, free_slip), + y=(free_slip, free_slip), + z=(no_slip, free_surface)) + + ρgx(x, y, z) = 0.25 + ρgy(x, y, z) = 0.0 + ρgz(x, y, z) = 1.0 + gravity = (x=FunctionField(ρgx, grid_l, (Vertex(), Center(), Center())), + y=FunctionField(ρgy, grid_l, (Center(), Vertex(), Center())), + z=FunctionField(ρgz, grid_l, (Center(), Center(), Vertex()))) + + # numerics + niter = 10maximum(size(grid_g)) + ncheck = 2maximum(size(grid_g)) + + r = 0.7 + re_mech = 4π + lτ_re_m = minimum(extent(grid_g)) / re_mech + vdτ = minimum(spacing(grid_g)) / sqrt(ndims(grid_g) * 1.5) + θ_dτ = lτ_re_m * (r + 4 / 3) / vdτ + dτ_r = 1.0 / (θ_dτ + 1.0) + nudτ = vdτ * lτ_re_m + + iter_params = (η_rel=1e-1, + Δτ=(Pr=r / θ_dτ, τ=(xx=dτ_r, yy=dτ_r, zz=dτ_r, xy=dτ_r, xz=dτ_r, yz=dτ_r), V=(x=nudτ, y=nudτ, z=nudτ))) + + physics = (rheology=GlensLawRheology(1),) + other_fields = (A=Field(backend, grid_l, Center()),) + + model = IsothermalFullStokesModel(; + arch, + grid=grid_l, + physics, + gravity, + boundary_conditions, + outer_width, + iter_params, + other_fields) + + (me == 0) && printstyled("Model created \n"; bold=true, color=:light_blue) + + if do_save || do_visu + if me == 0 + Pr_g = zeros(size(grid_g)) + τxx_g = zeros(size(grid_g)) + τyy_g = zeros(size(grid_g)) + τzz_g = zeros(size(grid_g)) + τxy_g = zeros(size(grid_g)) + τxz_g = zeros(size(grid_g)) + τyz_g = zeros(size(grid_g)) + Vx_g = zeros(size(grid_g)) + Vy_g = zeros(size(grid_g)) + Vz_g = zeros(size(grid_g)) + else + Pr_g = nothing + τxx_g = nothing + τyy_g = nothing + τzz_g = nothing + τxy_g = nothing + τxz_g = nothing + τyz_g = nothing + Vx_g = nothing + Vy_g = nothing + Vz_g = nothing + end + Pr_v = zeros(size(grid_l)) + τxx_v = zeros(size(grid_l)) + τyy_v = zeros(size(grid_l)) + τzz_v = zeros(size(grid_l)) + τxy_v = zeros(size(grid_l)) + τxz_v = zeros(size(grid_l)) + τyz_v = zeros(size(grid_l)) + Vx_v = zeros(size(grid_l)) + Vy_v = zeros(size(grid_l)) + Vz_v = zeros(size(grid_l)) + end + + fill!(parent(model.fields.Pr), 0.0) + foreach(x -> fill!(parent(x), 0.0), model.fields.τ) + foreach(x -> fill!(parent(x), 0.0), model.fields.V) + fill!(parent(other_fields.A), 1.0) + + set!(model.fields.η, grid_l, (grid, loc, I, fields) -> physics.rheology(grid, I, fields); discrete=true, parameters=(model.fields,)) + + KernelLaunch.apply_all_boundary_conditions!(arch, grid_l, model.boundary_conditions.stress) + KernelLaunch.apply_all_boundary_conditions!(arch, grid_l, model.boundary_conditions.velocity) + KernelLaunch.apply_all_boundary_conditions!(arch, grid_l, model.boundary_conditions.rheology) + + MPI.Barrier(comm) + + (me == 0) && printstyled("Action \n"; bold=true, color=:light_blue) + + ttot_ns = UInt64(0) + for iter in 1:niter + if 
iter == 10 + MPI.Barrier(comm) + ttot_ns = time_ns() + end + advance_iteration!(model, 0.0, 1.0) + if (iter % ncheck == 0) + compute_residuals!(model) + err = (Pr = max_abs_g(model.fields.r_Pr), + Vx = max_abs_g(model.fields.r_V.x), + Vy = max_abs_g(model.fields.r_V.y), + Vz = max_abs_g(model.fields.r_V.z)) + if (me == 0) + any(.!isfinite.(values(err))) && error("simulation failed, err = $err") + iter_nx = iter / maximum(size(grid_g)) + @printf(" iter/nx = %.1f, err = [Pr = %1.3e, Vx = %1.3e, Vy = %1.3e, Vz = %1.3e]\n", iter_nx, err...) + end + end + end + ttot = float(time_ns() - ttot_ns) + ttot /= (niter - 10) + + MPI.Barrier(comm) + + ttot_min = MPI.Allreduce(ttot, MPI.MIN, comm) + ttot_max = MPI.Allreduce(ttot, MPI.MAX, comm) + + if me == 0 + Teff_min = 23 * 8 * prod(size(grid_l)) / ttot_max + Teff_max = 23 * 8 * prod(size(grid_l)) / ttot_min + printstyled("Performance: T_eff [min max] = $(round(Teff_min, sigdigits=4)) $(round(Teff_max, sigdigits=4)) \n"; bold=true, color=:green) + end + + if do_save || do_visu + copyto!(Pr_v, interior(model.fields.Pr)) + copyto!(τxx_v, interior(model.fields.τ.xx)) + copyto!(τyy_v, interior(model.fields.τ.yy)) + copyto!(τzz_v, interior(model.fields.τ.zz)) + copyto!(τxy_v, av_xy(interior(model.fields.τ.xy))) + copyto!(τxz_v, av_xz(interior(model.fields.τ.xz))) + copyto!(τyz_v, av_yz(interior(model.fields.τ.yz))) + copyto!(Vx_v, avx(interior(model.fields.V.x))) + copyto!(Vy_v, avy(interior(model.fields.V.y))) + copyto!(Vz_v, avz(interior(model.fields.V.z))) + + KernelAbstractions.synchronize(backend) + + gather!(Pr_g, Pr_v, comm) + gather!(τxx_g, τxx_v, comm) + gather!(τyy_g, τyy_v, comm) + gather!(τzz_g, τzz_v, comm) + gather!(τxy_g, τxy_v, comm) + gather!(τxz_g, τxz_v, comm) + gather!(τyz_g, τyz_v, comm) + gather!(Vx_g, Vx_v, comm) + gather!(Vy_g, Vy_v, comm) + gather!(Vz_g, Vz_v, comm) + + if (me == 0) && do_visu + fig = Figure() + axs = (Pr=Axis(fig[1, 1][1, 1]; aspect=DataAspect(), xlabel="x", ylabel="z", title="Pr"), + Vx=Axis(fig[1, 2][1, 1]; aspect=DataAspect(), xlabel="x", ylabel="z", title="Vx"), + Vy=Axis(fig[2, 1][1, 1]; aspect=DataAspect(), xlabel="x", ylabel="z", title="Vy"), + Vz=Axis(fig[2, 2][1, 1]; aspect=DataAspect(), xlabel="x", ylabel="z", title="Vz")) + plt = (Pr = heatmap!(axs.Pr, xcenters(grid_g), zcenters(grid_g), Pr_g[:, size(grid_g, 2)÷2+1, :]; colormap=:turbo), + Vx = heatmap!(axs.Vx, xvertices(grid_g), zcenters(grid_g), Vx_g[:, size(grid_g, 2)÷2+1, :]; colormap=:turbo), + Vy = heatmap!(axs.Vy, xcenters(grid_g), zcenters(grid_g), Vy_g[:, size(grid_g, 2)÷2+1, :]; colormap=:turbo), + Vz = heatmap!(axs.Vz, xcenters(grid_g), zvertices(grid_g), Vz_g[:, size(grid_g, 2)÷2+1, :]; colormap=:turbo)) + Colorbar(fig[1, 1][1, 2], plt.Pr) + Colorbar(fig[1, 2][1, 2], plt.Vx) + Colorbar(fig[2, 1][1, 2], plt.Vy) + Colorbar(fig[2, 2][1, 2], plt.Vz) + save("fig.png", fig) + end + + if me == 0 && do_save + open("data.bin", "w") do io + write(io, Pr_g) + write(io, τxx_g) + write(io, τyy_g) + write(io, τzz_g) + write(io, τxy_g) + write(io, τxz_g) + write(io, τyz_g) + write(io, Vx_g) + write(io, Vy_g) + write(io, Vz_g) + end + end + end -using Printf + MPI.Finalize() -MPI.Init() - -dims = (0, 0, 0) + return +end -topology = CartesianTopology(dims) - -me = global_rank(topology) -device_id = shared_rank(topology) -comm = cartesian_communicator(topology) -dims = dimensions(topology) - -size_l = (64, 64, 64) - -size_g = global_grid_size(topology, size_l) - -# physics -ebg = 1.0 - -global_grid = CartesianGrid( - origin = (-0.5, -0.5, 0.0), - extent = ( 
1.0, 1.0, 1.0), - size = size_g, -) - -grid = local_grid(global_grid, topology) - -@printf("process %d/%d on node %s, global rank %02d\n", device_id + 1, node_size(topology), node_name(topology), me + 1) - -# psh_x(x, _, _) = -x*ebg -# psh_y(_, y, _) = y*ebg - -# x_bc = BoundaryFunction(psh_x; reduce_dims=false) -# y_bc = BoundaryFunction(psh_y; reduce_dims=false) - -# boundary_conditions = ( -# west = BoundaryCondition{Velocity}(x_bc, y_bc, 0.0), -# east = BoundaryCondition{Velocity}(x_bc, y_bc, 0.0), -# south = BoundaryCondition{Velocity}(x_bc, y_bc, 0.0), -# north = BoundaryCondition{Velocity}(x_bc, y_bc, 0.0), -# bottom = BoundaryCondition{Velocity}(0.0 , 0.0 , 0.0), -# top = BoundaryCondition{Velocity}(0.0 , 0.0 , 0.0), -# ) - -# r = 0.7 -# re_mech = 10π -# lτ_re_m = minimum(extent(grid)) / re_mech -# vdτ = minimum(spacing(grid)) / sqrt(10.1) -# θ_dτ = lτ_re_m * (r + 4 / 3) / vdτ -# dτ_r = 1.0 / (θ_dτ + 1.0) -# nudτ = vdτ * lτ_re_m - -# iter_params = ( -# η_rel = 1e-1, -# Δτ = ( Pr = r / θ_dτ, τ = (xx = dτ_r, yy = dτ_r, zz = dτ_r, xy = dτ_r, xz = dτ_r, yz = dτ_r), V = (x = nudτ, y = nudτ, z = nudτ)), -# ) - -# backend = CPU() - -# physics = (rheology = GlensLawRheology(1), ) -# other_fields = ( -# A = Field(backend, grid, Center()), -# ) - -# init_incl(x, y, z, x0, y0, z0, r, Ai, Am) = ifelse((x-x0)^2 + (y-y0)^2 + (z-z0)^2 < r^2, Ai, Am) -# set!(other_fields.A, grid, init_incl; parameters = (x0 = 0.0, y0 = 0.0, z0 = 0.5, r = 0.2, Ai = 1e1, Am = 1.0)) - -# model = IsothermalFullStokesModel(; -# backend, -# grid, -# physics, -# boundary_conditions, -# iter_params, -# other_fields -# ) - -# fig = Figure(resolution=(1000,1000), fontsize=32) -# axs = ( -# Pr = Axis(fig[1,1][1,1]; aspect=DataAspect(), xlabel="x", ylabel="y", title="Pr"), -# Vx = Axis(fig[2,1][1,1]; aspect=DataAspect(), xlabel="x", ylabel="y", title="Vx"), -# Vy = Axis(fig[2,2][1,1]; aspect=DataAspect(), xlabel="x", ylabel="y", title="Vy"), -# ) - -# plt = ( -# Pr = heatmap!(axs.Pr, xcenters(grid), ycenters(grid), interior(model.fields.Pr)[:, :, size(grid,3)÷2]; colormap=:turbo), -# Vx = heatmap!(axs.Vx, xvertices(grid), ycenters(grid), interior(model.fields.V.x)[:, :, size(grid,3)÷2]; colormap=:turbo), -# Vy = heatmap!(axs.Vy, xcenters(grid), yvertices(grid), interior(model.fields.V.y)[:, :, size(grid,3)÷2]; colormap=:turbo), -# ) -# Colorbar(fig[1,1][1,2], plt.Pr) -# Colorbar(fig[2,1][1,2], plt.Vx) -# Colorbar(fig[2,2][1,2], plt.Vy) - -# set!(model.fields.Pr, 0.0) -# foreach(x -> set!(x, 0.0), model.fields.τ) -# Isothermal._apply_bcs!(model.backend, model.grid, model.fields, model.boundary_conditions.stress) - -# set!(model.fields.V.x, grid, psh_x) -# set!(model.fields.V.y, grid, psh_y) -# set!(model.fields.V.z, 0.0) -# Isothermal._apply_bcs!(model.backend, model.grid, model.fields, model.boundary_conditions.velocity) - -# set!(model.fields.η, other_fields.A) -# extrapolate!(model.fields.η) - -# for it in 1:10 -# advance_iteration!(model, 0.0, 1.0; async = false) -# if it % 10 == 0 -# plt.Pr[3][] = interior(model.fields.Pr)[:, :, size(grid,3)÷2] -# plt.Vx[3][] = interior(model.fields.V.x)[:, :, size(grid,3)÷2] -# plt.Vy[3][] = interior(model.fields.V.y)[:, :, size(grid,3)÷2] -# yield() -# end -# end - -MPI.Finalize() \ No newline at end of file +main(; do_visu=false, do_save=false) diff --git a/scripts_future_API/tm_stokes_wip.jl b/scripts_future_API/tm_stokes_wip.jl index d532e2ec..ca12b24f 100644 --- a/scripts_future_API/tm_stokes_wip.jl +++ b/scripts_future_API/tm_stokes_wip.jl @@ -1,105 +1,140 @@ using FastIce +using 
FastIce.Architectures using FastIce.Grids using FastIce.Fields using FastIce.Utils using FastIce.BoundaryConditions using FastIce.Models.FullStokes.Isothermal using FastIce.Physics +using FastIce.KernelLaunch -using KernelAbstractions +const VBC = BoundaryCondition{Velocity} +const TBC = BoundaryCondition{Traction} +const SBC = BoundaryCondition{Slip} -using GLMakie - -# physics -ebg = 1.0 - -grid = CartesianGrid( - origin = (-0.5, -0.5, 0.0), - extent = ( 1.0, 1.0, 1.0), - size = ( 64, 64, 64 ), -) - -psh_x(x, _, _) = -x*ebg -psh_y(_, y, _) = y*ebg - -x_bc = BoundaryFunction(psh_x; reduce_dims=false) -y_bc = BoundaryFunction(psh_y; reduce_dims=false) - -boundary_conditions = ( - west = BoundaryCondition{Velocity}(x_bc, y_bc, 0.0), - east = BoundaryCondition{Velocity}(x_bc, y_bc, 0.0), - south = BoundaryCondition{Velocity}(x_bc, y_bc, 0.0), - north = BoundaryCondition{Velocity}(x_bc, y_bc, 0.0), - bottom = BoundaryCondition{Velocity}(0.0 , 0.0 , 0.0), - top = BoundaryCondition{Velocity}(0.0 , 0.0 , 0.0), -) - -r = 0.7 -re_mech = 10π -lτ_re_m = minimum(extent(grid)) / re_mech -vdτ = minimum(spacing(grid)) / sqrt(10.1) -θ_dτ = lτ_re_m * (r + 4 / 3) / vdτ -dτ_r = 1.0 / (θ_dτ + 1.0) -nudτ = vdτ * lτ_re_m - -iter_params = ( - η_rel = 1e-1, - Δτ = ( Pr = r / θ_dτ, τ = (xx = dτ_r, yy = dτ_r, zz = dτ_r, xy = dτ_r, xz = dτ_r, yz = dτ_r), V = (x = nudτ, y = nudτ, z = nudτ)), -) - -backend = CPU() - -physics = (rheology = GlensLawRheology(1), ) -other_fields = ( - A = Field(backend, grid, Center()), -) - -init_incl(x, y, z, x0, y0, z0, r, Ai, Am) = ifelse((x-x0)^2 + (y-y0)^2 + (z-z0)^2 < r^2, Ai, Am) -set!(other_fields.A, grid, init_incl; parameters = (x0 = 0.0, y0 = 0.0, z0 = 0.5, r = 0.2, Ai = 1e1, Am = 1.0)) - -model = IsothermalFullStokesModel(; - backend, - grid, - physics, - boundary_conditions, - iter_params, - other_fields -) - -fig = Figure(resolution=(1000,1000), fontsize=32) -axs = ( - Pr = Axis(fig[1,1][1,1]; aspect=DataAspect(), xlabel="x", ylabel="y", title="Pr"), - Vx = Axis(fig[2,1][1,1]; aspect=DataAspect(), xlabel="x", ylabel="y", title="Vx"), - Vy = Axis(fig[2,2][1,1]; aspect=DataAspect(), xlabel="x", ylabel="y", title="Vy"), -) - -plt = ( - Pr = heatmap!(axs.Pr, xcenters(grid), ycenters(grid), interior(model.fields.Pr)[:, :, size(grid,3)÷2]; colormap=:turbo), - Vx = heatmap!(axs.Vx, xvertices(grid), ycenters(grid), interior(model.fields.V.x)[:, :, size(grid,3)÷2]; colormap=:turbo), - Vy = heatmap!(axs.Vy, xcenters(grid), yvertices(grid), interior(model.fields.V.y)[:, :, size(grid,3)÷2]; colormap=:turbo), -) -Colorbar(fig[1,1][1,2], plt.Pr) -Colorbar(fig[2,1][1,2], plt.Vx) -Colorbar(fig[2,2][1,2], plt.Vy) - -set!(model.fields.Pr, 0.0) -foreach(x -> set!(x, 0.0), model.fields.τ) -Isothermal._apply_bcs!(model.backend, model.grid, model.fields, model.boundary_conditions.stress) - -set!(model.fields.V.x, grid, psh_x) -set!(model.fields.V.y, grid, psh_y) -set!(model.fields.V.z, 0.0) -Isothermal._apply_bcs!(model.backend, model.grid, model.fields, model.boundary_conditions.velocity) - -set!(model.fields.η, other_fields.A) -extrapolate!(model.fields.η) - -for it in 1:100 - advance_iteration!(model, 0.0, 1.0; async = false) - if it % 10 == 0 - plt.Pr[3][] = interior(model.fields.Pr)[:, :, size(grid,3)÷2] - plt.Vx[3][] = interior(model.fields.V.x)[:, :, size(grid,3)÷2] - plt.Vy[3][] = interior(model.fields.V.y)[:, :, size(grid,3)÷2] - yield() +using LinearAlgebra, Printf +using KernelAbstractions +# using CUDA +# using AMDGPU + +using CairoMakie +# using GLMakie +# Makie.inline!(true) + 
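+# Hypothetical GPU setup (not part of this script; assumes CUDA.jl and the
+# FastIceCUDAExt extension are available): replace the CPU backend below with
+#   backend = CUDABackend()
+#   arch = Architecture(backend, 1) # device id within the node
+#   set_device!(arch)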
+@views function main() + backend = CPU() + arch = Architecture(backend, 2) + set_device!(arch) + + # physics + ebg = 2.0 + + b_width = (16, 4, 4) #(128, 32, 4)# + + grid = CartesianGrid(; origin=(-0.5, -0.5, 0.0), + extent=(1.0, 1.0, 1.0), + size=(62, 62, 62)) + + psh_x(x, _, _) = -x * ebg + psh_y(_, y, _) = y * ebg + + x_bc = BoundaryFunction(psh_x; reduce_dims=false) + y_bc = BoundaryFunction(psh_y; reduce_dims=false) + + free_slip = SBC(0.0, 0.0, 0.0) + free_surface = TBC(0.0, 0.0, 0.0) + + boundary_conditions = (x=(VBC(x_bc, y_bc, 0.0), VBC(x_bc, y_bc, 0.0)), + y=(VBC(x_bc, y_bc, 0.0), VBC(x_bc, y_bc, 0.0)), + z=(free_slip, free_surface)) + # TODO: Add ConstantField + ρg(x, y, z) = 0.0 + gravity = (x=FunctionField(ρg, grid, (Vertex(), Center(), Center())), + y=FunctionField(ρg, grid, (Center(), Vertex(), Center())), + z=FunctionField(ρg, grid, (Center(), Center(), Vertex()))) + + # numerics + niter = 20maximum(size(grid)) + ncheck = maximum(size(grid)) + + r = 0.7 + re_mech = 10π + lτ_re_m = minimum(extent(grid)) / re_mech + vdτ = minimum(spacing(grid)) / sqrt(ndims(grid) * 1.5) + θ_dτ = lτ_re_m * (r + 4 / 3) / vdτ + dτ_r = 1.0 / (θ_dτ + 1.0) + nudτ = vdτ * lτ_re_m + + iter_params = (η_rel=1e-1, + Δτ=(Pr=r / θ_dτ, τ=(xx=dτ_r, yy=dτ_r, zz=dτ_r, xy=dτ_r, xz=dτ_r, yz=dτ_r), V=(x=nudτ, y=nudτ, z=nudτ))) + + physics = (rheology=GlensLawRheology(1),) + other_fields = (A=Field(backend, grid, Center()),) + + init_incl(x, y, z, x0, y0, z0, r, Ai, Am) = ifelse((x - x0)^2 + (y - y0)^2 + (z - z0)^2 < r^2, Ai, Am) + set!(other_fields.A, grid, init_incl; parameters=(x0=0.0, y0=0.0, z0=0.5, r=0.1, Ai=1e-1, Am=1.0)) + + model = IsothermalFullStokesModel(; + arch, + grid, + physics, + gravity, + boundary_conditions, + outer_width=b_width, + iter_params, + other_fields) + + fig = Figure() + axs = (Pr = Axis(fig[1, 1][1, 1]; aspect=DataAspect(), xlabel="x", ylabel="z", title="Pr"), + Vx = Axis(fig[1, 2][1, 1]; aspect=DataAspect(), xlabel="x", ylabel="z", title="Vx"), + Vy = Axis(fig[2, 1][1, 1]; aspect=DataAspect(), xlabel="x", ylabel="z", title="Vy"), + Vz = Axis(fig[2, 2][1, 1]; aspect=DataAspect(), xlabel="x", ylabel="z", title="Vz")) + plt = (Pr = heatmap!(axs.Pr, xcenters(grid), zcenters(grid), Array(interior(model.fields.Pr)[:, size(grid, 2)÷2+1, :]); colormap=:turbo), + Vx = heatmap!(axs.Vx, xvertices(grid), zcenters(grid), Array(interior(model.fields.V.x)[:, size(grid, 2)÷2+1, :]); colormap=:turbo), + Vy = heatmap!(axs.Vy, xcenters(grid), zcenters(grid), Array(interior(model.fields.V.y)[:, size(grid, 2)÷2+1, :]); colormap=:turbo), + Vz = heatmap!(axs.Vz, xcenters(grid), zvertices(grid), Array(interior(model.fields.V.z)[:, size(grid, 2)÷2+1, :]); colormap=:turbo)) + Colorbar(fig[1, 1][1, 2], plt.Pr) + Colorbar(fig[1, 2][1, 2], plt.Vx) + Colorbar(fig[2, 1][1, 2], plt.Vy) + Colorbar(fig[2, 2][1, 2], plt.Vz) + + fill!(parent(model.fields.Pr), 0.0) + foreach(x -> fill!(parent(x), 0.0), model.fields.τ) + + set!(model.fields.V.x, grid, psh_x) + set!(model.fields.V.y, grid, psh_y) + set!(model.fields.V.z, 0.0) + set!(model.fields.η, grid, (grid, loc, I, fields) -> physics.rheology(grid, I, fields); discrete=true, parameters=(model.fields,)) + + KernelLaunch.apply_all_boundary_conditions!(arch, grid, model.boundary_conditions.stress) + KernelLaunch.apply_all_boundary_conditions!(arch, grid, model.boundary_conditions.velocity) + KernelLaunch.apply_all_boundary_conditions!(arch, grid, model.boundary_conditions.rheology) + + display(fig) + + for iter in 1:niter + advance_iteration!(model, 0.0, 1.0) + if 
(iter % ncheck == 0) + compute_residuals!(model) + err = (Pr = norm(model.fields.r_Pr, Inf), + Vx = norm(model.fields.r_V.x, Inf), + Vy = norm(model.fields.r_V.y, Inf), + Vz = norm(model.fields.r_V.z, Inf)) + if any(.!isfinite.(values(err))) + error("simulation failed, err = $err") + end + iter_nx = iter / maximum(size(grid)) + @printf(" iter/nx = %.1f, err = [Pr = %1.3e, Vx = %1.3e, Vy = %1.3e, Vz = %1.3e]\n", iter_nx, err...) + plt.Pr[3][] = interior(model.fields.Pr)[:, size(grid, 2)÷2+1, :] + plt.Vx[3][] = interior(model.fields.V.x)[:, size(grid, 2)÷2+1, :] + plt.Vy[3][] = interior(model.fields.V.y)[:, size(grid, 2)÷2+1, :] + plt.Vz[3][] = interior(model.fields.V.z)[:, size(grid, 2)÷2+1, :] + # yield() + display(fig) + end end + + return end + +main() diff --git a/src/Architectures.jl b/src/Architectures.jl new file mode 100644 index 00000000..9e85210b --- /dev/null +++ b/src/Architectures.jl @@ -0,0 +1,43 @@ +module Architectures + +export Architecture + +export launch!, set_device!, get_device, set_device_and_priority!, heuristic_groupsize +export backend, device, details + +using FastIce.Grids + +using KernelAbstractions + +struct Architecture{Kind,B,D,Details} + backend::B + device::D + details::Details +end + +struct SingleDevice end + +function Architecture(backend::Backend, device_id::Integer=1) + device = get_device(backend, device_id) + return Architecture{SingleDevice,typeof(backend),typeof(device),Nothing}(backend, device, nothing) +end + +device(arch::Architecture) = arch.device +backend(arch::Architecture) = arch.backend +details(arch::Architecture) = arch.details + +set_device!(arch::Architecture) = set_device!(arch.device) + +function set_device_and_priority!(arch::Architecture, prio::Symbol) + set_device!(arch) + KernelAbstractions.priority!(arch.backend, prio) + return +end + +set_device!(::Architecture{Kind,CPU}) where {Kind} = nothing +get_device(::CPU, id::Integer) = nothing + +heuristic_groupsize(arch::Architecture, ::Val{N}) where {N} = heuristic_groupsize(arch.device, Val(N)) +heuristic_groupsize(::Architecture{Kind,CPU}, ::Val{N}) where {Kind,N} = 256 + +end diff --git a/src/BoundaryConditions/BoundaryConditions.jl b/src/BoundaryConditions/BoundaryConditions.jl new file mode 100644 index 00000000..e03a5748 --- /dev/null +++ b/src/BoundaryConditions/BoundaryConditions.jl @@ -0,0 +1,44 @@ +module BoundaryConditions + +export FieldBoundaryCondition, BoundaryConditionsBatch +export apply_boundary_conditions!, apply_all_boundary_conditions! + +export DirichletBC, HalfCell, FullCell +export ExtrapolateBC +export ContinuousBC, DiscreteBC +export BoundaryFunction, DiscreteBoundaryFunction, ContinuousBoundaryFunction + +export HideBoundaries, hide + +using FastIce.Grids +using FastIce.Fields +using FastIce.Utils +using FastIce.Architectures + +using KernelAbstractions +using Adapt + +abstract type FieldBoundaryCondition end + +include("field_boundary_conditions.jl") +include("utils.jl") +include("boundary_function.jl") +include("dirichlet_bc.jl") +include("extrapolate_bc.jl") +include("hide_boundaries.jl") + +struct BoundaryConditionsBatch{F,BC} + fields::F + conditions::BC +end + +@inline function apply_boundary_conditions!(::Val{S}, ::Val{D}, + arch::Architecture, + grid::CartesianGrid, + batch::BoundaryConditionsBatch; kwargs...) where {S,D} + apply_boundary_conditions!(Val(S), Val(D), arch, grid, batch.fields, batch.conditions; kwargs...) +end + +apply_boundary_conditions!(side, val, arch, grid, ::Nothing; kwargs...) 
= nothing + +end diff --git a/src/BoundaryConditions/boundary_conditions.jl deleted file mode 100644 index 4b8ed13c..00000000 --- a/src/BoundaryConditions/boundary_conditions.jl +++ /dev/null @@ -1,47 +0,0 @@ -module BoundaryConditions - -export apply_bcs! - -export DirichletBC, HalfCell, FullCell -export ContinuousBC, DiscreteBC -export BoundaryFunction, DiscreteBoundaryFunction, ContinuousBoundaryFunction - -using FastIce.Grids -using FastIce.Fields -using FastIce.Utils - -using KernelAbstractions -using Adapt - -include("utils.jl") -include("boundary_function.jl") -include("dirichlet_bc.jl") - -function apply_bcs!(::Val{D}, backend, grid, fields, bcs; async=true) where {D} - sizes = ntuple(I -> remove_dim(Val(D), size(fields[I])), Val(length(fields))) - ndrange = max.(sizes...) - _apply_bcs!(backend, 256)(Val(D), grid, fields, sizes, bcs; ndrange) - async || KernelAbstractions.synchronize(backend) - return -end - -@kernel function _apply_bcs!(dim, grid, fields, sizes, bcs) - I = @index(Global, Cartesian) - ntuple(Val(length(fields))) do ifield - Base.@_inline_meta - if _checkindices(sizes[ifield], I) - _apply_bcs_lr!(dim, grid, fields[ifield], I, bcs[ifield]...) - end - end -end - -@inline function _apply_bcs_lr!(dim, grid, f, I, lbc, rbc) - loc = location(f) - _apply_bc!(Val(1), dim, grid, f, loc, I, lbc) - _apply_bc!(Val(2), dim, grid, f, loc, I, rbc) - return -end - -@inline _apply_bc!(side, dim, grid, f, loc, Ibc, ::Nothing) = nothing - -end \ No newline at end of file diff --git a/src/BoundaryConditions/boundary_function.jl index 2ece4c03..0f38ec32 100644 --- a/src/BoundaryConditions/boundary_function.jl +++ b/src/BoundaryConditions/boundary_function.jl @@ -1,7 +1,7 @@ abstract type BoundaryFunction{F} end struct ReducedDimensions end -struct FullDimensions end +struct FullDimensions end @inline _reduce(::Type{ReducedDimensions}, I, dim) = remove_dim(dim, I) @inline _reduce(::Type{FullDimensions}, I, dim) = I @@ -21,14 +21,14 @@ end const CBF{F,P,RF} = ContinuousBoundaryFunction{F,P,RF} where {F,P,RF} const DBF{F,P,RF} = DiscreteBoundaryFunction{F,P,RF} where {F,P,RF} -const CDBF{F,P} = Union{CBF{F,P}, DBF{F,P}} where {F,P} +const CDBF{F,P} = Union{CBF{F,P},DBF{F,P}} where {F,P} @inline _params(::CDBF{F,Nothing}) where {F} = () @inline _params(cbf::CDBF{F}) where {F} = cbf.parameters @inline (bc::CBF{F,P,RF})(grid, loc, dim, I) where {F,P,RF} = bc.fun(_reduce(RF, coord(grid, loc, I), dim)..., _params(bc)...) -@inline (bc::DBF{F,P,RF})(grid, loc, dim, I) where {F,P,RF} = bc.fun(grid, loc, dim, Tuple(_reduce(RF, I, dim))..., _params(bc)...) +@inline (bc::DBF{F,P,RF})(grid, loc, dim, I) where {F,P,RF} = bc.fun(grid, loc, dim, Tuple(_reduce(RF, I, dim))..., _params(bc)...) # Create a continuous or discrete boundary function # if discrete = true, the function has signature f(grid, loc, dim, inds...) @@ -36,4 +36,4 @@ const CDBF{F,P} = Union{CBF{F,P}, DBF{F,P}} where {F,P} function BoundaryFunction(fun::Function; discrete=false, parameters=nothing, reduce_dims=true) RF = reduce_dims ? ReducedDimensions : FullDimensions discrete ? 
DiscreteBoundaryFunction{RF}(fun, parameters) : ContinuousBoundaryFunction{RF}(fun, parameters) -end \ No newline at end of file +end diff --git a/src/BoundaryConditions/dirichlet_bc.jl index 804f953c..51b10c7f 100644 --- a/src/BoundaryConditions/dirichlet_bc.jl +++ b/src/BoundaryConditions/dirichlet_bc.jl @@ -5,13 +5,13 @@ struct HalfCell end struct FullCell end # First-kind Dirichlet boundary condition parametrised by the gradient reconstruction kind (can be HalfCell or FullCell) -struct DirichletBC{Gradient,T} +struct DirichletBC{Gradient,T} <: FieldBoundaryCondition condition::T DirichletBC{Gradient}(condition::T) where {Gradient,T} = new{Gradient,T}(condition) end # Create a DirichletBC with a continuous or discrete boundary function -function DirichletBC{G}(fun::Function; kwargs...) where G +function DirichletBC{G}(fun::Function; kwargs...) where {G} condition = BoundaryFunction(fun; kwargs...) return DirichletBC{G}(condition) end @@ -24,9 +24,9 @@ Base.@propagate_inbounds (bc::DirichletBC{G,<:BoundaryFunction})(grid, loc, dim, Adapt.adapt_structure(to, f::DirichletBC{G,<:AbstractArray}) where {G} = DirichletBC{G}(Adapt.adapt(to, parent(f.condition))) Base.@propagate_inbounds _get_gradient(f2, bc::DirichletBC{HalfCell}, grid, loc, dim, I) = 2 * (bc(grid, loc, dim, I) - f2) -Base.@propagate_inbounds _get_gradient(f2, bc::DirichletBC{FullCell}, grid, loc, dim, I) = (bc(grid, loc, dim, I) - f2) +Base.@propagate_inbounds _get_gradient(f2, bc::DirichletBC{FullCell}, grid, loc, dim, I) = (bc(grid, loc, dim, I) - f2) -@inline function _apply_bc!(side, dim, grid, f, loc, Ibc, bc::DirichletBC) +@inline function _apply_field_boundary_condition!(side, dim, grid, f, loc, Ibc, bc::DirichletBC) I = _bc_index(dim, side, loc, size(f), Ibc) DI = _bc_offset(Val(ndims(grid)), dim, side) @inbounds f[I] = f[I+DI] + _get_gradient(f[I+DI], bc, grid, loc, dim, I) diff --git a/src/BoundaryConditions/extrapolate_bc.jl new file mode 100644 index 00000000..191022c8 --- /dev/null +++ b/src/BoundaryConditions/extrapolate_bc.jl @@ -0,0 +1,9 @@ +# Boundary condition that extrapolates the values of the field outside of the domain +struct ExtrapolateBC <: FieldBoundaryCondition end + +@inline function _apply_field_boundary_condition!(side, dim, grid, f, loc, Ibc, ::ExtrapolateBC) + I = _bc_index(dim, side, loc, size(f), Ibc) + DI = _bc_offset(Val(ndims(grid)), dim, side) + @inbounds f[I] = 2 * f[I+DI] - f[I+2DI] + return +end diff --git a/src/BoundaryConditions/field_boundary_conditions.jl new file mode 100644 index 00000000..ff287060 --- /dev/null +++ b/src/BoundaryConditions/field_boundary_conditions.jl @@ -0,0 +1,44 @@ +function apply_boundary_conditions!(::Val{S}, ::Val{D}, + arch::Architecture, + grid::CartesianGrid, + fields::NTuple{N,Field}, + conditions::NTuple{N,FieldBoundaryCondition}; async=true) where {S,D,N} + _validate_fields(fields, D, S) + sizes = ntuple(ifield -> remove_dim(Val(D), size(parent(fields[ifield]))), Val(N)) + halos = ntuple(ifield -> remove_dim(Val(D), halo(fields[ifield])), Val(N)) + offsets = ntuple(Val(N)) do ifield + NF = ndims(fields[ifield]) - 1 + ntuple(Val(NF)) do RD + -halos[ifield][RD][1] + end |> CartesianIndex + end + worksize = max.(sizes...) 
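+ # a single kernel launch covers the largest boundary plane among all fields; indices that fall outside a smaller field's plane are filtered out inside the kernel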
+ _apply_boundary_conditions!(backend(arch), 256)(Val(S), Val(D), grid, sizes, offsets, fields, conditions; ndrange=worksize) + async || synchronize(backend(arch)) + return +end + +@kernel function _apply_boundary_conditions!(side, dim, grid, sizes, offsets, fields, conditions) + I = @index(Global, Cartesian) + # ntuple here unrolls the loop over fields + ntuple(Val(length(fields))) do ifield + Base.@_inline_meta + @inbounds if _checkindices(sizes[ifield], I) + Ibc = I + offsets[ifield] + field = fields[ifield] + condition = conditions[ifield] + _apply_field_boundary_condition!(side, dim, grid, field, location(field), Ibc, condition) + end + end +end + +_apply_field_boundary_condition!(side, dim, grid, field, loc, I, ::Nothing) = nothing + +function _validate_fields(fields::NTuple{N,Field}, dim, side) where {N} + for f in fields + if (location(f, Val(dim)) == Center()) && (halo(f, dim, side) < 1) + error("to apply boundary conditions, halo width must be at least 1") + end + end + return +end diff --git a/src/BoundaryConditions/hide_boundaries.jl b/src/BoundaryConditions/hide_boundaries.jl new file mode 100644 index 00000000..f4982bd6 --- /dev/null +++ b/src/BoundaryConditions/hide_boundaries.jl @@ -0,0 +1,32 @@ +struct HideBoundaries{N} + pipelines::NTuple{N,Tuple{Pipeline,Pipeline}} + function HideBoundaries{N}(arch::Architecture) where {N} + pre() = set_device_and_priority!(arch, :high) + pipelines = ntuple(Val(N)) do _ + return ntuple(_ -> Pipeline(; pre), Val(2)) + end + return new{N}(pipelines) + end +end + +function hide(fun::F, hb::HideBoundaries{N}, arch::Architecture, grid::CartesianGrid{N}, boundary_conditions, worksize; + outer_width=nothing) where {F,N} + inner_range, outer_ranges = split_ndrange(worksize, outer_width) + # execute inner range in a main Task with a normal priority + fun(inner_range) + for dim in N:-1:1 + ntuple(Val(2)) do side + pipe = hb.pipelines[dim][side] + range = outer_ranges[dim][side] + batch = boundary_conditions[dim][side] + # execute outer range and boundary conditions asynchronously + put!(pipe) do + fun(range) + apply_boundary_conditions!(Val(side), Val(dim), arch, grid, batch) + synchronize(backend(arch)) + end + end + wait.(hb.pipelines[dim]) # synchronize spatial dimension + end + return +end diff --git a/src/BoundaryConditions/utils.jl b/src/BoundaryConditions/utils.jl index 41262b04..e498162b 100644 --- a/src/BoundaryConditions/utils.jl +++ b/src/BoundaryConditions/utils.jl @@ -1,4 +1,3 @@ - # Similar to `checkindex`, but for the multidimensional case. Custom axes aren't supported @inline _checkindices(AI::Tuple, I::Tuple) = (I[1] <= AI[1]) && _checkindices(Base.tail(AI), Base.tail(I)) @inline _checkindices(AI::Tuple{}, I::Tuple{}) = true @@ -11,14 +10,14 @@ @inline _get_i(::Vertex) = 1 # Return 1D array index for the "left" and "right" sides of the array -@inline _index1(::Val{1}, L, sz) = _get_i(L) +@inline _index1(::Val{1}, L, sz) = _get_i(L) @inline _index1(::Val{2}, L, sz) = -_get_i(L) + sz + 1 # Cartesian array index to store the boundary condition @inline _bc_index(dim::Val{D}, side, loc, sz, Ibc) where {D} = insert_dim(dim, Ibc, _index1(side, loc[D], sz[D])) # Return 1D offset for the "left" and "right" sides of the array to compute the flux projection -@inline _offset1(::Val{1}) = 1 +@inline _offset1(::Val{1}) = 1 @inline _offset1(::Val{2}) = -1 -@inline _bc_offset(N, ::Val{D}, S) where {D} = ntuple(I -> I == D ? 
_offset1(S) : 0, N) |> CartesianIndex \ No newline at end of file +@inline _bc_offset(N, ::Val{D}, S) where {D} = ntuple(I -> I == D ? _offset1(S) : 0, N) |> CartesianIndex diff --git a/src/Distributed/Distributed.jl new file mode 100644 index 00000000..2a442a34 --- /dev/null +++ b/src/Distributed/Distributed.jl @@ -0,0 +1,36 @@ +""" + Distributed + +Tools for performing parallel computations in a distributed environment. +Contains utilities for non-blocking halo exchange using MPI. +Implements the `BoundaryCondition` API to conveniently define communication as an operation that fills halo buffers. +Together with `HideBoundaries` from the `BoundaryConditions` module, it enables hiding MPI communication behind computations. +""" +module Distributed +using FastIce.Grids +using FastIce.Fields +using FastIce.Architectures +using FastIce.BoundaryConditions +import FastIce.BoundaryConditions: apply_boundary_conditions! +using MPI +using KernelAbstractions +export CartesianTopology +export global_rank, shared_rank, node_name, cartesian_communicator, shared_communicator, coordinates +export dimensions, global_size, node_size +export global_grid_size, local_grid +export neighbors, neighbor, has_neighbor +export gather! +export ExchangeInfo, DistributedBoundaryConditions +"Trait structure used as a type parameter to indicate that the Architecture is a distributed MPI Architecture." +struct DistributedMPI end +include("topology.jl") +include("boundary_conditions.jl") +include("gather.jl") +end diff --git a/src/Distributed/boundary_conditions.jl new file mode 100644 index 00000000..d6ad1445 --- /dev/null +++ b/src/Distributed/boundary_conditions.jl @@ -0,0 +1,112 @@ +""" + ExchangeInfo + +Structure containing the information to exchange halos of one side of an N-dimensional array. +""" +mutable struct ExchangeInfo{SB,RB} + send_buffer::SB + recv_buffer::RB + send_request::MPI.Request + recv_request::MPI.Request + ExchangeInfo(send_buf, recv_buf) = new{typeof(send_buf),typeof(recv_buf)}(send_buf, recv_buf, MPI.REQUEST_NULL, MPI.REQUEST_NULL) +end + +""" + ExchangeInfo(::Val{S}, ::Val{D}, field::Field) where {S,D} + +Create `ExchangeInfo` to exchange halos on side `S` in the spatial direction `D`. +""" +function ExchangeInfo(::Val{S}, ::Val{D}, field::Field) where {S,D} + send_view = get_send_view(Val(S), Val(D), field) + recv_view = get_recv_view(Val(S), Val(D), field) + send_buffer = similar(parent(send_view), eltype(send_view), size(send_view)) + recv_buffer = similar(parent(recv_view), eltype(recv_view), size(recv_view)) + return ExchangeInfo(send_buffer, recv_buffer) +end + +""" + apply_boundary_conditions!(::Val{S}, ::Val{D}, + arch::Architecture, + grid::CartesianGrid, + fields::NTuple{N,Field}, + exchange_infos::NTuple{N,ExchangeInfo}; async=true) where {S,D,N} + +Perform a non-blocking MPI exchange for a set of fields. 
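+All MPI requests are progressed to completion before this function returns; `async=false` additionally synchronizes the device backend.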
+""" +function apply_boundary_conditions!(::Val{S}, ::Val{D}, + arch::Architecture, + grid::CartesianGrid, + fields::NTuple{N,Field}, + exchange_infos::NTuple{N,ExchangeInfo}; async=true) where {S,D,N} + comm = cartesian_communicator(details(arch)) + nbrank = neighbor(details(arch), D, S) + + # initiate non-blocking MPI recieve and device-to-device copy to the send buffer + for idx in eachindex(fields) + info = exchange_infos[idx] + info.recv_request = MPI.Irecv!(info.recv_buffer, comm; source=nbrank) + send_view = get_send_view(Val(S), Val(D), fields[idx]) + copyto!(info.send_buffer, send_view) + end + KernelAbstractions.synchronize(backend(arch)) + + # initiate non-blocking MPI send + for idx in eachindex(fields) + info = exchange_infos[idx] + info.send_request = MPI.Isend(info.send_buffer, comm; dest=nbrank) + end + + recv_ready = falses(N) + send_ready = falses(N) + + # test send and receive requests, initiating device-to-device copy + # to the receive buffer if the receive is complete + while !(all(recv_ready) && all(send_ready)) + for idx in eachindex(fields) + info = exchange_infos[idx] + if MPI.Test(info.recv_request) && !recv_ready[idx] + recv_view = get_recv_view(Val(S), Val(D), fields[idx]) + copyto!(recv_view, info.recv_buffer) + recv_ready[idx] = true + end + send_ready[idx] = MPI.Test(info.send_request) + end + yield() + end + async || KernelAbstractions.synchronize(backend(arch)) + + return +end + +_overlap(::Vertex) = 1 +_overlap(::Center) = 0 + +get_recv_view(side::Val{S}, dim::Val{D}, f::Field) where {S,D} = get_recv_view(side, dim, parent(f), halo(f, D, S)) + +function get_send_view(side::Val{S}, dim::Val{D}, f::Field) where {S,D} + get_send_view(side, dim, parent(f), halo(f, D, S), _overlap(location(f, dim))) +end + +function get_recv_view(::Val{1}, ::Val{D}, array::AbstractArray, halo_width::Integer) where {D} + recv_range = Base.OneTo(halo_width) + indices = ntuple(I -> I == D ? recv_range : Colon(), Val(ndims(array))) + return view(array, indices...) +end + +function get_recv_view(::Val{2}, ::Val{D}, array::AbstractArray, halo_width::Integer) where {D} + recv_range = (size(array, D)-halo_width+1):size(array, D) + indices = ntuple(I -> I == D ? recv_range : Colon(), Val(ndims(array))) + return view(array, indices...) +end + +function get_send_view(::Val{1}, ::Val{D}, array::AbstractArray, halo_width::Integer, overlap::Integer) where {D} + send_range = (overlap+halo_width+1):(overlap+2halo_width) + indices = ntuple(I -> I == D ? send_range : Colon(), Val(ndims(array))) + return view(array, indices...) +end + +function get_send_view(::Val{2}, ::Val{D}, array::AbstractArray, halo_width::Integer, overlap::Integer) where {D} + send_range = (size(array, D)-overlap-2halo_width+1):(size(array, D)-overlap-halo_width) + indices = ntuple(I -> I == D ? send_range : Colon(), Val(ndims(array))) + return view(array, indices...) 
+end diff --git a/src/Distributed/distributed.jl b/src/Distributed/distributed.jl deleted file mode 100644 index ac045a45..00000000 --- a/src/Distributed/distributed.jl +++ /dev/null @@ -1,47 +0,0 @@ -module Distributed - -using FastIce.Architecture -using FastIce.Grids - -export CartesianTopology - -export global_rank, shared_rank, node_name, cartesian_communicator, shared_communicator - -export dimensions, global_size, node_size - -export global_grid_size, local_grid - -export split_ndrange - -using FastIce.Grids - -using MPI - -include("topology.jl") - -include("split_ndrange.jl") - -struct DistributedArchitecture{C,T,R} <: AbstractArchitecture - child_arch::C - topology::T - ranges::R -end - -device(arch::DistributedArchitecture) = device(arch.child_arch) - -function launch!(arch::DistributedArchitecture, grid::CartesianGrid, kernel::Pair{Kernel,Args}; boundary_conditions=nothing, async=true) where {Args} - fun, args = kernel - - worksize = size(grid, Vertex()) - groupsize = heuristic_groupsize(arch.child_arch) - - fun(arch.backend, groupsize)(args...; ndrange=size(arch.ranges[end]), offset=first(arch.ranges[end])) - - - isnothing(boundary_conditions) || apply_boundary_conditions!(boundary_conditions) - - async || synchronize(arch.backend) - return -end - -end \ No newline at end of file diff --git a/src/Distributed/exchanger.jl b/src/Distributed/exchanger.jl deleted file mode 100644 index 2659ed04..00000000 --- a/src/Distributed/exchanger.jl +++ /dev/null @@ -1,72 +0,0 @@ -mutable struct Exchanger - @atomic done::Bool - ch::Channel - bottom::Base.Event - task::Task - @atomic err - - function Exchanger(f::F, arch::AbstractArchitecture, comm, rank, halo, border) where F - top = Base.Event(true) - bottom = Base.Event(true) - - send_buf = similar(border) - recv_buf = similar(halo) - this = new(false, top, bottom, nothing) - - has_neighbor = rank != MPI.PROC_NULL - compute_bc = !has_neighbor - - this.task = Threads.@spawn begin - set_device!(device(arch)) - KernelAbstractions.priority!(backend(arch), :high) - try - while !(@atomic this.done) - wait(top) - if has_neighbor - recv = MPI.Irecv!(recv_buf, comm; source=rank) - end - f(compute_bc) - if has_neighbor - copyto!(send_buf, border) - send = MPI.Isend(send_buf, comm; dest=rank) - cooperative_test!(recv) - copyto!(halo, recv_buf) - cooperative_test!(send) - end - notify(bottom) - end - catch err - @show err - @atomic this.done = true - @atomic this.err = err - end - end - errormonitor(this.task) - return this - end -end - -setdone!(exc::Exchanger) = @atomic exc.done = true - -Base.isdone(exc::Exchanger) = @atomic exc.done - -function Base.notify(exc::Exchanger) - if !(@atomic exc.done) - notify(exc.top) - else - error("notify: Exchanger is not running") - end -end -function Base.wait(exc::Exchanger) - if !(@atomic exc.done) - wait(exc.bottom) - else - error("wait: Exchanger is not running") - end -end - -get_recv_view(::Val{1}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? 1 : Colon(), Val(ndims(A)))...) -get_recv_view(::Val{2}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? size(A, D) : Colon(), Val(ndims(A)))...) - -get_send_view(::Val{1}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? 2 : Colon(), Val(ndims(A)))...) -get_send_view(::Val{2}, ::Val{D}, A) where D = view(A, ntuple(I -> I == D ? size(A, D) - 1 : Colon(), Val(ndims(A)))...) 
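For orientation, a minimal sketch (not part of the patch) of the index ranges that the new `get_send_view`/`get_recv_view` helpers in `src/Distributed/boundary_conditions.jl` select, assuming a 1-D parent array of length 8 with `halo_width = 1` and `overlap = 0`:

```julia
A = collect(1:8)  # parent array, one halo cell on each side
halo_width, overlap = 1, 0
recv_left  = 1:halo_width                        # 1:1, outermost halo cell on the left
recv_right = (length(A)-halo_width+1):length(A)  # 8:8, outermost halo cell on the right
send_left  = (overlap+halo_width+1):(overlap+2halo_width)                      # 2:2
send_right = (length(A)-overlap-2halo_width+1):(length(A)-overlap-halo_width)  # 7:7
view(A, send_left)  # buffer sent to the left neighbor; it lands in that neighbor's recv_right
```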
diff --git a/src/Distributed/gather.jl new file mode 100644 index 00000000..b52ed47c --- /dev/null +++ b/src/Distributed/gather.jl @@ -0,0 +1,42 @@ +""" + gather!(dst::Union{AbstractArray{T,N},Nothing}, src::AbstractArray{T,N}, comm::MPI.Comm; root=0) where {T,N} + +Gather local array `src` into a global array `dst`. +The size of the global array `size(dst)` must be equal to the product of the size of the local array `size(src)` and the dimensions of the Cartesian communicator `comm`. +The array will be gathered on the process with id `root` (`root=0` by default). +Note that the memory for the global array should be allocated only on the process with id `root`; on all other processes `dst` can be set to `nothing`. +""" +function gather!(dst::Union{AbstractArray{T,N},Nothing}, src::AbstractArray{T,N}, comm::MPI.Comm; root=0) where {T,N} + dims, _, _ = MPI.Cart_get(comm) + dims = Tuple(dims) + if MPI.Comm_rank(comm) == root + # make subtype for gather + offset = Tuple(0 for _ in 1:N) + subtype = MPI.Types.create_subarray(size(dst), size(src), offset, MPI.Datatype(eltype(dst))) + subtype = MPI.Types.create_resized(subtype, 0, size(src, 1) * Base.elsize(dst)) + MPI.Types.commit!(subtype) + # make VBuffer for collective communication + counts = fill(Cint(1), reverse(dims)) # gather one subarray from each MPI rank + displs = zeros(Cint, reverse(dims)) # reverse dims since MPI Cart comm is row-major + csizes = cumprod(size(src)[2:end] .* dims[1:end-1]) + strides = (1, csizes...) + for I in CartesianIndices(displs) + offset = reverse(Tuple(I - oneunit(I))) + displs[I] = sum(offset .* strides) + end + recvbuf = MPI.VBuffer(dst, vec(counts), vec(displs), subtype) + MPI.Gatherv!(src, recvbuf, comm; root) + else + MPI.Gatherv!(src, nothing, comm; root) + end + return +end + +""" + gather!(arch::Architecture{DistributedMPI}, dst, src::Field; kwargs...) + +Gather the interior of a field `src` into a global array `dst`. +""" +function gather!(arch::Architecture{DistributedMPI}, dst, src::Field; kwargs...) + gather!(dst, interior(src), cartesian_communicator(details(arch)); kwargs...) +end diff --git a/src/Distributed/split_ndrange.jl deleted file mode 100644 index 37c20128..00000000 --- a/src/Distributed/split_ndrange.jl +++ /dev/null @@ -1,27 +0,0 @@ -@inline subrange(nr,bw,I,::Val{1}) = 1:bw[I] -@inline subrange(nr,bw,I,::Val{2}) = (size(nr,I)-bw[I]+1):size(nr,I) -@inline subrange(nr,bw,I,::Val{3}) = (bw[I]+1):(size(nr,I)-bw[I]) - -@inline split_ndrange(ndrange,ndwidth) = split_ndrange(CartesianIndices(ndrange),ndwidth) - -function split_ndrange(ndrange::CartesianIndices{N},ndwidth::NTuple{N,<:Integer}) where N - @assert all(size(ndrange) .> ndwidth.*2) - @inline ndsubrange(I,::Val{J}) where J = ntuple(Val(N)) do idim - if idim < I - 1:size(ndrange,idim) - elseif idim == I - subrange(ndrange,ndwidth,idim,Val(J)) - else - subrange(ndrange,ndwidth,idim,Val(3)) - end - end - ndinner = ntuple(idim -> subrange(ndrange,ndwidth,idim,Val(3)), Val(N)) - return ntuple(Val(2N+1)) do i - if i == 2N+1 - ndrange[ndinner...] - else - idim,idir = divrem(i-1,2) .+ 1 - ndrange[ndsubrange(idim,Val(idir))...] - end - end -end \ No newline at end of file diff --git a/src/Distributed/topology.jl index e5a0c81c..f4a34a88 100644 --- a/src/Distributed/topology.jl +++ b/src/Distributed/topology.jl @@ -1,3 +1,8 @@ +""" + CartesianTopology + +Represents an N-dimensional Cartesian topology of distributed processes. 
+""" struct CartesianTopology{N} nprocs::Int dims::NTuple{N,Int} @@ -11,48 +16,147 @@ struct CartesianTopology{N} node_name::String end -function CartesianTopology(dims::NTuple{N,Int}; comm = MPI.COMM_WORLD) where {N} - nprocs = MPI.Comm_size(comm) - dims = Tuple(MPI.Dims_create(nprocs, dims)) - cart_comm = MPI.Cart_create(MPI.COMM_WORLD, dims) +""" + CartesianTopology(dims::NTuple{N,Int}; comm=MPI.COMM_WORLD) where {N} + +Create an N-dimensional Cartesian topology with dimensions `dims` and using base MPI communicator `comm` (by default, `comm=MPI.COMM_WORLD`). +If all entries in `dims` are not equal to `0`, the product of `dims` should be equal to the total number of MPI processes `MPI.Comm_size(comm)`. +If any (or all) entries of `dims` are `0`, the dimensions in the corresponding spatial directions will be picked automatically. +""" +function CartesianTopology(dims::NTuple{N,Int}; comm=MPI.COMM_WORLD) where {N} + nprocs = MPI.Comm_size(comm) + dims = Tuple(MPI.Dims_create(nprocs, dims)) + cart_comm = MPI.Cart_create(MPI.COMM_WORLD, dims) global_rank = MPI.Comm_rank(cart_comm) shared_comm = MPI.Comm_split_type(cart_comm, MPI.COMM_TYPE_SHARED, global_rank) shared_rank = MPI.Comm_rank(shared_comm) - node_name = MPI.Get_processor_name() + node_name = MPI.Get_processor_name() cart_coords = Tuple(MPI.Cart_coords(cart_comm)) neighbors = ntuple(Val(N)) do dim - MPI.Cart_shift(cart_comm, dim-1, 1) + MPI.Cart_shift(cart_comm, dim - 1, 1) end return CartesianTopology{N}(nprocs, dims, global_rank, shared_rank, cart_coords, neighbors, comm, cart_comm, shared_comm, node_name) end +""" + global_rank(t::CartesianTopology) + +Global id of a process in a Cartesian topology. +""" global_rank(t::CartesianTopology) = t.global_rank +""" + shared_rank(t::CartesianTopology) + +Local id of a process within a single node. Can be used to set the GPU device. +""" shared_rank(t::CartesianTopology) = t.shared_rank + +""" + node_name(t::CartesianTopology) + +Name of a node according to `MPI.Get_processor_name()`. +""" node_name(t::CartesianTopology) = t.node_name + +""" + cartesian_communicator(t::CartesianTopology) + +MPI Cartesian communicator for the topology. +""" cartesian_communicator(t::CartesianTopology) = t.cart_comm +""" + shared_communicator(t::CartesianTopology) + +MPI communicator for the processes sharing the same node. +""" shared_communicator(t::CartesianTopology) = t.shared_comm + +""" + dimensions(t::CartesianTopology) + +Dimensions of the topology as NTuple. +""" dimensions(t::CartesianTopology) = t.dims +""" + coordinates(t::CartesianTopology) + +Coordinates of a current process within a Cartesian topology. +""" coordinates(t::CartesianTopology) = t.cart_coords +""" + neighbors(t::CartesianTopology) + +Neighbors of a current process. + +Returns NTuple containing process ids of the two immediate neighbors in each spatial direction, or MPI.PROC_NULL if no neighbor on a corresponding side. +""" neighbors(t::CartesianTopology) = t.neighbors +""" + neighbor(t::CartesianTopology, dim, side) + +Returns id of a neighbor process in spatial direction `dim` on the side `side`, if this neighbor exists, or MPI.PROC_NULL otherwise. +""" +neighbor(t::CartesianTopology, dim, side) = t.neighbors[dim][side] + + +""" + has_neighbor(t::CartesianTopology, dim, side) + +Returns true if there a neighbor process in spatial direction `dim` on the side `side`, or false otherwise. 
+""" +has_neighbor(t::CartesianTopology, dim, side) = t.neighbors[dim][side] != MPI.PROC_NULL + +""" + global_size(t::CartesianTopology) + +Total number of processes withing the topology. +""" global_size(t::CartesianTopology) = MPI.Comm_size(t.cart_comm) + +""" + node_size(t::CartesianTopology) + +Number of processes sharing the same node. +""" node_size(t::CartesianTopology) = MPI.Comm_size(t.shared_comm) -global_grid_size(t::CartesianTopology, local_size) = t.dims .* local_size +""" + global_grid_size(t::CartesianTopology, local_size) +Return the global size for a structured grid. +""" +global_grid_size(t::CartesianTopology, local_size) = t.dims .* local_size + + +""" + local_grid(g::CartesianGrid, t::CartesianTopology) + +Return a `CartesianGrid` covering the subdomain which corresponds to the current process. +""" function local_grid(g::CartesianGrid, t::CartesianTopology) local_extent = extent(g) ./ t.dims - local_size = size(g) .÷ t.dims + local_size = size(g) .÷ t.dims local_origin = origin(g) .+ local_extent .* t.cart_coords return CartesianGrid(local_origin, local_extent, local_size) -end \ No newline at end of file +end + +""" + Architecture(backend::Backend, topology::CartesianTopology) where {N} + +Create a distributed Architecture using `backend` and `topology`. For GPU backends, device will be selected automatically based on a process id within a node. +""" +function Architectures.Architecture(backend::Backend, topology::CartesianTopology) + device = get_device(backend, shared_rank(topology)+1) + return Architecture{DistributedMPI,typeof(backend),typeof(device),typeof(topology)}(backend, device, topology) +end diff --git a/src/FastIce.jl b/src/FastIce.jl index 72fb62ca..e1491a03 100644 --- a/src/FastIce.jl +++ b/src/FastIce.jl @@ -1,19 +1,18 @@ module FastIce -using KernelAbstractions - +# core modules include("Grids/Grids.jl") +include("GridOperators.jl") +include("Logging.jl") +include("Architectures.jl") +include("Fields/Fields.jl") +include("Utils/Utils.jl") +include("BoundaryConditions/BoundaryConditions.jl") +include("KernelLaunch.jl") +include("Distributed/Distributed.jl") +include("Physics.jl") -include("grid_operators.jl") -include("logging.jl") -include("fields.jl") -include("utils.jl") - -include("physics.jl") - -include("BoundaryConditions/boundary_conditions.jl") +# ice flow models include("Models/models.jl") -# include("Distributed/distributed.jl") - end # module diff --git a/src/fields.jl b/src/Fields/Fields.jl similarity index 74% rename from src/fields.jl rename to src/Fields/Fields.jl index c2deab90..47f88dd1 100644 --- a/src/fields.jl +++ b/src/Fields/Fields.jl @@ -2,13 +2,18 @@ module Fields export AbstractField export Field, interior +export FunctionField export location, data, halo, set! using Adapt using OffsetArrays using KernelAbstractions -import FastIce.Grids: Location, CartesianGrid, coord +using FastIce.Grids +using FastIce.GridOperators +using FastIce.Architectures + +import LinearAlgebra abstract type AbstractField{T,N,L} <: AbstractArray{T,N} end @@ -44,6 +49,8 @@ function Field(backend::Backend, grid::CartesianGrid, T::DataType, loc::L, halo: return Field{L}(data, halo, indices) end +Field(arch::Architecture, args...; kwargs...) = Field(backend(arch), args...; kwargs...) 
+ expand_axis_halo(::Nothing) = (0, 0) expand_axis_halo(halo::Integer) = (halo, halo) expand_axis_halo(halo::Tuple) = (something(halo[1], 0), something(halo[2], 0)) @@ -53,8 +60,8 @@ expand_halo(::Val{N}, halo::Tuple) where {N} = ntuple(I -> expand_axis_halo(halo expand_halo(::Val{N}, halo::Integer) where {N} = ntuple(I -> (halo, halo), Val(N)) expand_halo(::Val{N}, halo::Nothing) where {N} = ntuple(I -> (0, 0), Val(N)) -expand_loc(::Val{N}, loc::NTuple{N, Location}) where N = loc -expand_loc(::Val{N}, loc::Location) where N = ntuple(_ -> loc, Val(N)) +expand_loc(::Val{N}, loc::NTuple{N,Location}) where {N} = loc +expand_loc(::Val{N}, loc::Location) where {N} = ntuple(_ -> loc, Val(N)) function Field(backend::Backend, grid::CartesianGrid, loc::L, T::DataType=eltype(grid); halo::H=nothing) where {L,H} N = ndims(grid) @@ -62,10 +69,10 @@ function Field(backend::Backend, grid::CartesianGrid, loc::L, T::DataType=eltype end Base.checkbounds(f::Field, I...) = checkbounds(f.data, I...) -Base.checkbounds(f::Field, I::Union{CartesianIndex, AbstractArray{<:CartesianIndex}}) = checkbounds(f.data, I) +Base.checkbounds(f::Field, I::Union{CartesianIndex,AbstractArray{<:CartesianIndex}}) = checkbounds(f.data, I) Base.checkbounds(::Type{Bool}, f::Field, I...) = checkbounds(Bool, f.data, I...) -Base.checkbounds(::Type{Bool}, f::Field, I::Union{CartesianIndex, AbstractArray{<:CartesianIndex}}) = checkbounds(Bool, f.data, I) +Base.checkbounds(::Type{Bool}, f::Field, I::Union{CartesianIndex,AbstractArray{<:CartesianIndex}}) = checkbounds(Bool, f.data, I) Base.size(f::Field) = length.(f.indices) Base.parent(f::Field) = parent(f.data) @@ -76,8 +83,8 @@ Base.view(f::Field, I...) = view(f.data, I...) data(f::Field) = f.data halo(f::Field) = f.halo -halo(f::Field, dim) = f.halo[dim] -halo(f::Field, dim, side) = f.halo[dim][side] +halo(f::Field, dim::Integer) = f.halo[dim] +halo(f::Field, dim::Integer, side::Integer) = f.halo[dim][side] Adapt.adapt_structure(to, f::Field{T,N,L}) where {T,N,L} = Field{L}(Adapt.adapt(to, f.data), f.halo, f.indices) @@ -91,7 +98,7 @@ Base.@propagate_inbounds Base.lastindex(f::Field, dim) = lastindex(f.data, dim) function interior_indices(f::Field) return ntuple(ndims(f)) do I - (firstindex(parent(f), I) + f.halo[I][1]):(lastindex(parent(f), I) - f.halo[I][2]) + (firstindex(parent(f), I)+f.halo[I][1]):(lastindex(parent(f), I)-f.halo[I][2]) end end @@ -103,14 +110,14 @@ set!(f::Field, other::Field) = (copy!(interior(f), interior(other)); nothing) set!(f::Field{T}, val::T) where {T<:Number} = (fill!(interior(f), val); nothing) set!(f::Field, A::AbstractArray) = (copy!(interior(f), A); nothing) -@kernel function _set_continuous!(dst, grid, loc, fun, args...) +@kernel function _set_continuous!(dst, grid, loc, fun::F, args...) where {F} I = @index(Global, Cartesian) dst[I] = fun(coord(grid, loc, I)..., args...) end -@kernel function _set_discrete!(dst, grid, loc, fun, args...) +@kernel function _set_discrete!(dst, grid, loc, fun::F, args...) where {F} I = @index(Global, Cartesian) - dst[I] = fun(grid, loc, Tuple(I)..., args...) + dst[I] = fun(grid, loc, I, args...) 
end function set!(f::Field{T,N}, grid::CartesianGrid{N}, fun::F; discrete=false, parameters=()) where {T,F,N} @@ -124,4 +131,15 @@ function set!(f::Field{T,N}, grid::CartesianGrid{N}, fun::F; discrete=false, par return end -end \ No newline at end of file +# grid operators +Base.@propagate_inbounds ∂(f::AbstractField, I, dim) = ∂(f, I, dim, location(f, dim)) +Base.@propagate_inbounds ∂(f::AbstractField, I, dim, ::Vertex) = ∂ᶜ(f, I, dim) +Base.@propagate_inbounds ∂(f::AbstractField, I, dim, ::Center) = ∂ᵛ(f, I, dim) + +# field norm +LinearAlgebra.norm(f::Field) = LinearAlgebra.norm(interior(f)) +LinearAlgebra.norm(f::Field, p::Real) = LinearAlgebra.norm(interior(f), p) + +include("function_field.jl") + +end diff --git a/src/Fields/function_field.jl b/src/Fields/function_field.jl new file mode 100644 index 00000000..e522583e --- /dev/null +++ b/src/Fields/function_field.jl @@ -0,0 +1,46 @@ +struct Continuous end +struct Discrete end + +struct FunctionField{T,N,L,CD,F,G,P} <: AbstractField{T,N,L} + func::F + grid::G + parameters::P + function FunctionField{CD,L}(func::F, grid::G, parameters::P) where {CD,L,F,G,P} + N = ndims(grid) + T = eltype(grid) + return new{T,N,L,CD,F,G,P}(func, grid, parameters) + end +end + +function FunctionField(func::F, grid::G, loc; discrete=false, parameters=nothing) where {F,G} + loc = expand_loc(Val(ndims(grid)), loc) + L = typeof(loc) + CD = discrete ? Discrete : Continuous + return FunctionField{CD,L}(func, grid, parameters) +end + +Base.size(f::FunctionField) = size(f.grid, location(f)) + +@inline func_type(::FunctionField{T,N,L,CD}) where {T,N,L,CD} = CD + +@inline _params(::Nothing) = () +@inline _params(p) = p + +Base.@propagate_inbounds function call_func(func::F, grid, loc, I, params, ::Type{Continuous}) where {F} + func(coord(grid, loc, I)..., params...) +end + +Base.@propagate_inbounds function call_func(func::F, grid, loc, I, params, ::Type{Discrete}) where {F} + func(grid, loc, I, params...) +end + +Base.@propagate_inbounds Base.getindex(f::FunctionField{T,N}, inds::Vararg{Integer,N}) where {T,N} = getindex(f, CartesianIndex(inds)) +Base.@propagate_inbounds function Base.getindex(f::FunctionField{T,N}, I::CartesianIndex{N}) where {T,N} + call_func(f.func, f.grid, location(f), I, _params(f.parameters), func_type(f)) +end + +function Adapt.adapt_structure(to, f::FunctionField{T,N,L,CD}) where {T,N,L,CD} + FunctionField{CD,L}(Adapt.adapt(to, f.func), + Adapt.adapt(to, f.grid), + Adapt.adapt(to, f.parameters)) +end diff --git a/src/grid_operators.jl b/src/GridOperators.jl similarity index 91% rename from src/grid_operators.jl rename to src/GridOperators.jl index 67b1d7c7..183fb1c1 100644 --- a/src/grid_operators.jl +++ b/src/GridOperators.jl @@ -7,7 +7,9 @@ export ∂ᶜx, ∂ᶜy, ∂ᶜz, avᶜx, avᶜy, avᶜz, avᶜxy, avᶜxz, av import Base.@propagate_inbounds -Base.@assume_effects :foldable δ(op, I::CartesianIndex{N}, ::Val{D}) where {N,D} = ntuple(i -> i == D ? op(I[i], 1) : I[i], Val(N)) |> CartesianIndex +Base.@assume_effects :foldable function δ(op, I::CartesianIndex{N}, ::Val{D}) where {N,D} + ntuple(i -> i == D ? 
op(I[i], 1) : I[i], Val(N)) |> CartesianIndex +end Base.@assume_effects :foldable function δ(op, I::CartesianIndex{N}, ::Val{D1}, ::Val{D2}) where {N,D1,D2} δI = ntuple(Val(N)) do i @@ -54,7 +56,7 @@ for (dim, val1, val2) in ((:xy, 1, 2), (:xz, 1, 3), (:yz, 2, 3)) @eval begin @propagate_inbounds $av(f, I) = $avl(f, I, Val($val1), Val($val2)) end - end + end end -end \ No newline at end of file +end diff --git a/src/Grids/Grids.jl b/src/Grids/Grids.jl index 0540f959..e37785fc 100644 --- a/src/Grids/Grids.jl +++ b/src/Grids/Grids.jl @@ -15,4 +15,4 @@ struct Vertex <: Location end include("discrete_axis.jl") include("cartesian_grid.jl") -end \ No newline at end of file +end diff --git a/src/Grids/cartesian_grid.jl b/src/Grids/cartesian_grid.jl index 8adc6670..3f13220a 100644 --- a/src/Grids/cartesian_grid.jl +++ b/src/Grids/cartesian_grid.jl @@ -1,11 +1,31 @@ +""" +Rectilinear grid with uniform spacing. + +# Examples + +```jldoctest +julia> grid = CartesianGrid(origin=(0.0,0.0), extent=(1.0,1.0), size=(4,4)) +2D 4×4 CartesianGrid{Float64}: + x ∈ [0.0–1.0]; Δx = 0.25 + y ∈ [0.0–1.0]; Δy = 0.25 +``` +""" struct CartesianGrid{N,T<:AbstractFloat,I<:Integer} axes::NTuple{N,DiscreteAxis{T,I}} end -CartesianGrid(origin::NTuple{N,T}, extent::NTuple{N,T}, size::NTuple{N,I}) where {N,T,I} = CartesianGrid(DiscreteAxis.(origin, extent, size)) +function CartesianGrid(origin::NTuple{N,T}, extent::NTuple{N,T}, size::NTuple{N,I}) where {N,T,I} + CartesianGrid(DiscreteAxis.(origin, extent, size)) +end + +""" + CartesianGrid(origin::NTuple{N,T}, extent::NTuple{N,T}, size::NTuple{N,I}) +Create a Cartesian grid with a specified origin (bottom-south-west corner in 3D), spatial extent, and number of grid cells. +""" CartesianGrid(; origin::NTuple{N,T}, extent::NTuple{N,T}, size::NTuple{N,I}) where {N,T,I} = CartesianGrid(origin, extent, size) +# overload methods from Base @pure Base.eltype(::CartesianGrid{N,T}) where {N,T} = T @pure Base.ndims(::CartesianGrid{N}) where {N} = N @@ -17,11 +37,49 @@ Base.size(grid::CartesianGrid) = length.(grid.axes) Base.size(grid::CartesianGrid{N}, locs::NTuple{N,Location}) where {N} = length.(grid.axes, locs) Base.size(grid::CartesianGrid, loc::Location) = ntuple(D -> length(grid.axes[D], loc), Val(ndims(grid))) +# pretty-printing +function Base.show(io::IO, grid::CartesianGrid{N,T}) where {N,T} + print(io, "$(N)D $(join(size(grid), "×")) CartesianGrid{$T}:\n") + symbols = (:x, :y, :z) + for idim in 1:N + l, r = origin(grid, idim), origin(grid, idim) + extent(grid, idim) + print(io, " $(symbols[idim]) ∈ [$(l)–$(r)]; Δ$(symbols[idim]) = $(spacing(grid, idim))\n") + end +end + +""" + axis(grid::CartesianGrid, dim::Integer) + +Return a `DiscreteAxis` corresponding to the spatial dimension `dim`. +""" axis(grid::CartesianGrid, dim::Integer) = grid.axes[dim] +""" + origin(grid::CartesianGrid) + +Return the origin of a `CartesianGrid`. The origin corresponds to bottom-south-west corner of the grid in 3D. +""" origin(grid::CartesianGrid) = origin.(grid.axes) + +""" + extent(grid::CartesianGrid) + +Return the spatial extent of a `CartesianGrid` as a tuple. +""" extent(grid::CartesianGrid) = extent.(grid.axes) + +""" + spacing(grid::CartesianGrid) + +Return the spacing of a `CartesianGrid` as a tuple. +""" spacing(grid::CartesianGrid) = spacing.(grid.axes) + +""" + Δ(grid::CartesianGrid) + +Return the spacing of a `CartesianGrid` as a tuple. Same as `spacing`. 
+""" Δ(grid::CartesianGrid) = spacing(grid) @propagate_inbounds origin(grid::CartesianGrid, dim::Integer) = origin(grid.axes[dim]) @@ -29,8 +87,15 @@ spacing(grid::CartesianGrid) = spacing.(grid.axes) @propagate_inbounds spacing(grid::CartesianGrid, dim::Integer) = spacing(grid.axes[dim]) @propagate_inbounds Δ(grid::CartesianGrid, dim::Integer) = spacing(grid.axes[dim]) -@propagate_inbounds coord(grid::CartesianGrid{N}, loc::Location, inds::NTuple{N}) where {N} = coord.(grid.axes, Ref(loc), inds) +""" + coord(grid::CartesianGrid{N}, loc::NTuple{N,Location}, inds::NTuple{N}) + +Return a tuple of spatial coordinates of a grid point at location `loc` and indices `inds`. + +For vertex-centered locations, first grid point is at the origin. +For cell-centered locations, first grid point at half-spacing distance from the origin. +""" @propagate_inbounds function coord(grid::CartesianGrid{N}, loc::NTuple{N,Location}, inds::NTuple{N}) where {N} ntuple(Val(N)) do I Base.@_inline_meta @@ -38,12 +103,20 @@ spacing(grid::CartesianGrid) = spacing.(grid.axes) end end +@propagate_inbounds coord(grid::CartesianGrid{N}, loc::Location, inds::NTuple{N}) where {N} = coord.(grid.axes, Ref(loc), inds) + coord(grid::CartesianGrid{N}, loc, I::CartesianIndex{N}) where {N} = coord(grid, loc, Tuple(I)) @propagate_inbounds coord(grid::CartesianGrid, loc::Location, dim::Integer, i::Integer) = coord(grid.axes[dim], loc, i) -@propagate_inbounds coord(grid::CartesianGrid{N}, loc::Location, dim::Integer, inds::NTuple{N}) where {N} = coord(grid.axes[dim], loc, inds[dim]) -@propagate_inbounds coord(grid::CartesianGrid{N}, locs::NTuple{N,Location}, dim::Integer, inds::NTuple{N}) where {N} = coord(grid.axes[dim], locs[dim], inds[dim]) -@propagate_inbounds coord(grid::CartesianGrid{N}, locs::NTuple{N,Location}, dim::Integer, i::Integer) where {N} = coord(grid.axes[dim], locs[dim], i) +@propagate_inbounds function coord(grid::CartesianGrid{N}, loc::Location, dim::Integer, inds::NTuple{N}) where {N} + coord(grid.axes[dim], loc, inds[dim]) +end +@propagate_inbounds function coord(grid::CartesianGrid{N}, locs::NTuple{N,Location}, dim::Integer, inds::NTuple{N}) where {N} + coord(grid.axes[dim], locs[dim], inds[dim]) +end +@propagate_inbounds function coord(grid::CartesianGrid{N}, locs::NTuple{N,Location}, dim::Integer, i::Integer) where {N} + coord(grid.axes[dim], locs[dim], i) +end @propagate_inbounds coord(grid::CartesianGrid, loc, ::Val{D}, i) where {D} = coord(grid, loc, D, i) @@ -111,4 +184,4 @@ end xvertices(grid::CartesianGrid; kwargs...) = vertices(grid, Val(1); kwargs...) yvertices(grid::CartesianGrid; kwargs...) = vertices(grid, Val(2); kwargs...) -zvertices(grid::CartesianGrid; kwargs...) = vertices(grid, Val(3); kwargs...) \ No newline at end of file +zvertices(grid::CartesianGrid; kwargs...) = vertices(grid, Val(3); kwargs...) diff --git a/src/Grids/discrete_axis.jl b/src/Grids/discrete_axis.jl index c1d763d9..87e1e728 100644 --- a/src/Grids/discrete_axis.jl +++ b/src/Grids/discrete_axis.jl @@ -1,3 +1,6 @@ +""" +Discretised one-dimensional interval. Grid type `CartesianGrid` is defined as a tuple of `DiscreteAxis`'s. 
+""" struct DiscreteAxis{T<:AbstractFloat,I<:Integer} <: AbstractRange{T} origin::T extent::T @@ -6,6 +9,8 @@ struct DiscreteAxis{T<:AbstractFloat,I<:Integer} <: AbstractRange{T} DiscreteAxis(origin::T, extent::T, len::I) where {T,I} = new{T,I}(origin, extent, extent / len, len) end +# overload methods from Base + import Base.@pure import Base.@propagate_inbounds @@ -53,4 +58,4 @@ end end centers(ax::DiscreteAxis; kwargs...) = coords(ax, Center(); kwargs...) -vertices(ax::DiscreteAxis; kwargs...) = coords(ax, Vertex(); kwargs...) \ No newline at end of file +vertices(ax::DiscreteAxis; kwargs...) = coords(ax, Vertex(); kwargs...) diff --git a/src/KernelLaunch.jl b/src/KernelLaunch.jl new file mode 100644 index 00000000..e25fa6e2 --- /dev/null +++ b/src/KernelLaunch.jl @@ -0,0 +1,87 @@ +module KernelLaunch + +using FastIce.Grids +using FastIce.Architectures +using FastIce.BoundaryConditions + +using KernelAbstractions + +export launch! + +""" + launch!(arch::Architecture, grid::CartesianGrid, kernel::Pair{K,Args}; ) where {K,Args} + +Launch a KernelAbstraction `kernel` on a `grid` using the backend from `arch`. +Either `worksize` or `location` must be provided as keyword arguments. + +# Keyword Arguments +- `worksize`: worksize of a kernel, i.e. how many grid points are included in each spatial direction. +- `location[=nothing]`: compute worksize as a size of the grid at a specified location. + If only one location is provided, e.g. `location=Vertex()`, then this location will be used for all spacial directions. +- `offset[=nothing]`: index offset for all grid indices as a `CartesianIndex`. +- `expand[=nothing]`: if provided, the worksize is increased by `2*expand`, and offset is set to `-expand`, or combined with user-provided offset. +- `hide_boundaries[=nothing]`: instance of `HideBoundaries`, that will be used to overlap boundary processing with computations at inner points of the domain. +- `outer_width[=nothing]`: if `hide_boundaries` is specified, used to determine the decomposition of the domain into inner and outer regions. +- `boundary_conditions[=nothing]`: a tuple of boundary condition batches for each side of every spatial direction. +- `async[=true]`: if set to `false`, will block the host until the kernel is finished executing. 
+""" +function launch!(arch::Architecture, grid::CartesianGrid, kernel::Pair{K,Args}; + worksize=nothing, + location=nothing, + offset=nothing, + expand=nothing, + boundary_conditions=nothing, + hide_boundaries=nothing, + outer_width=nothing, + async=true) where {K,Args} + fun, args = kernel + + if isnothing(location) && isnothing(worksize) + throw(ArgumentError("either grid location or worksize must me specified")) + end + + if !isnothing(location) && !isnothing(worksize) + throw(ArgumentError("either grid location or worksize must me specified, but not both")) + end + + if isnothing(worksize) + worksize = size(grid, location) + end + + if !isnothing(expand) + if !isnothing(offset) + offset -= oneunit(CartesianIndex{ndims(grid)}) + else + offset = -oneunit(CartesianIndex{ndims(grid)}) + end + worksize = worksize .+ 2 .* expand + end + + groupsize = heuristic_groupsize(arch, Val(length(worksize))) + + if isnothing(hide_boundaries) + fun(backend(arch), groupsize, worksize)(args..., offset) + isnothing(boundary_conditions) || apply_all_boundary_conditions!(arch, grid, boundary_conditions) + else + hide(hide_boundaries, arch, grid, boundary_conditions, worksize; outer_width) do indices + sub_offset, ndrange = first(indices) - oneunit(first(indices)), size(indices) + if !isnothing(offset) + sub_offset += offset + end + fun(backend(arch), groupsize)(args..., sub_offset; ndrange) + end + end + + async || KernelAbstractions.synchronize(backend(arch)) + return +end + +function apply_all_boundary_conditions!(arch::Architecture, grid::CartesianGrid{N}, boundary_conditions) where {N} + ntuple(Val(N)) do D + apply_boundary_conditions!(Val(1), Val(D), arch, grid, boundary_conditions[D][1]) + apply_boundary_conditions!(Val(2), Val(D), arch, grid, boundary_conditions[D][2]) + end + return +end + +end diff --git a/src/logging.jl b/src/Logging.jl similarity index 61% rename from src/logging.jl rename to src/Logging.jl index f3b7eebb..b2d81aa4 100644 --- a/src/logging.jl +++ b/src/Logging.jl @@ -11,14 +11,14 @@ struct MPILogger{B<:AbstractLogger} <: AbstractLogger base_logger::B end -function handle_message(l::MPILogger,args...;kwargs...) +function handle_message(l::MPILogger, args...; kwargs...) if MPI.Comm_rank(l.comm) == l.rank - handle_message(l.base_logger,args...;kwargs...) + handle_message(l.base_logger, args...; kwargs...) end end -shouldlog(l::MPILogger,args...) = (MPI.Comm_rank(l.comm) == l.rank) && shouldlog(l.base_logger,args...) +shouldlog(l::MPILogger, args...) = (MPI.Comm_rank(l.comm) == l.rank) && shouldlog(l.base_logger, args...) min_enabled_level(l::MPILogger) = min_enabled_level(l.base_logger) -end \ No newline at end of file +end diff --git a/src/Models/full_stokes/isothermal/boundary_conditions.jl b/src/Models/full_stokes/isothermal/boundary_conditions.jl index c3531cc5..4aeb6a3f 100644 --- a/src/Models/full_stokes/isothermal/boundary_conditions.jl +++ b/src/Models/full_stokes/isothermal/boundary_conditions.jl @@ -1,15 +1,18 @@ struct Traction end struct Velocity end -struct Slip end +struct Slip end -struct BoundaryCondition{Kind, N, C} +""" +Boundary condition for Stokes problem. +""" +struct BoundaryCondition{Kind,N,C} components::C - BoundaryCondition{Kind}(components::Tuple) where {Kind} = new{Kind, length(components), typeof(components)}(components) + BoundaryCondition{Kind}(components::Tuple) where {Kind} = new{Kind,length(components),typeof(components)}(components) end BoundaryCondition{Kind}(components...) 
where {Kind} = BoundaryCondition{Kind}(components) -const _COORDINATES = ((:x, 1), (:y, 2), (:z, 3)) +const _COORDINATES = ((:x, 1), (:y, 2), (:z, 3)) for (dim, val) in _COORDINATES td = remove_dim(Val(val), _COORDINATES) @@ -20,7 +23,7 @@ for (dim, val) in _COORDINATES N = Symbol(:τ, dim, dim) ex1 = Expr(:(=), :Pr, :(DirichletBC{HalfCell}(bc.components[$val]))) - ex2 = Expr(:(=), N, :(DirichletBC{HalfCell}(convert(eltype(bc.components[$val]), 0)))) + ex2 = Expr(:(=), N, :(DirichletBC{HalfCell}(convert(eltype(bc.components[$val]), 0)))) ex3 = ntuple(Val(length(TN))) do I Expr(:(=), TN[I], :(DirichletBC{FullCell}(bc.components[$(td[I][2])]))) end @@ -39,75 +42,110 @@ for (dim, val) in _COORDINATES ex_slip_vel = Expr(:(=), Symbol(:V, dim), :(DirichletBC{FullCell}(bc.components[$val]))) ex_slip_vel = Expr(:tuple, ex_slip_vel) - @eval begin - function extract_boundary_conditions(::Val{$val}, bc::BoundaryCondition{Traction}) - return $ex_tr, NamedTuple() - end - - function extract_boundary_conditions(::Val{$val}, bc::BoundaryCondition{Velocity}) - return NamedTuple(), $ex_vel - end - - function extract_boundary_conditions(::Val{$val}, bc::BoundaryCondition{Slip}) - return $ex_slip_tr, $ex_slip_vel - end + ex_res_vel = ntuple(length(_COORDINATES)) do I + kind = I == val ? :(FullCell) : :(HalfCell) + Expr(:(=), Symbol(:r_V, _COORDINATES[I][1]), :(DirichletBC{$kind}(0.0))) end -end - -no_bcs(names) = NamedTuple(f => nothing for f in names) -unique_names(a, b) = Tuple(unique(tuple(a..., b...))) + ex_res_vel = Expr(:tuple, ex_res_vel...) -function expand_boundary_conditions(left, right) - names = unique_names(keys(left), keys(right)) + ex_res_slip_vel = Expr(:(=), Symbol(:r_V, dim), :(DirichletBC{FullCell}(0.0))) + ex_res_slip_vel = Expr(:tuple, ex_res_slip_vel) - if isempty(names) - return nothing - end + @eval begin + extract_stress_bc(::Val{$val}, bc::BoundaryCondition{Traction}) = $ex_tr + extract_stress_bc(::Val{$val}, bc::BoundaryCondition{Velocity}) = () + extract_stress_bc(::Val{$val}, bc::BoundaryCondition{Slip}) = $ex_slip_tr - default = no_bcs(names) - left = merge(default, left) - right = merge(default, right) + extract_velocity_bc(::Val{$val}, bc::BoundaryCondition{Traction}) = () + extract_velocity_bc(::Val{$val}, bc::BoundaryCondition{Velocity}) = $ex_vel + extract_velocity_bc(::Val{$val}, bc::BoundaryCondition{Slip}) = $ex_slip_vel - return NamedTuple{names}(zip(left, right)) + extract_residuals_vel_bc(::Val{$val}, bc::BoundaryCondition{Traction}) = () + extract_residuals_vel_bc(::Val{$val}, bc::BoundaryCondition{Velocity}) = $ex_res_vel + extract_residuals_vel_bc(::Val{$val}, bc::BoundaryCondition{Slip}) = $ex_res_slip_vel + end end -function make_field_boundary_conditions(bcs) - ordering = ( - (:west , :east), - (:south , :north), - (:bottom, :top), - ) - - field_bcs = ntuple(Val(length(ordering))) do dim - left, right = bcs[ordering[dim]] - - left = extract_boundary_conditions(Val(dim), left) - right = extract_boundary_conditions(Val(dim), right) - - stress, velocity = Tuple(zip(left, right)) +function make_batch(bcs::NamedTuple, fields::NamedTuple) + batch_fields = Tuple(fields[name] for name in eachindex(bcs)) + return BoundaryConditionsBatch(batch_fields, values(bcs)) +end - stress = expand_boundary_conditions(stress...) - velocity = expand_boundary_conditions(velocity...) 
+function make_stress_bc(arch::Architecture{Kind}, ::CartesianGrid{N}, fields, bc) where {Kind,N} + ordering = (:x, :y, :z) + ntuple(Val(N)) do D + ntuple(Val(2)) do S + if (Kind == Distributed.DistributedMPI) && has_neighbor(details(arch), D, S) + nothing + else + new_bc = extract_stress_bc(Val(D), bc[ordering[D]][S]) + if isempty(new_bc) + nothing + else + make_batch(new_bc, fields) + end + end + end + end +end - stress, velocity +function make_velocity_bc(arch::Architecture{Kind}, ::CartesianGrid{N}, fields::NamedTuple{names}, bc) where {Kind,N,names} + ordering = (:x, :y, :z) + ntuple(Val(N)) do D + ntuple(Val(2)) do S + if (Kind == Distributed.DistributedMPI) && has_neighbor(details(arch), D, S) + new_bc = NamedTuple{names}(ExchangeInfo(Val(S), Val(D), V) for V in fields) + make_batch(new_bc, fields) + else + new_bc = extract_velocity_bc(Val(D), bc[ordering[D]][S]) + if isempty(new_bc) + nothing + else + make_batch(new_bc, fields) + end + end + end end +end - return NamedTuple{(:stress, :velocity)}(zip(field_bcs...)) +function make_residuals_bc(arch::Architecture{Kind}, ::CartesianGrid{N}, fields::NamedTuple{names}, bc) where {Kind,N,names} + ordering = (:x, :y, :z) + ntuple(Val(N)) do D + ntuple(Val(2)) do S + if (Kind == Distributed.DistributedMPI) && has_neighbor(details(arch), D, S) + nothing + else + if !(bc[ordering[D]][S] isa BoundaryCondition{Traction}) + Vn = Symbol(:r_V, ordering[D]) + BoundaryConditionsBatch((fields[Vn], ), (DirichletBC{FullCell}(0.0), )) + else + nothing + end + end + end + end end -function _apply_bcs!(backend, grid, fields, bcs) - field_map = (Pr = fields.Pr, - τxx = fields.τ.xx, τyy = fields.τ.yy, τzz = fields.τ.zz, - τxy = fields.τ.xy, τxz = fields.τ.xz, τyz = fields.τ.yz, - Vx = fields.V.x , Vy = fields.V.y , Vz = fields.V.z) - - ntuple(Val(length(bcs))) do D - if !isnothing(bcs[D]) - fs = Tuple( field_map[f] for f in eachindex(bcs[D]) ) - apply_bcs!(Val(D), backend, grid, fs, values(bcs[D])) +function make_rheology_bc(arch::Architecture{Kind}, ::CartesianGrid{N}, η) where {Kind,N} + ntuple(Val(N)) do D + ntuple(Val(2)) do S + if (Kind == Distributed.DistributedMPI) && has_neighbor(details(arch), D, S) + BoundaryConditionsBatch((η,), (ExchangeInfo(Val(S), Val(D), η),)) + else + BoundaryConditionsBatch((η,), (ExtrapolateBC(),)) + end end end +end + +function make_field_boundary_conditions(arch::Architecture, grid::CartesianGrid, fields, bc) + stress_fields = (; Pr=fields.Pr, NamedTuple{Symbol.(:τ, keys(fields.τ))}(values(fields.τ))...) + velocity_fields = NamedTuple{Symbol.(:V, keys(fields.V))}(values(fields.V)) + residual_fields = NamedTuple{Symbol.(:r_V, keys(fields.r_V))}(values(fields.r_V)) - return + return (stress = make_stress_bc(arch, grid, stress_fields, bc), + velocity = make_velocity_bc(arch, grid, velocity_fields, bc), + rheology = make_rheology_bc(arch, grid, fields.η), + residual = make_residuals_bc(arch, grid, residual_fields, bc)) end diff --git a/src/Models/full_stokes/isothermal/isothermal.jl b/src/Models/full_stokes/isothermal/isothermal.jl index 20ead3e2..f76993de 100644 --- a/src/Models/full_stokes/isothermal/isothermal.jl +++ b/src/Models/full_stokes/isothermal/isothermal.jl @@ -1,60 +1,87 @@ module Isothermal export BoundaryCondition, Traction, Velocity, Slip -export IsothermalFullStokesModel, advance_iteration!, advance_timestep! +export IsothermalFullStokesModel, advance_iteration!, advance_timestep!, compute_residuals! 
+using FastIce.Architectures using FastIce.Physics using FastIce.Grids using FastIce.Fields using FastIce.BoundaryConditions using FastIce.Utils +using FastIce.KernelLaunch +using FastIce.Distributed include("kernels.jl") include("boundary_conditions.jl") -function default_physics(::Type{T}) where T - return ( - equation_of_state=default(IncompressibleIceEOS{T}), - thermal_properties=default(IceThermalProperties{T}), - rheology=default(GlensLawRheology{Int64}) - ) +function default_physics(::Type{T}) where {T} + return (equation_of_state=default(IncompressibleIceEOS{T}), + thermal_properties=default(IceThermalProperties{T}), + rheology=default(GlensLawRheology{Int64})) end -struct IsothermalFullStokesModel{Backend,Grid,BC,Physics,IterParams,Fields} - backend::Backend +struct IsothermalFullStokesModel{Arch,Grid,BC,HB,OW,Physics,Gravity,IterParams,Fields} + arch::Arch grid::Grid boundary_conditions::BC + hide_boundaries::HB + outer_width::OW physics::Physics + gravity::Gravity iter_params::IterParams fields::Fields end -function make_fields_mechanics(backend, grid) - return ( - Pr = Field(backend, grid, Center(); halo=1), - τ = ( - xx = Field(backend, grid, Center(); halo=1), - yy = Field(backend, grid, Center(); halo=1), - zz = Field(backend, grid, Center(); halo=1), - xy = Field(backend, grid, (Vertex(), Vertex(), Center()); halo=0), - xz = Field(backend, grid, (Vertex(), Center(), Vertex()); halo=0), - yz = Field(backend, grid, (Center(), Vertex(), Vertex()); halo=0), - ), - V = ( - x = Field(backend, grid, (Vertex(), Center(), Center()); halo=1), - y = Field(backend, grid, (Center(), Vertex(), Center()); halo=1), - z = Field(backend, grid, (Center(), Center(), Vertex()); halo=1), - ) - ) +function make_fields_mechanics(backend, grid::CartesianGrid{2}) + return (Pr=Field(backend, grid, Center(); halo=1), + # deviatoric stress + τ=(xx=Field(backend, grid, Center(); halo=1), + yy=Field(backend, grid, Center(); halo=1), + xy=Field(backend, grid, (Vertex(), Vertex()))), + # velocity + V=(x=Field(backend, grid, (Vertex(), Center()); halo=1), + y=Field(backend, grid, (Center(), Vertex()); halo=1)), + # residual + r_Pr=Field(backend, grid, Center()), + r_V=(x=Field(backend, grid, (Vertex(), Center())), + y=Field(backend, grid, (Center(), Vertex())))) end +function make_fields_mechanics(backend, grid::CartesianGrid{3}) + return (Pr=Field(backend, grid, Center(); halo=1), + # deviatoric stress + τ=(xx=Field(backend, grid, Center(); halo=1), + yy=Field(backend, grid, Center(); halo=1), + zz=Field(backend, grid, Center(); halo=1), + xy=Field(backend, grid, (Vertex(), Vertex(), Center())), + xz=Field(backend, grid, (Vertex(), Center(), Vertex())), + yz=Field(backend, grid, (Center(), Vertex(), Vertex()))), + # velocity + V=(x=Field(backend, grid, (Vertex(), Center(), Center()); halo=1), + y=Field(backend, grid, (Center(), Vertex(), Center()); halo=1), + z=Field(backend, grid, (Center(), Center(), Vertex()); halo=1)), + # residual + r_Pr=Field(backend, grid, Center()), + r_V=(x=Field(backend, grid, (Vertex(), Center(), Center())), + y=Field(backend, grid, (Center(), Vertex(), Center())), + z=Field(backend, grid, (Center(), Center(), Vertex())))) +end - -function IsothermalFullStokesModel(; backend, grid, boundary_conditions, physics=nothing, iter_params, fields=nothing, other_fields=nothing) +function IsothermalFullStokesModel(; + arch, + grid, + boundary_conditions, + iter_params, + gravity, + outer_width=(2, 2, 2), + physics=nothing, + fields=nothing, + other_fields=nothing) if isnothing(fields) - 
mechanic_fields = make_fields_mechanics(backend, grid) - rheology_fields = (η = Field(backend, grid, Center(); halo=1),) + mechanic_fields = make_fields_mechanics(backend(arch), grid) + rheology_fields = (η=Field(backend(arch), grid, Center(); halo=1),) fields = merge(mechanic_fields, rheology_fields) end @@ -66,47 +93,53 @@ function IsothermalFullStokesModel(; backend, grid, boundary_conditions, physics physics = default_physics(eltype(grid)) end - boundary_conditions = make_field_boundary_conditions(boundary_conditions) + boundary_conditions = make_field_boundary_conditions(arch, grid, fields, boundary_conditions) + hide_boundaries = HideBoundaries{ndims(grid)}(arch) - return IsothermalFullStokesModel(backend, grid, boundary_conditions, physics, iter_params, fields) + return IsothermalFullStokesModel(arch, grid, boundary_conditions, hide_boundaries, outer_width, physics, gravity, iter_params, fields) end fields(model::IsothermalFullStokesModel) = model.fields grid(model::IsothermalFullStokesModel) = model.grid -function advance_iteration!(model::IsothermalFullStokesModel, t, Δt; async = true) +function advance_iteration!(model::IsothermalFullStokesModel, t, Δt) (; Pr, τ, V, η) = model.fields - (; η_rel, Δτ) = model.iter_params - η_rh = model.physics.rheology + (; η_rel, Δτ) = model.iter_params + η_rh = model.physics.rheology + ρg = model.gravity + hide_boundaries = model.hide_boundaries + outer_width = model.outer_width + Δ = NamedTuple{(:x, :y, :z)}(spacing(model.grid)) - nx, ny, nz = size(model.grid) - backend = model.backend - set_bcs!(bcs) = _apply_bcs!(model.backend, model.grid, model.fields, bcs) + launch!(model.arch, model.grid, update_σ! => (Pr, τ, V, η, Δτ, Δ); + location=Center(), expand=1, boundary_conditions=model.boundary_conditions.stress, hide_boundaries, outer_width, async=false) - # stress + launch!(model.arch, model.grid, update_V! => (V, Pr, τ, η, ρg, Δτ, model.grid, Δ); + location=Vertex(), boundary_conditions=model.boundary_conditions.velocity, hide_boundaries, outer_width, async=false) - # launch!(arch, grid, update_σ!, Pr, τ, V, η, Δτ, Δ) + # rheology + launch!(model.arch, model.grid, update_η! => (η, η_rh, η_rel, model.grid, model.fields); + location=Center(), boundary_conditions=model.boundary_conditions.rheology, hide_boundaries, outer_width, async=false) + return +end - update_σ!(backend, 256, (nx+1, ny+1, nz+1))(Pr, τ, V, η, Δτ, Δ) - set_bcs!(model.boundary_conditions.stress) - # velocity +function compute_residuals!(model::IsothermalFullStokesModel) + (; Pr, τ, V, r_Pr, r_V) = model.fields + ρg = model.gravity + boundary_conditions = model.boundary_conditions.residual - # launch!(arch, grid, (update_res_V! => (rV, V, Pr, τ, η, Δτ, Δ), update_V! => (V, rV, dt)); exchangers = exchangers.velocity, boundary_conditions = boundary_conditions.velocity) - update_V!(backend, 256, (nx+1, ny+1, nz+1))(V, Pr, τ, η, Δτ, Δ) - set_bcs!(model.boundary_conditions.velocity) - # rheology - update_η!(backend, 256, (nx, ny, nz))(η, η_rh, η_rel, model.grid, model.fields) - extrapolate!(η) + Δ = NamedTuple{(:x, :y, :z)}(spacing(model.grid)) - async || synchronize(backend) + launch!(model.arch, model.grid, compute_residuals! 
=> (r_V, r_Pr, Pr, τ, V, ρg, model.grid, Δ);
+            location=Vertex(), boundary_conditions, async=false)
     return
 end
 
 function advance_timestep!(model::IsothermalFullStokesModel, t, Δt)
     # TODO
-
+    return
 end
 
-end
\ No newline at end of file
+end
diff --git a/src/Models/full_stokes/isothermal/kernels.jl b/src/Models/full_stokes/isothermal/kernels.jl
index ea435442..c9bb4fa8 100644
--- a/src/Models/full_stokes/isothermal/kernels.jl
+++ b/src/Models/full_stokes/isothermal/kernels.jl
@@ -2,14 +2,27 @@ using KernelAbstractions
 
 using FastIce.GridOperators
 
-@kernel function update_η!(η, η_rh, χ, grid, fields, args...)
+# Check whether index `I` lies within the size of `grid` at the staggered location of field `f`
+Base.@propagate_inbounds @generated function within(grid::CartesianGrid{N}, f::Field{T,N}, I::CartesianIndex{N}) where {T,N}
+    quote
+        Base.Cartesian.@nall $N i->I[i] <= size(grid, location(f, Val(i)), i)
+    end
+end
+
+"Update the viscosity using relaxation in log-space, which improves the stability of iterative methods."
+@kernel function update_η!(η, η_rh, χ, grid, fields, offset=nothing)
     I = @index(Global, Cartesian)
-    ηt = η_rh(grid, I, fields, args...)
-    @inbounds η[I] = exp(log(η[I]) * (1 - χ) + log(ηt) * χ)
+    isnothing(offset) || (I += offset)
+    @inbounds begin
+        ηt = η_rh(grid, I, fields)
+        η[I] = exp(log(η[I]) * (1 - χ) + log(ηt) * χ)
+    end
 end
 
-@kernel function update_σ!(Pr, τ, V, η, Δτ, Δ)
+# pseudo-transient update rules
+
+@kernel function update_σ!(Pr, τ, V, η, Δτ, Δ, offset=nothing)
     I = @index(Global, Cartesian)
+    isnothing(offset) || (I += offset)
     @inbounds if checkbounds(Bool, Pr, I)
         ε̇xx = ∂ᶜx(V.x, I) / Δ.x
         ε̇yy = ∂ᶜy(V.y, I) / Δ.y
@@ -36,24 +49,83 @@ end
     end
 end
 
-@kernel function update_V!(V, Pr, τ, η, Δτ, Δ)
+@kernel function update_V!(V, Pr, τ, η, ρg, Δτ, grid, Δ, offset=nothing)
+    I = @index(Global, Cartesian)
+    isnothing(offset) || (I += offset)
+    @inbounds if within(grid, V.x, I)
+        ∂σxx_∂x = (-∂ᵛx(Pr, I) + ∂ᵛx(τ.xx, I)) / Δ.x
+        ∂τxy_∂y = ∂ᶜy(τ.xy, I) / Δ.y
+        ∂τxz_∂z = ∂ᶜz(τ.xz, I) / Δ.z
+        V.x[I] += (∂σxx_∂x + ∂τxy_∂y + ∂τxz_∂z - ρg.x[I]) / maxlᵛx(η, I) * Δτ.V.x
+    end
+    @inbounds if within(grid, V.y, I)
+        ∂σyy_∂y = (-∂ᵛy(Pr, I) + ∂ᵛy(τ.yy, I)) / Δ.y
+        ∂τxy_∂x = ∂ᶜx(τ.xy, I) / Δ.x
+        ∂τyz_∂z = ∂ᶜz(τ.yz, I) / Δ.z
+        V.y[I] += (∂σyy_∂y + ∂τxy_∂x + ∂τyz_∂z - ρg.y[I]) / maxlᵛy(η, I) * Δτ.V.y
+    end
+    @inbounds if within(grid, V.z, I)
+        ∂σzz_∂z = (-∂ᵛz(Pr, I) + ∂ᵛz(τ.zz, I)) / Δ.z
+        ∂τxz_∂x = ∂ᶜx(τ.xz, I) / Δ.x
+        ∂τyz_∂y = ∂ᶜy(τ.yz, I) / Δ.y
+        V.z[I] += (∂σzz_∂z + ∂τxz_∂x + ∂τyz_∂y - ρg.z[I]) / maxlᵛz(η, I) * Δτ.V.z
+    end
+end
+
+# helper kernels
+
+@kernel function compute_τ!(τ, V, η, Δ, offset=nothing)
+    I = @index(Global, Cartesian)
+    isnothing(offset) || (I += offset)
+    @inbounds if checkbounds(Bool, τ.xx, I) # τ.xx is cell-centered; Pr is not an argument of this kernel
+        ε̇xx = ∂ᶜx(V.x, I) / Δ.x
+        ε̇yy = ∂ᶜy(V.y, I) / Δ.y
+        ε̇zz = ∂ᶜz(V.z, I) / Δ.z
+        ∇V = ε̇xx + ε̇yy + ε̇zz
+        # deviatoric diagonal
+        τ.xx[I] = 2.0 * η[I] * (ε̇xx - ∇V / 3.0)
+        τ.yy[I] = 2.0 * η[I] * (ε̇yy - ∇V / 3.0)
+        τ.zz[I] = 2.0 * η[I] * (ε̇zz - ∇V / 3.0)
+    end
+    @inbounds if checkbounds(Bool, τ.xy, I)
+        ε̇xy = 0.5 * (∂ᵛx(V.y, I) / Δ.x + ∂ᵛy(V.x, I) / Δ.y)
+        τ.xy[I] = 2.0 * avᵛxy(η, I) * ε̇xy
+    end
+    @inbounds if checkbounds(Bool, τ.xz, I)
+        ε̇xz = 0.5 * (∂ᵛx(V.z, I) / Δ.x + ∂ᵛz(V.x, I) / Δ.z)
+        τ.xz[I] = 2.0 * avᵛxz(η, I) * ε̇xz
+    end
+    @inbounds if checkbounds(Bool, τ.yz, I)
+        ε̇yz = 0.5 * (∂ᵛy(V.z, I) / Δ.y + ∂ᵛz(V.y, I) / Δ.z)
+        τ.yz[I] = 2.0 * avᵛyz(η, I) * ε̇yz
+    end
+end
+
+@kernel function compute_residuals!(r_V, r_Pr, Pr, τ, V, ρg, grid, Δ, offset=nothing)
     I = @index(Global, Cartesian)
-    @inbounds if checkbounds(Bool, V.x, I)
+    isnothing(offset) || (I += offset)
+
@inbounds if within(grid, r_Pr, I) + ε̇xx = ∂ᶜx(V.x, I) / Δ.x + ε̇yy = ∂ᶜy(V.y, I) / Δ.y + ε̇zz = ∂ᶜz(V.z, I) / Δ.z + r_Pr[I] = ε̇xx + ε̇yy + ε̇zz + end + @inbounds if within(grid, r_V.x, I) ∂σxx_∂x = (-∂ᵛx(Pr, I) + ∂ᵛx(τ.xx, I)) / Δ.x ∂τxy_∂y = ∂ᶜy(τ.xy, I) / Δ.y ∂τxz_∂z = ∂ᶜz(τ.xz, I) / Δ.z - V.x[I] += (∂σxx_∂x + ∂τxy_∂y + ∂τxz_∂z) / maxlᵛx(η, I) * Δτ.V.x + r_V.x[I] = ∂σxx_∂x + ∂τxy_∂y + ∂τxz_∂z - ρg.x[I] end - @inbounds if checkbounds(Bool, V.y, I) + @inbounds if within(grid, r_V.y, I) ∂σyy_∂y = (-∂ᵛy(Pr, I) + ∂ᵛy(τ.yy, I)) / Δ.y ∂τxy_∂x = ∂ᶜx(τ.xy, I) / Δ.x ∂τyz_∂z = ∂ᶜz(τ.yz, I) / Δ.z - V.y[I] += (∂σyy_∂y + ∂τxy_∂x + ∂τyz_∂z) / maxlᵛy(η, I) * Δτ.V.y + r_V.y[I] = ∂σyy_∂y + ∂τxy_∂x + ∂τyz_∂z - ρg.y[I] end - @inbounds if checkbounds(Bool, V.z, I) + @inbounds if within(grid, r_V.z, I) ∂σzz_∂z = (-∂ᵛz(Pr, I) + ∂ᵛz(τ.zz, I)) / Δ.z ∂τxz_∂x = ∂ᶜx(τ.xz, I) / Δ.x ∂τyz_∂y = ∂ᶜy(τ.yz, I) / Δ.y - V.z[I] += (∂σzz_∂z + ∂τxz_∂x + ∂τyz_∂y) / maxlᵛz(η, I) * Δτ.V.z + r_V.z[I] = ∂σzz_∂z + ∂τxz_∂x + ∂τyz_∂y - ρg.z[I] end end diff --git a/src/physics.jl b/src/Physics.jl similarity index 65% rename from src/physics.jl rename to src/Physics.jl index 5797a303..af7831bd 100644 --- a/src/physics.jl +++ b/src/Physics.jl @@ -1,14 +1,11 @@ module Physics -export AbstractPhysics export IncompressibleIceEOS, IceThermalProperties -export IceRheology, GlensLawRheology, τ +export IceRheology, GlensLawRheology export default using FastIce.GridOperators -abstract type AbstractPhysics end - struct IncompressibleIceEOS{T} density::T heat_capacity::T @@ -31,10 +28,10 @@ end default(::Type{GlensLawRheology{I}}) where {I} = GlensLawRheology(convert(I, 3)) -@inline function (rh::GlensLawRheology{T})(grid, I, fields) where {T} +Base.@propagate_inbounds function (rh::GlensLawRheology{T})(grid, I, fields) where {T} (; τ, A) = fields - @inbounds τII = sqrt(0.5 * (τ.xx[I]^2 + τ.yy[I]^2 + τ.zz[I]^2) + avᶜxy(τ.xy, I)^2 + avᶜxz(τ.xz, I)^2 + avᶜyz(τ.yz, I)^2) - return 0.5 / (A[I] * τII^(rh.exponent - 1)) + τII = sqrt(0.5 * (τ.xx[I]^2 + τ.yy[I]^2 + τ.zz[I]^2) + avᶜxy(τ.xy, I)^2 + avᶜxz(τ.xz, I)^2 + avᶜyz(τ.yz, I)^2) + 0.5 / (A[I] * τII^(rh.exponent - 1)) end -end \ No newline at end of file +end diff --git a/src/Utils/Utils.jl b/src/Utils/Utils.jl new file mode 100644 index 00000000..78296230 --- /dev/null +++ b/src/Utils/Utils.jl @@ -0,0 +1,36 @@ +module Utils + +export remove_dim, insert_dim +export split_ndrange +export Pipeline + +using FastIce.Fields +using FastIce.Architectures +using KernelAbstractions + +include("pipelines.jl") +include("split_ndrange.jl") + +# Returns a copy of the tuple `A` with element in position `D` removed +@inline remove_dim(::Val{D}, A::NTuple{N}) where {D,N} = ntuple(I -> I < D ? 
A[I] : A[I+1], Val(N - 1))
+@inline remove_dim(::Val{1}, I::NTuple{1}) = 1
+
+# Same as `remove_dim`, but for `CartesianIndex`
+@inline remove_dim(dim, I::CartesianIndex) = remove_dim(dim, Tuple(I)) |> CartesianIndex
+
+# Returns a copy of tuple `A`, but inserts `i` into position `D`
+@inline insert_dim(::Val{D}, A::NTuple{N}, i) where {D,N} =
+    ntuple(Val(N + 1)) do I
+        if I < D
+            A[I]
+        elseif I == D
+            i
+        else
+            A[I-1]
+        end
+    end
+
+# Same as `insert_dim`, but for `CartesianIndex`
+@inline insert_dim(dim, A::CartesianIndex, i) = insert_dim(dim, Tuple(A), i) |> CartesianIndex
+
+end
diff --git a/src/Utils/extrapolate.jl b/src/Utils/extrapolate.jl
new file mode 100644
index 00000000..1d079204
--- /dev/null
+++ b/src/Utils/extrapolate.jl
@@ -0,0 +1,14 @@
+function extrapolate!(A::Field; async=true)
+    backend = get_backend(A)
+    I = ntuple(Val(ndims(A))) do dim
+        halo(A, dim, 1) > 0 ? 0 : nothing
+    end
+    N = ntuple(Val(ndims(A))) do dim
+        halo(A, dim, 2) > 0 ? size(A, dim) + 1 : nothing
+    end
+    kernel_extrapolate_x!(backend, 256, (size(A, 2) + 2, size(A, 3) + 2))(A, I[1], N[1])
+    kernel_extrapolate_y!(backend, 256, (size(A, 1) + 2, size(A, 3) + 2))(A, I[2], N[2])
+    kernel_extrapolate_z!(backend, 256, (size(A, 1) + 2, size(A, 2) + 2))(A, I[3], N[3])
+    async || KernelAbstractions.synchronize(backend)
+    return
+end
diff --git a/src/Utils/pipelines.jl b/src/Utils/pipelines.jl
new file mode 100644
index 00000000..63c9333e
--- /dev/null
+++ b/src/Utils/pipelines.jl
@@ -0,0 +1,45 @@
+# Runs submitted closures sequentially on a dedicated task, signalling an event after each one;
+# the optional `pre`/`post` hooks run at task startup and shutdown.
+mutable struct Pipeline
+    src::Channel
+    out::Base.Event
+    task::Task
+
+    function Pipeline(; pre=nothing, post=nothing)
+        src = Channel()
+        out = Base.Event(true)
+        task = Threads.@spawn begin
+            isnothing(pre) || Base.invokelatest(pre)
+            try
+                for work in src
+                    Base.invokelatest(work)
+                    notify(out)
+                end
+            finally
+                isnothing(post) || Base.invokelatest(post)
+            end
+        end
+        errormonitor(task)
+        return new(src, out, task)
+    end
+end
+
+function Base.close(p::Pipeline)
+    close(p.src)
+    wait(p.task)
+    return
+end
+
+Base.isopen(p::Pipeline) = isopen(p.src)
+
+function Base.put!(work::F, p::Pipeline) where {F}
+    put!(p.src, work)
+    return
+end
+
+function Base.wait(p::Pipeline)
+    if isopen(p.src)
+        wait(p.out)
+    else
+        error("Pipeline is not running")
+    end
+    return
+end
diff --git a/src/Utils/split_ndrange.jl b/src/Utils/split_ndrange.jl
new file mode 100644
index 00000000..a50d017b
--- /dev/null
+++ b/src/Utils/split_ndrange.jl
@@ -0,0 +1,26 @@
+@inline outer_subrange(nr, bw, D, ::Val{1}) = 1:bw[D]
+@inline outer_subrange(nr, bw, D, ::Val{2}) = (size(nr, D)-bw[D]+1):size(nr, D)
+@inline inner_subrange(nr, bw, D) = (bw[D]+1):(size(nr, D)-bw[D])
+
+function ndsubrange(ndrange::CartesianIndices{N}, ndwidth, I, ::Val{S}) where {N,S}
+    return ntuple(Val(N)) do D
+        if D < I
+            1:size(ndrange, D)
+        elseif D == I
+            outer_subrange(ndrange, ndwidth, D, Val(S))
+        else
+            inner_subrange(ndrange, ndwidth, D)
+        end
+    end
+end
+
+# Split `ndrange` into one inner range and, for each dimension, two outer slabs of width `ndwidth`
+# (see the usage sketch at the end of test/test_utils.jl)
+@inline split_ndrange(ndrange, ndwidth) = split_ndrange(CartesianIndices(ndrange), ndwidth)
+
+function split_ndrange(ndrange::CartesianIndices{N}, ndwidth::NTuple{N,<:Integer}) where {N}
+    @assert all(size(ndrange) .> ndwidth .* 2)
+    ndinner = ntuple(D -> inner_subrange(ndrange, ndwidth, D), Val(N))
+    inner_range = ndrange[ndinner...]
+ outer_ranges = ntuple(D -> (ndrange[ndsubrange(ndrange, ndwidth, D, Val(1))...], + ndrange[ndsubrange(ndrange, ndwidth, D, Val(2))...]), Val(N)) + return inner_range, outer_ranges +end diff --git a/src/architecture.jl b/src/architecture.jl deleted file mode 100644 index 963ad350..00000000 --- a/src/architecture.jl +++ /dev/null @@ -1,48 +0,0 @@ -module Architecture - -export AbstractArchitecture - -export SingleDeviceArchitecture - -export launch!, set_device!, heuristic_groupsize - -using FastIce.Grids - -using KernelAbstractions -import KernelAbstractions.Kernel - -abstract type AbstractArchitecture end - -set_device!(arch::AbstractArchitecture) = set_device!(device(arch)) - -heuristic_groupsize(arch::AbstractArchitecture) = heuristic_groupsize(device(arch)) - -struct SingleDeviceArchitecture{B,D} <: AbstractArchitecture - backend::B - device::D -end - -set_device!(::SingleDeviceArchitecture{CPU}) = nothing - -heuristic_groupsize(::SingleDeviceArchitecture{CPU}) = 256 - -device(arch::SingleDeviceArchitecture) = arch.device - -function launch!(arch::SingleDeviceArchitecture, grid::CartesianGrid, kernel::Pair{Kernel,Args}; kwargs...) where {Args} - worksize = size(grid, Vertex()) - launch!(arch, worksize, kernel; kwargs...) -end - -function launch!(arch::SingleDeviceArchitecture, worksize::NTuple{N,Int}, kernel::Pair{Kernel,Args}; boundary_conditions=nothing, async=true) where {N,Args} - fun, args = kernel - - groupsize = heuristic_groupsize(device(arch)) - - fun(arch.backend, groupsize, worksize)(args...) - isnothing(boundary_conditions) || apply_boundary_conditions!(boundary_conditions) - - async || synchronize(arch.backend) - return -end - -end \ No newline at end of file diff --git a/src/geometry.jl b/src/geometry.jl deleted file mode 100644 index dc4fe417..00000000 --- a/src/geometry.jl +++ /dev/null @@ -1,10 +0,0 @@ -function rotation_matrix(α,β,γ) - sα,cα = sincos(α) - sβ,cβ = sincos(β) - sγ,cγ = sincos(γ) - return Mat3( - cα * cβ , sα * cβ , -sβ , - cα * sβ * sγ - sα * cγ, sα * sβ * sγ + cα * cγ, cβ * sγ, - cα * sβ * cγ + sα * sγ, sα * sβ * cγ - cα * sγ, cβ * cγ, - ) -end \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl deleted file mode 100644 index eeae9d21..00000000 --- a/src/utils.jl +++ /dev/null @@ -1,68 +0,0 @@ -module Utils - -export remove_dim, insert_dim - -# Returns a copy of the tuple `A` with element in position `D` removed -@inline remove_dim(::Val{D}, A::NTuple{N}) where {D,N} = ntuple(I -> I < D ? A[I] : A[I+1], Val(N - 1)) -@inline remove_dim(::Val{1}, I::NTuple{1}) = 1 - -# Same as `remove_dim`, but for `CartesianIndex` -@inline remove_dim(dim, I::CartesianIndex) = remove_dim(dim, Tuple(I)) |> CartesianIndex - -# Returns a copy of tuple `A`, but inserts `i` into position `D` -@inline insert_dim(::Val{D}, A::NTuple{N}, i) where {D, N} = ntuple(Val(N + 1)) do I - if I < D - A[I] - elseif I == D - i - else - A[I-1] - end -end - -# Same as `insert_dim`, but for `CartesianIndex` -@inline insert_dim(dim, A::CartesianIndex, i) = insert_dim(dim, Tuple(A), i) |> CartesianIndex - -export extrapolate! 
- -using FastIce.Fields - -using KernelAbstractions - -@kernel function kernel_extrapolate_x!(A, I, N) - iy, iz = @index(Global, NTuple) - iy -= 1; iz -= 1 - @inbounds if !isnothing(I) A[I, iy, iz] = 2.0 * A[I+1, iy, iz] - A[I+2, iy, iz] end - @inbounds if !isnothing(N) A[N, iy, iz] = 2.0 * A[N-1, iy, iz] - A[N-2, iy, iz] end -end - -@kernel function kernel_extrapolate_y!(A, I, N) - ix, iz = @index(Global, NTuple) - ix -= 1; iz -= 1 - @inbounds if !isnothing(I) A[ix, I, iz] = 2.0 * A[ix, I+1, iz] - A[ix, I+2, iz] end - @inbounds if !isnothing(N) A[ix, N, iz] = 2.0 * A[ix, N-1, iz] - A[ix, N-2, iz] end -end - -@kernel function kernel_extrapolate_z!(A, I, N) - ix, iy = @index(Global, NTuple) - ix -= 1; iy -= 1 - @inbounds if !isnothing(I) A[ix, iy, I] = 2.0 * A[ix, iy, I+1] - A[ix, iy, I+2] end - @inbounds if !isnothing(N) A[ix, iy, N] = 2.0 * A[ix, iy, N-1] - A[ix, iy, N-2] end -end - -function extrapolate!(A::Field; async=true) - backend = get_backend(A) - I = ntuple(Val(ndims(A))) do dim - halo(A, dim, 1) > 0 ? 0 : nothing - end - N = ntuple(Val(ndims(A))) do dim - halo(A, dim, 2) > 0 ? size(A, dim) + 1 : nothing - end - kernel_extrapolate_x!(backend, 256, (size(A, 2)+2, size(A, 3)+2))(A, I[1], N[1]) - kernel_extrapolate_y!(backend, 256, (size(A, 1)+2, size(A, 3)+2))(A, I[2], N[2]) - kernel_extrapolate_z!(backend, 256, (size(A, 1)+2, size(A, 2)+2))(A, I[3], N[3]) - async || synchronize(backend) - return -end - -end \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml index 7a5a3438..dbd55b91 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,6 @@ [deps] KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" @@ -8,5 +9,5 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" [compat] -AMDGPU = "0.6" -CUDA = "4, 5" +AMDGPU = "0.7" +CUDA = "5" diff --git a/test/common.jl b/test/common.jl index 911f91c2..f7853771 100644 --- a/test/common.jl +++ b/test/common.jl @@ -4,12 +4,12 @@ using FastIce using KernelAbstractions # add KA backends -backends = KernelAbstractions.Backend[CPU(), ] +backends = KernelAbstractions.Backend[CPU()] -if get(ENV,"JULIA_FASTICE_BACKEND","") == "AMDGPU" +if get(ENV, "JULIA_FASTICE_BACKEND", "") == "AMDGPU" using AMDGPU AMDGPU.functional() && push!(backends, ROCBackend()) -elseif get(ENV,"JULIA_FASTICE_BACKEND","") == "CUDA" +elseif get(ENV, "JULIA_FASTICE_BACKEND", "") == "CUDA" using CUDA CUDA.functional() && push!(backends, CUDABackend()) -end \ No newline at end of file +end diff --git a/test/runtests.jl b/test/runtests.jl index 1023db56..525fa347 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,7 +3,16 @@ using FastIce using Pkg -excludedfiles = [ "test_excluded.jl"]; +excludedfiles = ["test_excluded.jl"] + +# distributed +test_distributed = ["test_distributed_2D.jl", "test_distributed_3D.jl"] +using MPI +nprocs_2D = 4 +nprocs_3D = 8 +ENV["DIMX"] = 2 +ENV["DIMY"] = 2 +ENV["DIMZ"] = 2 function parse_flags!(args, flag; default=nothing, typ=typeof(default)) for f in args @@ -36,14 +45,23 @@ function runtests() for f in testfiles println("") - if f ∈ excludedfiles + if basename(f) ∈ excludedfiles println("Test Skip:") println("$f") continue end try - run(`$exename --startup-file=no $(joinpath(testdir, f))`) + if basename(f) ∈ test_distributed + nprocs = contains(f, "2D") ? 
nprocs_2D : nprocs_3D + cmd(n=nprocs) = `$(mpiexec()) -n $n $exename --startup-file=no --color=yes $(joinpath(testdir, f))` + withenv("JULIA_NUM_THREADS" => "4") do + run(cmd()) + end + else + run(`$exename --startup-file=no $(joinpath(testdir, f))`) + end catch ex + @error ex nfail += 1 end end @@ -60,4 +78,4 @@ elseif backend_name == "CUDA" ENV["JULIA_FASTICE_BACKEND"] = "CUDA" end -exit(runtests()) \ No newline at end of file +exit(runtests()) diff --git a/test/test_bc.jl b/test/test_bc.jl index 51264b8d..21f33991 100644 --- a/test/test_bc.jl +++ b/test/test_bc.jl @@ -2,37 +2,42 @@ include("common.jl") using FastIce.Grids using FastIce.Fields +using FastIce.Architectures using FastIce.BoundaryConditions @testset "backend $backend" for backend in backends @testset "boundary conditions" begin nx, ny, nz = 2, 2, 2 - grid = CartesianGrid(origin = (0.0, 0.0, 0.0), extent = (1.0, 1.0, 1.0), size = (nx, ny, nz)) + grid = CartesianGrid(; origin=(0.0, 0.0, 0.0), extent=(1.0, 1.0, 1.0), size=(nx, ny, nz)) field = Field(backend, grid, Center(); halo=1) + arch = Architecture(backend) @testset "value" begin @testset "x-dim" begin data(field) .= 0.0 - west_bc = DirichletBC{HalfCell}(1.0) - east_bc = DirichletBC{FullCell}(0.5) - apply_bcs!(Val(1), backend, grid, (field, ), ((west_bc, east_bc), ); async=false) - @test all((parent(field)[1 , 2:ny+1, 2:nz+1] .+ parent(field)[2, 2:ny+1, 2:nz+1]) ./ 2 .≈ west_bc.condition) - @test all( parent(field)[nx+2, 2:ny+1, 2:nz+1] .≈ east_bc.condition) + west_bc = BoundaryConditionsBatch((field,), (DirichletBC{HalfCell}(1.0),)) + east_bc = BoundaryConditionsBatch((field,), (DirichletBC{FullCell}(0.5),)) + apply_boundary_conditions!(Val(1), Val(1), arch, grid, west_bc; async=false) + apply_boundary_conditions!(Val(2), Val(1), arch, grid, east_bc; async=false) + @test all((parent(field)[1, 2:ny+1, 2:nz+1] .+ parent(field)[2, 2:ny+1, 2:nz+1]) ./ 2 .≈ west_bc.conditions[1].condition) + @test all(parent(field)[nx+2, 2:ny+1, 2:nz+1] .≈ east_bc.conditions[1].condition) end @testset "y-dim" begin data(field) .= 0.0 - south_bc = DirichletBC{HalfCell}(1.0) - north_bc = DirichletBC{FullCell}(0.5) - apply_bcs!(Val(2), backend, grid, (field, ), ((south_bc, north_bc), ); async=false) - @test all((parent(field)[2:nx+1, 1 , 2:nz+1] .+ parent(field)[2:nx+1, 2, 2:nz+1]) ./ 2 .≈ south_bc.condition) - @test all( parent(field)[2:nx+1, ny+2, 2:nz+1] .≈ north_bc.condition) + south_bc = BoundaryConditionsBatch((field,), (DirichletBC{HalfCell}(1.0),)) + north_bc = BoundaryConditionsBatch((field,), (DirichletBC{FullCell}(0.5),)) + apply_boundary_conditions!(Val(1), Val(2), arch, grid, south_bc; async=false) + apply_boundary_conditions!(Val(2), Val(2), arch, grid, north_bc; async=false) + @test all((parent(field)[2:nx+1, 1, 2:nz+1] .+ parent(field)[2:nx+1, 2, 2:nz+1]) ./ 2 .≈ south_bc.conditions[1].condition) + @test all(parent(field)[2:nx+1, ny+2, 2:nz+1] .≈ north_bc.conditions[1].condition) end @testset "z-dim" begin data(field) .= 0.0 - bot_bc = DirichletBC{HalfCell}(1.0) - top_bc = DirichletBC{FullCell}(0.5) - apply_bcs!(Val(3), backend, grid, (field, ), ((bot_bc, top_bc), ); async=false) - @test all((parent(field)[2:nx+1, 2:ny+1, 1 ] .+ parent(field)[2:nx+1, 2:ny+1, 2]) ./ 2 .≈ bot_bc.condition) - @test all( parent(field)[2:nx+1, 2:ny+1, nz+2] .≈ top_bc.condition) + bottom_bc = BoundaryConditionsBatch((field,), (DirichletBC{HalfCell}(1.0),)) + top_bc = BoundaryConditionsBatch((field,), (DirichletBC{FullCell}(0.5),)) + apply_boundary_conditions!(Val(1), Val(3), arch, grid, bottom_bc; 
async=false) + apply_boundary_conditions!(Val(2), Val(3), arch, grid, top_bc; async=false) + @test all((parent(field)[2:nx+1, 2:ny+1, 1] .+ parent(field)[2:nx+1, 2:ny+1, 2]) ./ 2 .≈ bottom_bc.conditions[1].condition) + @test all(parent(field)[2:nx+1, 2:ny+1, nz+2] .≈ top_bc.conditions[1].condition) end end @testset "array" begin @@ -42,11 +47,12 @@ using FastIce.BoundaryConditions bc_array_east = KernelAbstractions.allocate(backend, Float64, (size(grid, 2), size(grid, 3))) bc_array_west .= 1.0 bc_array_east .= 0.5 - west_bc = DirichletBC{HalfCell}(bc_array_west) - east_bc = DirichletBC{FullCell}(bc_array_east) - apply_bcs!(Val(1), backend, grid, (field, ), ((west_bc, east_bc), ); async=false) - @test all((parent(field)[1 , 2:ny+1, 2:nz+1] .+ parent(field)[2, 2:ny+1, 2:nz+1]) ./ 2 .≈ west_bc.condition) - @test all( parent(field)[nx+2, 2:ny+1, 2:nz+1] .≈ east_bc.condition) + west_bc = BoundaryConditionsBatch((field,), (DirichletBC{HalfCell}(bc_array_west),)) + east_bc = BoundaryConditionsBatch((field,), (DirichletBC{FullCell}(bc_array_east),)) + apply_boundary_conditions!(Val(1), Val(1), arch, grid, west_bc; async=false) + apply_boundary_conditions!(Val(2), Val(1), arch, grid, east_bc; async=false) + @test all((parent(field)[1, 2:ny+1, 2:nz+1] .+ parent(field)[2, 2:ny+1, 2:nz+1]) ./ 2 .≈ west_bc.conditions[1].condition) + @test all(parent(field)[nx+2, 2:ny+1, 2:nz+1] .≈ east_bc.conditions[1].condition) end @testset "y-dim" begin data(field) .= 0.0 @@ -54,11 +60,12 @@ using FastIce.BoundaryConditions bc_array_north = KernelAbstractions.allocate(backend, Float64, (size(grid, 2), size(grid, 3))) bc_array_south .= 1.0 bc_array_north .= 0.5 - south_bc = DirichletBC{HalfCell}(bc_array_south) - north_bc = DirichletBC{FullCell}(bc_array_north) - apply_bcs!(Val(2), backend, grid, (field, ), ((south_bc, north_bc), ); async=false) - @test all((parent(field)[2:nx+1, 1 , 2:nz+1] .+ parent(field)[2:nx+1, 2, 2:nz+1]) ./ 2 .≈ south_bc.condition) - @test all( parent(field)[2:nx+1, ny+2, 2:nz+1] .≈ north_bc.condition) + south_bc = BoundaryConditionsBatch((field,), (DirichletBC{HalfCell}(bc_array_south),)) + north_bc = BoundaryConditionsBatch((field,), (DirichletBC{FullCell}(bc_array_north),)) + apply_boundary_conditions!(Val(1), Val(2), arch, grid, south_bc; async=false) + apply_boundary_conditions!(Val(2), Val(2), arch, grid, north_bc; async=false) + @test all((parent(field)[2:nx+1, 1, 2:nz+1] .+ parent(field)[2:nx+1, 2, 2:nz+1]) ./ 2 .≈ south_bc.conditions[1].condition) + @test all(parent(field)[2:nx+1, ny+2, 2:nz+1] .≈ north_bc.conditions[1].condition) end @testset "z-dim" begin data(field) .= 0.0 @@ -66,11 +73,12 @@ using FastIce.BoundaryConditions bc_array_top = KernelAbstractions.allocate(backend, Float64, (size(grid, 2), size(grid, 3))) bc_array_bot .= 1.0 bc_array_top .= 0.5 - bot_bc = DirichletBC{HalfCell}(bc_array_bot) - top_bc = DirichletBC{FullCell}(bc_array_top) - apply_bcs!(Val(3), backend, grid, (field, ), ((bot_bc, top_bc), ); async=false) - @test all((parent(field)[2:nx+1, 2:ny+1, 1 ] .+ parent(field)[2:nx+1, 2:ny+1, 2]) ./ 2 .≈ bot_bc.condition) - @test all( parent(field)[2:nx+1, 2:ny+1, nz+2] .≈ top_bc.condition) + bottom_bc = BoundaryConditionsBatch((field,), (DirichletBC{HalfCell}(bc_array_bot),)) + top_bc = BoundaryConditionsBatch((field,), (DirichletBC{FullCell}(bc_array_top),)) + apply_boundary_conditions!(Val(1), Val(3), arch, grid, bottom_bc; async=false) + apply_boundary_conditions!(Val(2), Val(3), arch, grid, top_bc; async=false) + @test all((parent(field)[2:nx+1, 2:ny+1, 1] .+ 
parent(field)[2:nx+1, 2:ny+1, 2]) ./ 2 .≈ bottom_bc.conditions[1].condition)
+            @test all(parent(field)[2:nx+1, 2:ny+1, nz+2] .≈ top_bc.conditions[1].condition)
             end
         end
     end
 end
diff --git a/test/test_distributed_2D.jl b/test/test_distributed_2D.jl
new file mode 100644
index 00000000..5e44c8c9
--- /dev/null
+++ b/test/test_distributed_2D.jl
@@ -0,0 +1,47 @@
+include("common.jl")
+
+using MPI
+using FastIce.Distributed
+
+MPI.Init()
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+
+backend = CPU() # until we have a testing environment set up for GPU-aware MPI, run only on CPU
+
+@testset "Distributed 2D" begin
+    mpi_dims = parse.(Int, (get(ENV, "DIMX", "1"),
+                            get(ENV, "DIMY", "1")))
+    dims = (0, 0)
+    topo = CartesianTopology(dims)
+    local_size = (4, 5)
+    @testset "Topology" begin
+        @test dimensions(topo) == mpi_dims
+        @test length(neighbors(topo)) == 2
+        @test node_size(topo) == nprocs
+        if global_rank(topo) == 0
+            @test neighbor(topo, 1, 1) == MPI.PROC_NULL
+            @test neighbor(topo, 2, 1) == MPI.PROC_NULL
+            @test has_neighbor(topo, 1, 1) == false
+            @test has_neighbor(topo, 2, 1) == false
+            if mpi_dims[2] > 1
+                @test neighbor(topo, 1, 2) == mpi_dims[2]
+                @test has_neighbor(topo, 1, 2) == true
+            else
+                @test neighbor(topo, 1, 2) == MPI.PROC_NULL
+                @test has_neighbor(topo, 1, 2) == false
+            end
+        end
+        @test global_grid_size(topo, local_size) == mpi_dims .* local_size
+    end
+    @testset "gather!" begin
+        src = fill(global_rank(topo) + 1, local_size)
+        dst = (global_rank(topo) == 0) ? zeros(Int, mpi_dims .* local_size) : nothing
+        gather!(dst, src, cartesian_communicator(topo))
+        if global_rank(topo) == 0
+            @test dst == repeat(reshape(1:global_size(topo), dimensions(topo))'; inner=size(src))
+        end
+    end
+end
+
+MPI.Finalize()
diff --git a/test/test_distributed_3D.jl b/test/test_distributed_3D.jl
new file mode 100644
index 00000000..d10797b4
--- /dev/null
+++ b/test/test_distributed_3D.jl
@@ -0,0 +1,55 @@
+include("common.jl")
+
+using MPI
+using FastIce.Distributed
+
+MPI.Init()
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+
+backend = CPU() # until we have a testing environment set up for GPU-aware MPI, run only on CPU
+
+@testset "Distributed 3D" begin
+    mpi_dims = parse.(Int, (get(ENV, "DIMX", "1"),
+                            get(ENV, "DIMY", "1"),
+                            get(ENV, "DIMZ", "1")))
+    dims = (0, 0, 0)
+    topo = CartesianTopology(dims)
+    local_size = (4, 5, 6)
+    @testset "Topology" begin
+        @test dimensions(topo) == mpi_dims
+        @test length(neighbors(topo)) == 3
+        @test node_size(topo) == nprocs
+        if global_rank(topo) == 0
+            @test neighbor(topo, 1, 1) == MPI.PROC_NULL
+            @test neighbor(topo, 2, 1) == MPI.PROC_NULL
+            @test neighbor(topo, 3, 1) == MPI.PROC_NULL
+            @test has_neighbor(topo, 1, 1) == false
+            @test has_neighbor(topo, 2, 1) == false
+            @test has_neighbor(topo, 3, 1) == false
+            if mpi_dims[2] > 1 && mpi_dims[3] > 1
+                @test neighbor(topo, 1, 2) == mpi_dims[2] * mpi_dims[3]
+                @test neighbor(topo, 2, 2) == mpi_dims[3]
+                @test has_neighbor(topo, 1, 2) == true
+                @test has_neighbor(topo, 2, 2) == true
+            else
+                @test neighbor(topo, 1, 2) == MPI.PROC_NULL
+                @test neighbor(topo, 2, 2) == MPI.PROC_NULL
+                @test has_neighbor(topo, 1, 2) == false
+                @test has_neighbor(topo, 2, 2) == false
+            end
+        end
+        @test global_grid_size(topo, local_size) == mpi_dims .* local_size
+    end
+    @testset "gather!" begin
+        src = fill(global_rank(topo) + 1, local_size)
+        dst = (global_rank(topo) == 0) ?
zeros(Int, mpi_dims .* local_size) : nothing + gather!(dst, src, cartesian_communicator(topo)) + ranks_mat = permutedims(reshape(1:global_size(topo), dimensions(topo)), reverse(1:3)) + if global_rank(topo) == 0 + @test dst == repeat(ranks_mat; inner=size(src)) + end + end +end + +MPI.Finalize() diff --git a/test/test_fields.jl b/test/test_fields.jl index f640001e..b1394f78 100644 --- a/test/test_fields.jl +++ b/test/test_fields.jl @@ -8,7 +8,7 @@ const HALO_SIZES = ((2, 2, 4), (0, 0, 0), (2, 0, 2), (0, 0, 0), (0, 1, 3)) @testset "backend $backend" for backend in backends @testset "fields" begin - grid = CartesianGrid(origin = (0.0, 0.0, 0.0), extent=(1.0, 1.0, 1.0), size = (2, 2, 2)) + grid = CartesianGrid(; origin=(0.0, 0.0, 0.0), extent=(1.0, 1.0, 1.0), size=(2, 2, 2)) loc = (Center(), Vertex(), Center()) @testset "location" begin @test location(Field(backend, grid, Center())) == (Center(), Center(), Center()) @@ -25,17 +25,17 @@ const HALO_SIZES = ((2, 2, 4), (0, 0, 0), (2, 0, 2), (0, 0, 0), (0, 1, 3)) @testset "discrete" begin # no parameters vertex fill!(data(f), NaN) - set!(f, grid, (grid, loc, ix, iy, iz) -> ycoord(grid, loc, iy); discrete=true) + set!(f, grid, (grid, loc, I) -> ycoord(grid, loc, I[2]); discrete=true) @test Array(interior(f)) == [0.0; 0.0;; 0.5; 0.5;; 1.0; 1.0;;; 0.0; 0.0;; 0.5; 0.5;; 1.0; 1.0] # no parameters center fill!(data(f), NaN) - set!(f, grid, (grid, loc, ix, iy, iz) -> xcoord(grid, loc, ix); discrete=true) + set!(f, grid, (grid, loc, I) -> xcoord(grid, loc, I[1]); discrete=true) @test Array(interior(f)) == [0.25; 0.75;; 0.25; 0.75;; 0.25; 0.75;;; 0.25; 0.75;; 0.25; 0.75;; 0.25; 0.75] # with parameters fill!(data(f), NaN) - set!(f, grid, (grid, loc, ix, iy, iz, sc) -> ycoord(grid, loc, iy) * sc; discrete=true, parameters=(2.0, )) + set!(f, grid, (grid, loc, I, sc) -> ycoord(grid, loc, I[2]) * sc; discrete=true, parameters=(2.0,)) @test Array(interior(f)) == [0.0; 0.0;; 1.0; 1.0;; 2.0; 2.0;;; 0.0; 0.0;; 1.0; 1.0;; 2.0; 2.0] end @@ -52,10 +52,10 @@ const HALO_SIZES = ((2, 2, 4), (0, 0, 0), (2, 0, 2), (0, 0, 0), (0, 1, 3)) 0.25; 0.75;; 0.25; 0.75;; 0.25; 0.75] # with parameters fill!(data(f), NaN) - set!(f, grid, (x, y, z, sc) -> y * sc, parameters=(2.0, )) + set!(f, grid, (x, y, z, sc) -> y * sc; parameters=(2.0,)) @test Array(interior(f)) == [0.0; 0.0;; 1.0; 1.0;; 2.0; 2.0;;; 0.0; 0.0;; 1.0; 1.0;; 2.0; 2.0] end end end -end \ No newline at end of file +end diff --git a/test/test_utils.jl b/test/test_utils.jl new file mode 100644 index 00000000..2cfedd29 --- /dev/null +++ b/test/test_utils.jl @@ -0,0 +1,42 @@ +include("common.jl") + +using FastIce.Utils + +@testset "pipelines" begin + @testset "pre" begin + a = 0 + pipe = Pipeline(; pre=() -> a += 1) + put!(() -> nothing, pipe) + wait(pipe) + @test a == 1 + close(pipe) + end + + @testset "post" begin + a = 0 + pipe = Pipeline(; post=() -> a += 2) + put!(pipe) do + a -= 1 + end + wait(pipe) + close(pipe) + @test a == 1 + end + + @testset "do work" begin + a = 0 + pipe = Pipeline() + put!(pipe) do + a += 1 + end + wait(pipe) + close(pipe) + @test a == 1 + end + + @testset "not running" begin + pipe = Pipeline() + close(pipe) + @test_throws ErrorException wait(pipe) + end +end
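+
+# A usage sketch for `split_ndrange` from src/Utils/split_ndrange.jl (an illustrative addition, not part of the
+# original test plan): an 8x8 range with a border width of 2 splits into a 4x4 inner block plus two outer
+# slabs per dimension.
+@testset "split_ndrange" begin
+    inner, outer = split_ndrange(CartesianIndices((8, 8)), (2, 2))
+    @test inner == CartesianIndices((3:6, 3:6))
+    @test outer[1] == (CartesianIndices((1:2, 3:6)), CartesianIndices((7:8, 3:6)))
+    @test outer[2] == (CartesianIndices((1:8, 1:2)), CartesianIndices((1:8, 7:8)))
+end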