Merge pull request #96 from semi-h/cufft

Add multi-GPU support for the Poisson solver in CUDA backend
xcompact3d · Jun 19, 2024 · 38e281d · 38e281d
2 parents f84015b + 50b36de
commit 38e281d
Show file tree

Hide file tree

Showing 6 changed files with 156 additions and 90 deletions.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -47,10 +47,9 @@ if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI" OR
   set(CMAKE_Fortran_FLAGS_DEBUG "-g -O0 -traceback -Mbounds -Mchkptr -Ktrap=fp")
   set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -fast")
   target_link_options(x3d2 INTERFACE "-cuda")
-  target_link_options(x3d2 INTERFACE "-lcufft")
+  target_link_options(x3d2 INTERFACE "-cudalib=cufftmp")
 
   target_compile_options(xcompact PRIVATE "-DCUDA")
-  #  target_link_options(xcompact INTERFACE "-cuda")
 elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL "GNU")
   set(CMAKE_Fortran_FLAGS "-cpp -std=f2008")
   set(CMAKE_Fortran_FLAGS_DEBUG "-g -Og -Wall -Wpedantic -Werror -Wimplicit-interface -Wimplicit-procedure -Wno-unused-dummy-argument")

diff --git a/src/cuda/kernels/spectral_processing.f90 b/src/cuda/kernels/spectral_processing.f90
@@ -8,7 +8,8 @@ module m_cuda_spectral
 contains
 
   attributes(global) subroutine process_spectral_div_u( &
-    div_u, waves, nx, ny, nz, ax, bx, ay, by, az, bz &
+    div_u, waves, nx_spec, ny_spec, y_sp_st, nx, ny, nz, &
+    ax, bx, ay, by, az, bz &
     )
     !! Post-processes the divergence of velocity in spectral space, including
     !! scaling w.r.t. grid size.
@@ -21,43 +22,49 @@ attributes(global) subroutine process_spectral_div_u( &
     !> Spectral equivalence constants
     complex(dp), device, intent(in), dimension(:, :, :) :: waves
     real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz
+    !> Grid size in spectral space
+    integer, value, intent(in) :: nx_spec, ny_spec
+    !> Offset in y direction in the permuted slabs in spectral space
+    integer, value, intent(in) :: y_sp_st
     !> Grid size
     integer, value, intent(in) :: nx, ny, nz
 
-    integer :: i, j, k
+    integer :: i, j, k, ix, iy, iz
     real(dp) :: tmp_r, tmp_c, div_r, div_c
 
     j = threadIdx%x + (blockIdx%x - 1)*blockDim%x
-    k = blockIdx%y
+    k = blockIdx%y ! nz_spec
 
-    if (j <= ny) then
-      do i = 1, nx/2 + 1
+    if (j <= ny_spec) then
+      do i = 1, nx_spec
         ! normalisation
         div_r = real(div_u(i, j, k), kind=dp)/(nx*ny*nz)
         div_c = aimag(div_u(i, j, k))/(nx*ny*nz)
 
+        ix = i; iy = j + y_sp_st; iz = k
+
         ! post-process forward
         ! post-process in z
         tmp_r = div_r
         tmp_c = div_c
-        div_r = tmp_r*bz(k) + tmp_c*az(k)
-        div_c = tmp_c*bz(k) - tmp_r*az(k)
+        div_r = tmp_r*bz(iz) + tmp_c*az(iz)
+        div_c = tmp_c*bz(iz) - tmp_r*az(iz)
+        if (iz > nz/2 + 1) div_r = -div_r
+        if (iz > nz/2 + 1) div_c = -div_c
 
         ! post-process in y
         tmp_r = div_r
         tmp_c = div_c
-        div_r = tmp_r*by(j) + tmp_c*ay(j)
-        div_c = tmp_c*by(j) - tmp_r*ay(j)
-        if (j > ny/2 + 1) div_r = -div_r
-        if (j > ny/2 + 1) div_c = -div_c
+        div_r = tmp_r*by(iy) + tmp_c*ay(iy)
+        div_c = tmp_c*by(iy) - tmp_r*ay(iy)
+        if (iy > ny/2 + 1) div_r = -div_r
+        if (iy > ny/2 + 1) div_c = -div_c
 
         ! post-process in x
         tmp_r = div_r
         tmp_c = div_c
-        div_r = tmp_r*bx(i) + tmp_c*ax(i)
-        div_c = tmp_c*bx(i) - tmp_r*ax(i)
-        if (i > nx/2 + 1) div_r = -div_r
-        if (i > nx/2 + 1) div_c = -div_c
+        div_r = tmp_r*bx(ix) + tmp_c*ax(ix)
+        div_c = tmp_c*bx(ix) - tmp_r*ax(ix)
 
         ! Solve Poisson
         tmp_r = real(waves(i, j, k), kind=dp)
@@ -73,24 +80,24 @@ attributes(global) subroutine process_spectral_div_u( &
         ! post-process in z
         tmp_r = div_r
         tmp_c = div_c
-        div_r = tmp_r*bz(k) - tmp_c*az(k)
-        div_c = -tmp_c*bz(k) - tmp_r*az(k)
+        div_r = tmp_r*bz(iz) - tmp_c*az(iz)
+        div_c = -tmp_c*bz(iz) - tmp_r*az(iz)
+        if (iz > nz/2 + 1) div_r = -div_r
+        if (iz > nz/2 + 1) div_c = -div_c
 
         ! post-process in y
         tmp_r = div_r
         tmp_c = div_c
-        div_r = tmp_r*by(j) + tmp_c*ay(j)
-        div_c = tmp_c*by(j) - tmp_r*ay(j)
-        if (j > ny/2 + 1) div_r = -div_r
-        if (j > ny/2 + 1) div_c = -div_c
+        div_r = tmp_r*by(iy) + tmp_c*ay(iy)
+        div_c = tmp_c*by(iy) - tmp_r*ay(iy)
+        if (iy > ny/2 + 1) div_r = -div_r
+        if (iy > ny/2 + 1) div_c = -div_c
 
         ! post-process in x
         tmp_r = div_r
         tmp_c = div_c
-        div_r = tmp_r*bx(i) + tmp_c*ax(i)
-        div_c = -tmp_c*bx(i) + tmp_r*ax(i)
-        if (i > nx/2 + 1) div_r = -div_r
-        if (i > nx/2 + 1) div_c = -div_c
+        div_r = tmp_r*bx(ix) + tmp_c*ax(ix)
+        div_c = -tmp_c*bx(ix) + tmp_r*ax(ix)
 
         ! update the entry
         div_u(i, j, k) = cmplx(div_r, div_c, kind=dp)

diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90
@@ -2,7 +2,9 @@ module m_cuda_poisson_fft
   use iso_c_binding, only: c_loc, c_ptr, c_f_pointer
   use iso_fortran_env, only: stderr => error_unit
   use cudafor
+  use cufftXt
   use cufft
+  use mpi
 
   use m_allocator, only: field_t
   use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, CELL
@@ -18,18 +20,17 @@ module m_cuda_poisson_fft
   type, extends(poisson_fft_t) :: cuda_poisson_fft_t
     !! FFT based Poisson solver
 
-    !> Local domain sized array to store data in spectral space
-    complex(dp), device, allocatable, dimension(:, :, :) :: c_w_dev
     !> Local domain sized array storing the spectral equivalence constants
     complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev
-    !> cufft requires a local domain sized storage
-    complex(dp), device, allocatable, dimension(:) :: fft_worksize
-
+    !> Wave numbers in x, y, and z
     real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, &
                                                    ay_dev, by_dev, &
                                                    az_dev, bz_dev
-
+    !> Forward and backward FFT transform plans
     integer :: plan3D_fw, plan3D_bw
+
+    !> cuFFTMp object manages decomposition and data storage
+    type(cudaLibXtDesc), pointer :: xtdesc
   contains
     procedure :: fft_forward => fft_forward_cuda
     procedure :: fft_backward => fft_backward_cuda
@@ -59,9 +60,13 @@ function init(mesh, xdirps, ydirps, zdirps) result(poisson_fft)
 
     call poisson_fft%base_init(mesh, xdirps, ydirps, zdirps)
 
-    nx = poisson_fft%nx; ny = poisson_fft%ny; nz = poisson_fft%nz
+    nx = poisson_fft%nx_glob
+    ny = poisson_fft%ny_glob
+    nz = poisson_fft%nz_glob
 
-    allocate (poisson_fft%waves_dev(nx/2 + 1, ny, nz))
+    allocate (poisson_fft%waves_dev(poisson_fft%nx_spec, &
+                                    poisson_fft%ny_spec, &
+                                    poisson_fft%nz_spec))
     poisson_fft%waves_dev = poisson_fft%waves
 
     allocate (poisson_fft%ax_dev(nx), poisson_fft%bx_dev(nx))
@@ -71,28 +76,35 @@ function init(mesh, xdirps, ydirps, zdirps) result(poisson_fft)
     poisson_fft%ay_dev = poisson_fft%ay; poisson_fft%by_dev = poisson_fft%by
     poisson_fft%az_dev = poisson_fft%az; poisson_fft%bz_dev = poisson_fft%bz
 
-    allocate (poisson_fft%c_w_dev(nx/2 + 1, ny, nz))
-    allocate (poisson_fft%fft_worksize((nx/2 + 1)*ny*nz))
-
     ! 3D plans
     ierr = cufftCreate(poisson_fft%plan3D_fw)
+    ierr = cufftMpAttachComm(poisson_fft%plan3D_fw, CUFFT_COMM_MPI, &
+                             MPI_COMM_WORLD)
     ierr = cufftMakePlan3D(poisson_fft%plan3D_fw, nz, ny, nx, CUFFT_D2Z, &
                            worksize)
-    ierr = cufftSetWorkArea(poisson_fft%plan3D_fw, poisson_fft%fft_worksize)
     if (ierr /= 0) then
-      write (stderr, *), "cuFFT Error Code: ", ierr
+      write (stderr, *), 'cuFFT Error Code: ', ierr
       error stop 'Forward 3D FFT plan generation failed'
     end if
 
     ierr = cufftCreate(poisson_fft%plan3D_bw)
+    ierr = cufftMpAttachComm(poisson_fft%plan3D_bw, CUFFT_COMM_MPI, &
+                             MPI_COMM_WORLD)
     ierr = cufftMakePlan3D(poisson_fft%plan3D_bw, nz, ny, nx, CUFFT_Z2D, &
                            worksize)
-    ierr = cufftSetWorkArea(poisson_fft%plan3D_bw, poisson_fft%fft_worksize)
     if (ierr /= 0) then
-      write (stderr, *), "cuFFT Error Code: ", ierr
+      write (stderr, *), 'cuFFT Error Code: ', ierr
       error stop 'Backward 3D FFT plan generation failed'
     end if
 
+    ! allocate storage for cuFFTMp
+    ierr = cufftXtMalloc(poisson_fft%plan3D_fw, poisson_fft%xtdesc, &
+                         CUFFT_XT_FORMAT_INPLACE)
+    if (ierr /= 0) then
+      write (stderr, *), 'cuFFT Error Code: ', ierr
+      error stop 'cufftXtMalloc failed'
+    end if
+
   end function init
 
   subroutine fft_forward_cuda(self, f)
@@ -101,23 +113,32 @@ subroutine fft_forward_cuda(self, f)
     class(cuda_poisson_fft_t) :: self
     class(field_t), intent(in) :: f
 
-    real(dp), device, pointer, dimension(:, :, :) :: f_dev
-    real(dp), device, pointer :: f_ptr
-    type(c_ptr) :: f_c_ptr
+    real(dp), device, pointer :: flat_dev(:, :), d_dev(:, :, :)
+
+    type(cudaXtDesc), pointer :: descriptor
 
-    type(dim3) :: blocks, threads
     integer :: ierr
 
-    select type (f); type is (cuda_field_t); f_dev => f%data_d; end select
+    select type (f)
+    type is (cuda_field_t)
+      flat_dev(1:self%nx_loc, 1:self%ny_loc*self%nz_loc) => f%data_d
+    end select
+
+    call c_f_pointer(self%xtdesc%descriptor, descriptor)
+    call c_f_pointer(descriptor%data(1), d_dev, &
+                     [self%nx_loc + 2, self%ny_loc*self%nz_loc])
+    ierr = cudaMemcpy2D(d_dev, self%nx_loc + 2, flat_dev, self%nx_loc, &
+                        self%nx_loc, self%ny_loc*self%nz_loc)
+    if (ierr /= 0) then
+      print *, 'cudaMemcpy2D error code: ', ierr
+      error stop 'cudaMemcpy2D failed'
+    end if
 
-    ! Using f_dev directly in cufft call causes a segfault
-    ! Pointer switches below fixes the problem
-    f_c_ptr = c_loc(f_dev)
-    call c_f_pointer(f_c_ptr, f_ptr)
+    ierr = cufftXtExecDescriptor(self%plan3D_fw, self%xtdesc, self%xtdesc, &
+                                 CUFFT_FORWARD)
 
-    ierr = cufftExecD2Z(self%plan3D_fw, f_ptr, self%c_w_dev)
     if (ierr /= 0) then
-      write (stderr, *), "cuFFT Error Code: ", ierr
+      write (stderr, *), 'cuFFT Error Code: ', ierr
       error stop 'Forward 3D FFT execution failed'
     end if
 
@@ -129,24 +150,32 @@ subroutine fft_backward_cuda(self, f)
     class(cuda_poisson_fft_t) :: self
     class(field_t), intent(inout) :: f
 
-    real(dp), device, pointer, dimension(:, :, :) :: f_dev
-    real(dp), device, pointer :: f_ptr
-    type(c_ptr) :: f_c_ptr
+    real(dp), device, pointer :: flat_dev(:, :), d_dev(:, :, :)
+
+    type(cudaXtDesc), pointer :: descriptor
 
-    type(dim3) :: blocks, threads
     integer :: ierr
 
-    select type (f); type is (cuda_field_t); f_dev => f%data_d; end select
+    ierr = cufftXtExecDescriptor(self%plan3D_bw, self%xtdesc, self%xtdesc, &
+                                 CUFFT_INVERSE)
+    if (ierr /= 0) then
+      write (stderr, *), 'cuFFT Error Code: ', ierr
+      error stop 'Backward 3D FFT execution failed'
+    end if
 
-    ! Using f_dev directly in cufft call causes a segfault
-    ! Pointer switches below fixes the problem
-    f_c_ptr = c_loc(f_dev)
-    call c_f_pointer(f_c_ptr, f_ptr)
+    select type (f)
+    type is (cuda_field_t)
+      flat_dev(1:self%nx_loc, 1:self%ny_loc*self%nz_loc) => f%data_d
+    end select
 
-    ierr = cufftExecZ2D(self%plan3D_bw, self%c_w_dev, f_ptr)
+    call c_f_pointer(self%xtdesc%descriptor, descriptor)
+    call c_f_pointer(descriptor%data(1), d_dev, &
+                     [self%nx_loc + 2, self%ny_loc*self%nz_loc])
+    ierr = cudaMemcpy2D(flat_dev, self%nx_loc, d_dev, self%nx_loc + 2, &
+                        self%nx_loc, self%ny_loc*self%nz_loc)
     if (ierr /= 0) then
-      write (stderr, *), "cuFFT Error Code: ", ierr
-      error stop 'Backward 3D FFT execution failed'
+      print *, 'cudaMemcpy2D error code: ', ierr
+      error stop 'cudaMemcpy2D failed'
     end if
 
   end subroutine fft_backward_cuda
@@ -156,19 +185,27 @@ subroutine fft_postprocess_cuda(self)
 
     class(cuda_poisson_fft_t) :: self
 
+    type(cudaXtDesc), pointer :: descriptor
+
     complex(dp), device, dimension(:, :, :), pointer :: c_dev
     type(dim3) :: blocks, threads
     integer :: tsize
 
+    ! obtain a pointer to descriptor so that we can carry out postprocessing
+    call c_f_pointer(self%xtdesc%descriptor, descriptor)
+    call c_f_pointer(descriptor%data(1), c_dev, &
+                     [self%nx_spec, self%ny_spec, self%nz_spec])
+
     ! tsize is different than SZ, because here we work on a 3D Cartesian
     ! data structure, and free to specify any suitable thread/block size.
     tsize = 16
-    blocks = dim3((self%ny - 1)/tsize + 1, self%nz, 1)
+    blocks = dim3((self%ny_spec - 1)/tsize + 1, self%nz_spec, 1)
     threads = dim3(tsize, 1, 1)
 
     ! Postprocess div_u in spectral space
     call process_spectral_div_u<<<blocks, threads>>>( & !&
-      self%c_w_dev, self%waves_dev, self%nx, self%ny, self%nz, &
+      c_dev, self%waves_dev, self%nx_spec, self%ny_spec, self%y_sp_st, &
+      self%nx_glob, self%ny_glob, self%nz_glob, &
       self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, &
       self%az_dev, self%bz_dev &
       )