Skip to content

Commit

Permalink
Merge pull request #96 from semi-h/cufft
Browse files Browse the repository at this point in the history
Add multi-GPU support for the Poisson solver in CUDA backend
  • Loading branch information
semi-h authored Jun 19, 2024
2 parents f84015b + 50b36de commit 38e281d
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 90 deletions.
3 changes: 1 addition & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,9 @@ if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI" OR
set(CMAKE_Fortran_FLAGS_DEBUG "-g -O0 -traceback -Mbounds -Mchkptr -Ktrap=fp")
set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -fast")
target_link_options(x3d2 INTERFACE "-cuda")
target_link_options(x3d2 INTERFACE "-lcufft")
target_link_options(x3d2 INTERFACE "-cudalib=cufftmp")

target_compile_options(xcompact PRIVATE "-DCUDA")
# target_link_options(xcompact INTERFACE "-cuda")
elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL "GNU")
set(CMAKE_Fortran_FLAGS "-cpp -std=f2008")
set(CMAKE_Fortran_FLAGS_DEBUG "-g -Og -Wall -Wpedantic -Werror -Wimplicit-interface -Wimplicit-procedure -Wno-unused-dummy-argument")
Expand Down
57 changes: 32 additions & 25 deletions src/cuda/kernels/spectral_processing.f90
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ module m_cuda_spectral
contains

attributes(global) subroutine process_spectral_div_u( &
div_u, waves, nx, ny, nz, ax, bx, ay, by, az, bz &
div_u, waves, nx_spec, ny_spec, y_sp_st, nx, ny, nz, &
ax, bx, ay, by, az, bz &
)
!! Post-processes the divergence of velocity in spectral space, including
!! scaling w.r.t. grid size.
Expand All @@ -21,43 +22,49 @@ attributes(global) subroutine process_spectral_div_u( &
!> Spectral equivalence constants
complex(dp), device, intent(in), dimension(:, :, :) :: waves
real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz
!> Grid size in spectral space
integer, value, intent(in) :: nx_spec, ny_spec
!> Offset in y direction in the permuted slabs in spectral space
integer, value, intent(in) :: y_sp_st
!> Grid size
integer, value, intent(in) :: nx, ny, nz

integer :: i, j, k
integer :: i, j, k, ix, iy, iz
real(dp) :: tmp_r, tmp_c, div_r, div_c

j = threadIdx%x + (blockIdx%x - 1)*blockDim%x
k = blockIdx%y
k = blockIdx%y ! nz_spec

if (j <= ny) then
do i = 1, nx/2 + 1
if (j <= ny_spec) then
do i = 1, nx_spec
! normalisation
div_r = real(div_u(i, j, k), kind=dp)/(nx*ny*nz)
div_c = aimag(div_u(i, j, k))/(nx*ny*nz)

ix = i; iy = j + y_sp_st; iz = k

! post-process forward
! post-process in z
tmp_r = div_r
tmp_c = div_c
div_r = tmp_r*bz(k) + tmp_c*az(k)
div_c = tmp_c*bz(k) - tmp_r*az(k)
div_r = tmp_r*bz(iz) + tmp_c*az(iz)
div_c = tmp_c*bz(iz) - tmp_r*az(iz)
if (iz > nz/2 + 1) div_r = -div_r
if (iz > nz/2 + 1) div_c = -div_c

! post-process in y
tmp_r = div_r
tmp_c = div_c
div_r = tmp_r*by(j) + tmp_c*ay(j)
div_c = tmp_c*by(j) - tmp_r*ay(j)
if (j > ny/2 + 1) div_r = -div_r
if (j > ny/2 + 1) div_c = -div_c
div_r = tmp_r*by(iy) + tmp_c*ay(iy)
div_c = tmp_c*by(iy) - tmp_r*ay(iy)
if (iy > ny/2 + 1) div_r = -div_r
if (iy > ny/2 + 1) div_c = -div_c

! post-process in x
tmp_r = div_r
tmp_c = div_c
div_r = tmp_r*bx(i) + tmp_c*ax(i)
div_c = tmp_c*bx(i) - tmp_r*ax(i)
if (i > nx/2 + 1) div_r = -div_r
if (i > nx/2 + 1) div_c = -div_c
div_r = tmp_r*bx(ix) + tmp_c*ax(ix)
div_c = tmp_c*bx(ix) - tmp_r*ax(ix)

! Solve Poisson
tmp_r = real(waves(i, j, k), kind=dp)
Expand All @@ -73,24 +80,24 @@ attributes(global) subroutine process_spectral_div_u( &
! post-process in z
tmp_r = div_r
tmp_c = div_c
div_r = tmp_r*bz(k) - tmp_c*az(k)
div_c = -tmp_c*bz(k) - tmp_r*az(k)
div_r = tmp_r*bz(iz) - tmp_c*az(iz)
div_c = -tmp_c*bz(iz) - tmp_r*az(iz)
if (iz > nz/2 + 1) div_r = -div_r
if (iz > nz/2 + 1) div_c = -div_c

! post-process in y
tmp_r = div_r
tmp_c = div_c
div_r = tmp_r*by(j) + tmp_c*ay(j)
div_c = tmp_c*by(j) - tmp_r*ay(j)
if (j > ny/2 + 1) div_r = -div_r
if (j > ny/2 + 1) div_c = -div_c
div_r = tmp_r*by(iy) + tmp_c*ay(iy)
div_c = tmp_c*by(iy) - tmp_r*ay(iy)
if (iy > ny/2 + 1) div_r = -div_r
if (iy > ny/2 + 1) div_c = -div_c

! post-process in x
tmp_r = div_r
tmp_c = div_c
div_r = tmp_r*bx(i) + tmp_c*ax(i)
div_c = -tmp_c*bx(i) + tmp_r*ax(i)
if (i > nx/2 + 1) div_r = -div_r
if (i > nx/2 + 1) div_c = -div_c
div_r = tmp_r*bx(ix) + tmp_c*ax(ix)
div_c = -tmp_c*bx(ix) + tmp_r*ax(ix)

! update the entry
div_u(i, j, k) = cmplx(div_r, div_c, kind=dp)
Expand Down
117 changes: 77 additions & 40 deletions src/cuda/poisson_fft.f90
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ module m_cuda_poisson_fft
use iso_c_binding, only: c_loc, c_ptr, c_f_pointer
use iso_fortran_env, only: stderr => error_unit
use cudafor
use cufftXt
use cufft
use mpi

use m_allocator, only: field_t
use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, CELL
Expand All @@ -18,18 +20,17 @@ module m_cuda_poisson_fft
type, extends(poisson_fft_t) :: cuda_poisson_fft_t
!! FFT based Poisson solver

!> Local domain sized array to store data in spectral space
complex(dp), device, allocatable, dimension(:, :, :) :: c_w_dev
!> Local domain sized array storing the spectral equivalence constants
complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev
!> cufft requires a local domain sized storage
complex(dp), device, allocatable, dimension(:) :: fft_worksize

!> Wave numbers in x, y, and z
real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, &
ay_dev, by_dev, &
az_dev, bz_dev

!> Forward and backward FFT transform plans
integer :: plan3D_fw, plan3D_bw

!> cuFFTMp object manages decomposition and data storage
type(cudaLibXtDesc), pointer :: xtdesc
contains
procedure :: fft_forward => fft_forward_cuda
procedure :: fft_backward => fft_backward_cuda
Expand Down Expand Up @@ -59,9 +60,13 @@ function init(mesh, xdirps, ydirps, zdirps) result(poisson_fft)

call poisson_fft%base_init(mesh, xdirps, ydirps, zdirps)

nx = poisson_fft%nx; ny = poisson_fft%ny; nz = poisson_fft%nz
nx = poisson_fft%nx_glob
ny = poisson_fft%ny_glob
nz = poisson_fft%nz_glob

allocate (poisson_fft%waves_dev(nx/2 + 1, ny, nz))
allocate (poisson_fft%waves_dev(poisson_fft%nx_spec, &
poisson_fft%ny_spec, &
poisson_fft%nz_spec))
poisson_fft%waves_dev = poisson_fft%waves

allocate (poisson_fft%ax_dev(nx), poisson_fft%bx_dev(nx))
Expand All @@ -71,28 +76,35 @@ function init(mesh, xdirps, ydirps, zdirps) result(poisson_fft)
poisson_fft%ay_dev = poisson_fft%ay; poisson_fft%by_dev = poisson_fft%by
poisson_fft%az_dev = poisson_fft%az; poisson_fft%bz_dev = poisson_fft%bz

allocate (poisson_fft%c_w_dev(nx/2 + 1, ny, nz))
allocate (poisson_fft%fft_worksize((nx/2 + 1)*ny*nz))

! 3D plans
ierr = cufftCreate(poisson_fft%plan3D_fw)
ierr = cufftMpAttachComm(poisson_fft%plan3D_fw, CUFFT_COMM_MPI, &
MPI_COMM_WORLD)
ierr = cufftMakePlan3D(poisson_fft%plan3D_fw, nz, ny, nx, CUFFT_D2Z, &
worksize)
ierr = cufftSetWorkArea(poisson_fft%plan3D_fw, poisson_fft%fft_worksize)
if (ierr /= 0) then
write (stderr, *), "cuFFT Error Code: ", ierr
write (stderr, *), 'cuFFT Error Code: ', ierr
error stop 'Forward 3D FFT plan generation failed'
end if

ierr = cufftCreate(poisson_fft%plan3D_bw)
ierr = cufftMpAttachComm(poisson_fft%plan3D_bw, CUFFT_COMM_MPI, &
MPI_COMM_WORLD)
ierr = cufftMakePlan3D(poisson_fft%plan3D_bw, nz, ny, nx, CUFFT_Z2D, &
worksize)
ierr = cufftSetWorkArea(poisson_fft%plan3D_bw, poisson_fft%fft_worksize)
if (ierr /= 0) then
write (stderr, *), "cuFFT Error Code: ", ierr
write (stderr, *), 'cuFFT Error Code: ', ierr
error stop 'Backward 3D FFT plan generation failed'
end if

! allocate storage for cuFFTMp
ierr = cufftXtMalloc(poisson_fft%plan3D_fw, poisson_fft%xtdesc, &
CUFFT_XT_FORMAT_INPLACE)
if (ierr /= 0) then
write (stderr, *), 'cuFFT Error Code: ', ierr
error stop 'cufftXtMalloc failed'
end if

end function init

subroutine fft_forward_cuda(self, f)
Expand All @@ -101,23 +113,32 @@ subroutine fft_forward_cuda(self, f)
class(cuda_poisson_fft_t) :: self
class(field_t), intent(in) :: f

real(dp), device, pointer, dimension(:, :, :) :: f_dev
real(dp), device, pointer :: f_ptr
type(c_ptr) :: f_c_ptr
real(dp), device, pointer :: flat_dev(:, :), d_dev(:, :, :)

type(cudaXtDesc), pointer :: descriptor

type(dim3) :: blocks, threads
integer :: ierr

select type (f); type is (cuda_field_t); f_dev => f%data_d; end select
select type (f)
type is (cuda_field_t)
flat_dev(1:self%nx_loc, 1:self%ny_loc*self%nz_loc) => f%data_d
end select

call c_f_pointer(self%xtdesc%descriptor, descriptor)
call c_f_pointer(descriptor%data(1), d_dev, &
[self%nx_loc + 2, self%ny_loc*self%nz_loc])
ierr = cudaMemcpy2D(d_dev, self%nx_loc + 2, flat_dev, self%nx_loc, &
self%nx_loc, self%ny_loc*self%nz_loc)
if (ierr /= 0) then
print *, 'cudaMemcpy2D error code: ', ierr
error stop 'cudaMemcpy2D failed'
end if

! Using f_dev directly in cufft call causes a segfault
! Pointer switches below fixes the problem
f_c_ptr = c_loc(f_dev)
call c_f_pointer(f_c_ptr, f_ptr)
ierr = cufftXtExecDescriptor(self%plan3D_fw, self%xtdesc, self%xtdesc, &
CUFFT_FORWARD)

ierr = cufftExecD2Z(self%plan3D_fw, f_ptr, self%c_w_dev)
if (ierr /= 0) then
write (stderr, *), "cuFFT Error Code: ", ierr
write (stderr, *), 'cuFFT Error Code: ', ierr
error stop 'Forward 3D FFT execution failed'
end if

Expand All @@ -129,24 +150,32 @@ subroutine fft_backward_cuda(self, f)
class(cuda_poisson_fft_t) :: self
class(field_t), intent(inout) :: f

real(dp), device, pointer, dimension(:, :, :) :: f_dev
real(dp), device, pointer :: f_ptr
type(c_ptr) :: f_c_ptr
real(dp), device, pointer :: flat_dev(:, :), d_dev(:, :, :)

type(cudaXtDesc), pointer :: descriptor

type(dim3) :: blocks, threads
integer :: ierr

select type (f); type is (cuda_field_t); f_dev => f%data_d; end select
ierr = cufftXtExecDescriptor(self%plan3D_bw, self%xtdesc, self%xtdesc, &
CUFFT_INVERSE)
if (ierr /= 0) then
write (stderr, *), 'cuFFT Error Code: ', ierr
error stop 'Backward 3D FFT execution failed'
end if

! Using f_dev directly in cufft call causes a segfault
! Pointer switches below fixes the problem
f_c_ptr = c_loc(f_dev)
call c_f_pointer(f_c_ptr, f_ptr)
select type (f)
type is (cuda_field_t)
flat_dev(1:self%nx_loc, 1:self%ny_loc*self%nz_loc) => f%data_d
end select

ierr = cufftExecZ2D(self%plan3D_bw, self%c_w_dev, f_ptr)
call c_f_pointer(self%xtdesc%descriptor, descriptor)
call c_f_pointer(descriptor%data(1), d_dev, &
[self%nx_loc + 2, self%ny_loc*self%nz_loc])
ierr = cudaMemcpy2D(flat_dev, self%nx_loc, d_dev, self%nx_loc + 2, &
self%nx_loc, self%ny_loc*self%nz_loc)
if (ierr /= 0) then
write (stderr, *), "cuFFT Error Code: ", ierr
error stop 'Backward 3D FFT execution failed'
print *, 'cudaMemcpy2D error code: ', ierr
error stop 'cudaMemcpy2D failed'
end if

end subroutine fft_backward_cuda
Expand All @@ -156,19 +185,27 @@ subroutine fft_postprocess_cuda(self)

class(cuda_poisson_fft_t) :: self

type(cudaXtDesc), pointer :: descriptor

complex(dp), device, dimension(:, :, :), pointer :: c_dev
type(dim3) :: blocks, threads
integer :: tsize

! obtain a pointer to descriptor so that we can carry out postprocessing
call c_f_pointer(self%xtdesc%descriptor, descriptor)
call c_f_pointer(descriptor%data(1), c_dev, &
[self%nx_spec, self%ny_spec, self%nz_spec])

! tsize is different than SZ, because here we work on a 3D Cartesian
! data structure, and free to specify any suitable thread/block size.
tsize = 16
blocks = dim3((self%ny - 1)/tsize + 1, self%nz, 1)
blocks = dim3((self%ny_spec - 1)/tsize + 1, self%nz_spec, 1)
threads = dim3(tsize, 1, 1)

! Postprocess div_u in spectral space
call process_spectral_div_u<<<blocks, threads>>>( & !&
self%c_w_dev, self%waves_dev, self%nx, self%ny, self%nz, &
c_dev, self%waves_dev, self%nx_spec, self%ny_spec, self%y_sp_st, &
self%nx_glob, self%ny_glob, self%nz_glob, &
self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, &
self%az_dev, self%bz_dev &
)
Expand Down
Loading

0 comments on commit 38e281d

Please sign in to comment.