Skip to content

Commit

Permalink
Merge pull request #6 from semi-h/distributed
Browse files Browse the repository at this point in the history
Add a distributed tridiagonal solver.
  • Loading branch information
semi-h authored Sep 15, 2023
2 parents 35e7219 + 85658b2 commit 549f935
Show file tree
Hide file tree
Showing 6 changed files with 415 additions and 0 deletions.
5 changes: 5 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
set(SRC
allocator.f90
common.f90
derparams.f90
thomas.f90
stencil.f90
stencil_definitions.f90
Expand All @@ -9,6 +11,7 @@ set(SRC
)
set(CUDASRC
cuda/cuda_allocator.f90
cuda/kernels_dist.f90
)

if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI")
Expand All @@ -20,6 +23,8 @@ target_include_directories(x3d2 INTERFACE ${CMAKE_CURRENT_BINARY_DIR})

if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI")
target_compile_options(x3d2 PRIVATE "-cuda")
target_compile_options(x3d2 PRIVATE "-O3")
target_compile_options(x3d2 PRIVATE "-fast")
target_link_options(x3d2 INTERFACE "-cuda")
endif()

Expand Down
7 changes: 7 additions & 0 deletions src/common.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
module m_common
implicit none

integer, parameter :: dp=kind(0.0d0), SZ=32
real(dp), parameter :: pi = 4*atan(1.0_dp)

end module m_common
153 changes: 153 additions & 0 deletions src/cuda/kernels_dist.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
module m_cuda_kernels_dist
use cudafor

use m_common, only: dp

implicit none

contains

attributes(global) subroutine der_univ_dist( &
du, send_u_b, send_u_e, u, u_b, u_e, coeffs_b, coeffs_e, coeffs, n, &
alfa, ffr, fbc &
)
implicit none

! Arguments
real(dp), device, intent(out), dimension(:, :, :) :: du, send_u_b, &
send_u_e
real(dp), device, intent(in), dimension(:, :, :) :: u, u_b, u_e
real(dp), device, intent(in), dimension(:) :: ffr, fbc
real(dp), device, intent(in), dimension(:, :) :: coeffs_b, coeffs_e
real(dp), device, intent(in), dimension(:) :: coeffs
real(dp), value, intent(in) :: alfa
integer, value, intent(in) :: n

! Local variables
integer :: i, j, b, k, lj
integer :: jm2, jm1, jp1, jp2
integer :: n_s, n_m, n_b, n_e !stencil, middle, begin, end

real(dp) :: temp_du, c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4

i = threadIdx%x
b = blockIdx%x

n_s = (size(coeffs)-1)/2
n_m = size(coeffs)
n_b = size(coeffs_b, dim=2)
n_e = size(coeffs_e, dim=2)

! store bulk coeffs in the registers
c_m4 = coeffs(1); c_m3 = coeffs(2); c_m2 = coeffs(3); c_m1 = coeffs(4)
c_j = coeffs(5)
c_p1 = coeffs(6); c_p2 = coeffs(7); c_p3 = coeffs(8); c_p4 = coeffs(9)

du(i, 1, b) = coeffs(1)*u_b(i, 1, b) + coeffs(2)*u_b(i, 2, b) &
+ coeffs(3)*u_b(i, 3, b) + coeffs(4)*u_b(i, 4, b) &
+ coeffs(5)*u(i, 1, b) &
+ coeffs(6)*u(i, 2, b) + coeffs(7)*u(i, 3, b) &
+ coeffs(8)*u(i, 4, b) + coeffs(9)*u(i, 5, b)
du(i, 2, b) = coeffs(1)*u_b(i, 2, b) + coeffs(2)*u_b(i, 3, b) &
+ coeffs(3)*u_b(i, 4, b) + coeffs(4)*u(i, 1, b) &
+ coeffs(5)*u(i, 2, b) &
+ coeffs(6)*u(i, 3, b) + coeffs(7)*u(i, 4, b) &
+ coeffs(8)*u(i, 5, b) + coeffs(9)*u(i, 6, b)
du(i, 3, b) = coeffs(1)*u_b(i, 3, b) + coeffs(2)*u_b(i, 4, b) &
+ coeffs(3)*u(i, 1, b) + coeffs(4)*u(i, 2, b) &
+ coeffs(5)*u(i, 3, b) &
+ coeffs(6)*u(i, 4, b) + coeffs(7)*u(i, 5, b) &
+ coeffs(8)*u(i, 6, b) + coeffs(9)*u(i, 7, b)
du(i, 3, b) = ffr(3)*(du(i, 3, b) - alfa*du(i, 2, b))
du(i, 4, b) = coeffs(1)*u_b(i, 4, b) + coeffs(2)*u(i, 1, b) &
+ coeffs(3)*u(i, 2, b) + coeffs(4)*u(i, 3, b) &
+ coeffs(5)*u(i, 4, b) &
+ coeffs(6)*u(i, 5, b) + coeffs(7)*u(i, 6, b) &
+ coeffs(8)*u(i, 7, b) + coeffs(9)*u(i, 8, b)
du(i, 4, b) = ffr(4)*(du(i, 4, b) - alfa*du(i, 3, b))

do j = n_s+1, n-n_s
temp_du = c_m4*u(i, j-4, b) + c_m3*u(i, j-3, b) &
+ c_m2*u(i, j-2, b) + c_m1*u(i, j-1, b) &
+ c_j*u(i, j, b) &
+ c_p1*u(i, j+1, b) + c_p2*u(i, j+2, b) &
+ c_p3*u(i, j+3, b) + c_p4*u(i, j+4, b)
du(i, j, b) = ffr(j)*(temp_du - alfa*du(i, j-1, b))
end do

j = n-3
du(i, j, b) = coeffs(1)*u(i, j-4, b) + coeffs(2)*u(i, j-3, b) &
+ coeffs(3)*u(i, j-2, b) + coeffs(4)*u(i, j-1, b) &
+ coeffs(5)*u(i, j, b) &
+ coeffs(6)*u(i, j+1, b) + coeffs(7)*u(i, j+2, b) &
+ coeffs(8)*u(i, j+3, b) + coeffs(9)*u_e(i, 1, b)
du(i, j, b) = ffr(j)*(du(i, j, b) - alfa*du(i, j-1, b))
j = n-2
du(i, j, b) = coeffs(1)*u(i, j-4, b) + coeffs(2)*u(i, j-3, b) &
+ coeffs(3)*u(i, j-2, b) + coeffs(4)*u(i, j-1, b) &
+ coeffs(5)*u(i, j, b) &
+ coeffs(6)*u(i, j+1, b) + coeffs(7)*u(i, j+2, b) &
+ coeffs(8)*u_e(i, 1, b) + coeffs(9)*u_e(i, 2, b)
du(i, j, b) = ffr(j)*(du(i, j, b) - alfa*du(i, j-1, b))
j = n-1
du(i, j, b) = coeffs(1)*u(i, j-4, b) + coeffs(2)*u(i, j-3, b) &
+ coeffs(3)*u(i, j-2, b) + coeffs(4)*u(i, j-1, b) &
+ coeffs(5)*u(i, j, b) &
+ coeffs(6)*u(i, j+1, b) + coeffs(7)*u_e(i, 1, b) &
+ coeffs(8)*u_e(i, 2, b) + coeffs(9)*u_e(i, 3, b)
du(i, j, b) = ffr(j)*(du(i, j, b) - alfa*du(i, j-1, b))
j = n
du(i, j, b) = coeffs(1)*u(i, j-4, b) + coeffs(2)*u(i, j-3, b) &
+ coeffs(3)*u(i, j-2, b) + coeffs(4)*u(i, j-1, b) &
+ coeffs(5)*u(i, j, b) &
+ coeffs(6)*u_e(i, 1, b) + coeffs(7)*u_e(i, 2, b) &
+ coeffs(8)*u_e(i, 3, b) + coeffs(9)*u_e(i, 4, b)
du(i, j, b) = ffr(j)*(du(i, j, b) - alfa*du(i, j-1, b))

send_u_e(i, 1, b) = du(i, n, b)

! Backward pass of the hybrid algorithm
do j = n - 2, 2, -1
du(i, j, b) = du(i, j, b) - fbc(j)*du(i, j + 1, b)
end do
du(i, 1, b) = ffr(1)*(du(i, 1, b) - fbc(1)*du(i, 2, b))
send_u_b(i, 1, b) = du(i, 1, b)

end subroutine der_univ_dist

attributes(global) subroutine der_univ_subs(du, recv_u_b, recv_u_e, &
n, alfa, dist_sa, dist_sc)
implicit none

! Arguments
real(dp), device, intent(out), dimension(:, :, :) :: du
real(dp), device, intent(in), dimension(:, :, :) :: recv_u_b, recv_u_e
real(dp), device, intent(in), dimension(:) :: dist_sa, dist_sc
real(dp), value, intent(in) :: alfa
integer, value, intent(in) :: n

! Local variables
integer :: i, j, b
real(dp) :: ur, bl, recp, du_1, du_n

i = threadIdx%x
b = blockIdx%x

bl = dist_sa(1)
ur = dist_sc(n)
recp = 1._dp/(1._dp - ur*bl)

!du(i, 1, b) = recp*(du(i, 1, b) - bl*recv_u_b(i, 1, b))
!du(i, n, b) = recp*(du(i, n, b) - ur*recv_u_e(i, 1, b))
du_1 = recp*(du(i, 1, b) - bl*recv_u_b(i, 1, b))
du_n = recp*(du(i, n, b) - ur*recv_u_e(i, 1, b))

du(i, 1, b) = du_1
do j = 2, n-1
du(i, j, b) = (du(i, j, b) - dist_sa(j)*du_1 - dist_sc(j)*du_n)
end do
du(i, n, b) = du_n

end subroutine der_univ_subs

end module m_cuda_kernels_dist
107 changes: 107 additions & 0 deletions src/derparams.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
module m_derparams
use m_common, only: dp

implicit none

contains

subroutine der_1_vv()
implicit none
end subroutine der_1_vv

subroutine der_2_vv(coeffs, coeffs_b, coeffs_e, &
dist_fr, dist_bc, dist_sa, dist_sc, &
n_halo, alfa, dx2, n, bcond)
implicit none

real(dp), allocatable, dimension(:), intent(out) :: coeffs, &
dist_fr, dist_bc, dist_sa, dist_sc
real(dp), allocatable, dimension(:,:), intent(out) :: coeffs_b, coeffs_e
integer, intent(out) :: n_halo
real(dp), intent(out) :: alfa
real(dp), intent(in) :: dx2
integer, intent(in) :: n
character(len=*), intent(in) :: bcond

real(dp), allocatable :: dist_b(:)
real(dp) :: asi, bsi, csi, dsi
integer :: i, n_stencil

allocate(dist_sa(n), dist_sc(n), dist_b(n))

alfa = 2._dp/11._dp
asi = 12._dp/11._dp/dx2
bsi = 3._dp/44._dp/dx2
csi = 0._dp
dsi = 0._dp

n_halo = 4
n_stencil = 2*n_halo+1

coeffs = [dsi, csi, bsi, asi, &
-2._dp*(asi+bsi+csi+dsi), &
asi, bsi, csi, dsi]

select case (bcond)
case ('periodic')
dist_sa(:) = alfa; dist_sc(:) = alfa; dist_b(:) = 1._dp
allocate(coeffs_b(n_halo, n_stencil))
allocate(coeffs_e(n_halo, n_stencil))
do i = 1, n_halo
coeffs_b(i,:) = coeffs(:)
coeffs_e(i,:) = coeffs(:)
end do
case default
print*, 'Boundary condition is not recognized :', bcond
end select

call process_dist(dist_fr, dist_bc, dist_sa, dist_sc, dist_b, n)

end subroutine der_2_vv

subroutine process_dist(dist_fr, dist_bc, dist_sa, dist_sc, dist_b, n)
implicit none

real(dp), allocatable, dimension(:), intent(out) :: dist_fr, dist_bc
real(dp), dimension(:), intent(inout) :: dist_sa, dist_sc, dist_b
integer, intent(in) :: n

integer :: i, nrank, nproc, m

m = n
nrank = 0; nproc = 1

allocate(dist_fr(n), dist_bc(n))

do nrank = 0, nproc-1

! start the hybrid algorithm
do i = 1+m*nrank, 2+m*nrank
dist_sa(i) = dist_sa(i)/dist_b(i)
dist_sc(i) = dist_sc(i)/dist_b(i)
dist_bc(i) = dist_sc(i)
end do
do i = 3+m*nrank, m+m*nrank
dist_fr(i) = 1.d0/(dist_b(i)-dist_sa(i)*dist_sc(i-1))
dist_sa(i) = -dist_fr(i)*dist_sa(i)*dist_sa(i-1)
dist_sc(i) = dist_fr(i)*dist_sc(i)
!dist_bc(i) = dist_sc(i)
end do
do i = m-2+m*nrank, 2+m*nrank, -1
dist_sa(i) = dist_sa(i)-dist_sc(i)*dist_sa(i+1)
dist_bc(i) = dist_sc(i)
dist_sc(i) = -dist_sc(i)*dist_sc(i+1)
end do
! this is not good
dist_fr(1+m*nrank) = 1.d0/(1.d0-dist_sc(1+m*nrank)*dist_sa(2+m*nrank))

dist_sa(1+m*nrank) = dist_fr(1+m*nrank)*dist_sa(1+m*nrank)
dist_sc(1+m*nrank) = -dist_fr(1+m*nrank)*dist_sc(1+m*nrank) &
*dist_sc(2+m*nrank)
!dist_bc(1+m*nrank) = dist_sc(1+m*nrank)
end do

end subroutine process_dist

end module m_derparams

5 changes: 5 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set(TESTSRC
)
set(CUDATESTSRC
cuda/test_cuda_allocator.f90
cuda/test_cuda_tridiag.f90
)

if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI")
Expand All @@ -16,6 +17,10 @@ foreach(testfile IN LISTS TESTSRC)
get_filename_component(test_name ${testfile} NAME_WE)

add_executable(${test_name} ${testfile})
target_compile_options(${test_name} PRIVATE "-cuda")
target_compile_options(${test_name} PRIVATE "-O3")
target_compile_options(${test_name} PRIVATE "-fast")
target_link_options(${test_name} INTERFACE "-cuda")
target_link_libraries(${test_name} PRIVATE x3d2)

add_test(NAME ${test_name} COMMAND ${test_name})
Expand Down
Loading

0 comments on commit 549f935

Please sign in to comment.