From 1815604a0cf6c984730588edbf891f5c9db9289f Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 26 Oct 2023 14:47:14 +0100 Subject: [PATCH] feat: Add support for non-periodic BCs in the dist algorithm. --- src/cuda/kernels_dist.f90 | 160 ++++++++++++++++++------------- src/derparams.f90 | 12 +-- src/omp/kernels_dist.f90 | 126 ++++++++++++------------ tests/cuda/test_cuda_tridiag.f90 | 52 +++++----- tests/omp/test_omp_tridiag.f90 | 6 +- 5 files changed, 193 insertions(+), 163 deletions(-) diff --git a/src/cuda/kernels_dist.f90 b/src/cuda/kernels_dist.f90 index 8de15811..0c8de052 100644 --- a/src/cuda/kernels_dist.f90 +++ b/src/cuda/kernels_dist.f90 @@ -8,16 +8,16 @@ module m_cuda_kernels_dist contains attributes(global) subroutine der_univ_dist( & - du, send_u_b, send_u_e, u, u_b, u_e, coeffs_b, coeffs_e, coeffs, n, & + du, send_u_s, send_u_e, u, u_s, u_e, coeffs_s, coeffs_e, coeffs, n, & ffr, fbc, faf & ) implicit none ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du, send_u_b, & + real(dp), device, intent(out), dimension(:, :, :) :: du, send_u_s, & send_u_e - real(dp), device, intent(in), dimension(:, :, :) :: u, u_b, u_e - real(dp), device, intent(in), dimension(:, :) :: coeffs_b, coeffs_e + real(dp), device, intent(in), dimension(:, :, :) :: u, u_s, u_e + real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e real(dp), device, intent(in), dimension(:) :: coeffs integer, value, intent(in) :: n real(dp), device, intent(in), dimension(:) :: ffr, fbc, faf @@ -38,70 +38,102 @@ attributes(global) subroutine der_univ_dist( & c_p1 = coeffs(6); c_p2 = coeffs(7); c_p3 = coeffs(8); c_p4 = coeffs(9) last_r = ffr(1) - du(i, 1, b) = coeffs(1)*u_b(i, 1, b) + coeffs(2)*u_b(i, 2, b) & - + coeffs(3)*u_b(i, 3, b) + coeffs(4)*u_b(i, 4, b) & - + coeffs(5)*u(i, 1, b) & - + coeffs(6)*u(i, 2, b) + coeffs(7)*u(i, 3, b) & - + coeffs(8)*u(i, 4, b) + coeffs(9)*u(i, 5, b) + du(i, 1, b) = coeffs_s(1, 1)*u_s(i, 1, b) & + + coeffs_s(2, 1)*u_s(i, 2, b) & + + coeffs_s(3, 1)*u_s(i, 3, b) & + + coeffs_s(4, 1)*u_s(i, 4, b) & + + coeffs_s(5, 1)*u(i, 1, b) & + + coeffs_s(6, 1)*u(i, 2, b) & + + coeffs_s(7, 1)*u(i, 3, b) & + + coeffs_s(8, 1)*u(i, 4, b) & + + coeffs_s(9, 1)*u(i, 5, b) du(i, 1, b) = du(i, 1, b)*faf(1) - du(i, 2, b) = coeffs(1)*u_b(i, 2, b) + coeffs(2)*u_b(i, 3, b) & - + coeffs(3)*u_b(i, 4, b) + coeffs(4)*u(i, 1, b) & - + coeffs(5)*u(i, 2, b) & - + coeffs(6)*u(i, 3, b) + coeffs(7)*u(i, 4, b) & - + coeffs(8)*u(i, 5, b) + coeffs(9)*u(i, 6, b) + du(i, 2, b) = coeffs_s(1, 2)*u_s(i, 2, b) & + + coeffs_s(2, 2)*u_s(i, 3, b) & + + coeffs_s(3, 2)*u_s(i, 4, b) & + + coeffs_s(4, 2)*u(i, 1, b) & + + coeffs_s(5, 2)*u(i, 2, b) & + + coeffs_s(6, 2)*u(i, 3, b) & + + coeffs_s(7, 2)*u(i, 4, b) & + + coeffs_s(8, 2)*u(i, 5, b) & + + coeffs_s(9, 2)*u(i, 6, b) du(i, 2, b) = du(i, 2, b)*faf(2) - du(i, 3, b) = coeffs(1)*u_b(i, 3, b) + coeffs(2)*u_b(i, 4, b) & - + coeffs(3)*u(i, 1, b) + coeffs(4)*u(i, 2, b) & - + coeffs(5)*u(i, 3, b) & - + coeffs(6)*u(i, 4, b) + coeffs(7)*u(i, 5, b) & - + coeffs(8)*u(i, 6, b) + coeffs(9)*u(i, 7, b) + du(i, 3, b) = coeffs_s(1, 3)*u_s(i, 3, b) & + + coeffs_s(2, 3)*u_s(i, 4, b) & + + coeffs_s(3, 3)*u(i, 1, b) & + + coeffs_s(4, 3)*u(i, 2, b) & + + coeffs_s(5, 3)*u(i, 3, b) & + + coeffs_s(6, 3)*u(i, 4, b) & + + coeffs_s(7, 3)*u(i, 5, b) & + + coeffs_s(8, 3)*u(i, 6, b) & + + coeffs_s(9, 3)*u(i, 7, b) du(i, 3, b) = ffr(3)*(du(i, 3, b) - faf(3)*du(i, 2, b)) - du(i, 4, b) = coeffs(1)*u_b(i, 4, b) + coeffs(2)*u(i, 1, b) & - + coeffs(3)*u(i, 2, b) + coeffs(4)*u(i, 3, b) & - + coeffs(5)*u(i, 4, b) & - + coeffs(6)*u(i, 5, b) + coeffs(7)*u(i, 6, b) & - + coeffs(8)*u(i, 7, b) + coeffs(9)*u(i, 8, b) + du(i, 4, b) = coeffs_s(1, 4)*u_s(i, 4, b) & + + coeffs_s(2, 4)*u(i, 1, b) & + + coeffs_s(3, 4)*u(i, 2, b) & + + coeffs_s(4, 4)*u(i, 3, b) & + + coeffs_s(5, 4)*u(i, 4, b) & + + coeffs_s(6, 4)*u(i, 5, b) & + + coeffs_s(7, 4)*u(i, 6, b) & + + coeffs_s(8, 4)*u(i, 7, b) & + + coeffs_s(9, 4)*u(i, 8, b) du(i, 4, b) = ffr(4)*(du(i, 4, b) - faf(3)*du(i, 3, b)) alpha = faf(5) - do j = 5, n-4 - temp_du = c_m4*u(i, j-4, b) + c_m3*u(i, j-3, b) & - + c_m2*u(i, j-2, b) + c_m1*u(i, j-1, b) & - + c_j*u(i, j, b) & - + c_p1*u(i, j+1, b) + c_p2*u(i, j+2, b) & - + c_p3*u(i, j+3, b) + c_p4*u(i, j+4, b) - du(i, j, b) = ffr(j)*(temp_du - alpha*du(i, j-1, b)) + do j = 5, n - 4 + temp_du = c_m4*u(i, j - 4, b) + c_m3*u(i, j - 3, b) & + + c_m2*u(i, j - 2, b) + c_m1*u(i, j - 1, b) & + + c_j*u(i, j, b) & + + c_p1*u(i, j + 1, b) + c_p2*u(i, j + 2, b) & + + c_p3*u(i, j + 3, b) + c_p4*u(i, j + 4, b) + du(i, j, b) = ffr(j)*(temp_du - alpha*du(i, j - 1, b)) end do - j = n-3 - du(i, j, b) = coeffs(1)*u(i, j-4, b) + coeffs(2)*u(i, j-3, b) & - + coeffs(3)*u(i, j-2, b) + coeffs(4)*u(i, j-1, b) & - + coeffs(5)*u(i, j, b) & - + coeffs(6)*u(i, j+1, b) + coeffs(7)*u(i, j+2, b) & - + coeffs(8)*u(i, j+3, b) + coeffs(9)*u_e(i, 1, b) - du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j-1, b)) - j = n-2 - du(i, j, b) = coeffs(1)*u(i, j-4, b) + coeffs(2)*u(i, j-3, b) & - + coeffs(3)*u(i, j-2, b) + coeffs(4)*u(i, j-1, b) & - + coeffs(5)*u(i, j, b) & - + coeffs(6)*u(i, j+1, b) + coeffs(7)*u(i, j+2, b) & - + coeffs(8)*u_e(i, 1, b) + coeffs(9)*u_e(i, 2, b) - du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j-1, b)) - j = n-1 - du(i, j, b) = coeffs(1)*u(i, j-4, b) + coeffs(2)*u(i, j-3, b) & - + coeffs(3)*u(i, j-2, b) + coeffs(4)*u(i, j-1, b) & - + coeffs(5)*u(i, j, b) & - + coeffs(6)*u(i, j+1, b) + coeffs(7)*u_e(i, 1, b) & - + coeffs(8)*u_e(i, 2, b) + coeffs(9)*u_e(i, 3, b) - du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j-1, b)) + j = n - 3 + du(i, j, b) = coeffs_e(1, 1)*u(i, j - 4, b) & + + coeffs_e(2, 1)*u(i, j - 3, b) & + + coeffs_e(3, 1)*u(i, j - 2, b) & + + coeffs_e(4, 1)*u(i, j - 1, b) & + + coeffs_e(5, 1)*u(i, j, b) & + + coeffs_e(6, 1)*u(i, j + 1, b) & + + coeffs_e(7, 1)*u(i, j + 2, b) & + + coeffs_e(8, 1)*u(i, j + 3, b) & + + coeffs_e(9, 1)*u_e(i, 1, b) + du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j - 1, b)) + j = n - 2 + du(i, j, b) = coeffs_e(1, 2)*u(i, j - 4, b) & + + coeffs_e(2, 2)*u(i, j - 3, b) & + + coeffs_e(3, 2)*u(i, j - 2, b) & + + coeffs_e(4, 2)*u(i, j - 1, b) & + + coeffs_e(5, 2)*u(i, j, b) & + + coeffs_e(6, 2)*u(i, j + 1, b) & + + coeffs_e(7, 2)*u(i, j + 2, b) & + + coeffs_e(8, 2)*u_e(i, 1, b) & + + coeffs_e(9, 2)*u_e(i, 2, b) + du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j - 1, b)) + j = n - 1 + du(i, j, b) = coeffs_e(1, 3)*u(i, j - 4, b) & + + coeffs_e(2, 3)*u(i, j - 3, b) & + + coeffs_e(3, 3)*u(i, j - 2, b) & + + coeffs_e(4, 3)*u(i, j - 1, b) & + + coeffs_e(5, 3)*u(i, j, b) & + + coeffs_e(6, 3)*u(i, j + 1, b) & + + coeffs_e(7, 3)*u_e(i, 1, b) & + + coeffs_e(8, 3)*u_e(i, 2, b) & + + coeffs_e(9, 3)*u_e(i, 3, b) + du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j - 1, b)) j = n - du(i, j, b) = coeffs(1)*u(i, j-4, b) + coeffs(2)*u(i, j-3, b) & - + coeffs(3)*u(i, j-2, b) + coeffs(4)*u(i, j-1, b) & - + coeffs(5)*u(i, j, b) & - + coeffs(6)*u_e(i, 1, b) + coeffs(7)*u_e(i, 2, b) & - + coeffs(8)*u_e(i, 3, b) + coeffs(9)*u_e(i, 4, b) - du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j-1, b)) + du(i, j, b) = coeffs_e(1, 4)*u(i, j - 4, b) & + + coeffs_e(2, 4)*u(i, j - 3, b) & + + coeffs_e(3, 4)*u(i, j - 2, b) & + + coeffs_e(4, 4)*u(i, j - 1, b) & + + coeffs_e(5, 4)*u(i, j, b) & + + coeffs_e(6, 4)*u_e(i, 1, b) & + + coeffs_e(7, 4)*u_e(i, 2, b) & + + coeffs_e(8, 4)*u_e(i, 3, b) & + + coeffs_e(9, 4)*u_e(i, 4, b) + du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j - 1, b)) send_u_e(i, 1, b) = du(i, n, b) @@ -110,17 +142,17 @@ attributes(global) subroutine der_univ_dist( & du(i, j, b) = du(i, j, b) - fbc(j)*du(i, j + 1, b) end do du(i, 1, b) = last_r*(du(i, 1, b) - fbc(1)*du(i, 2, b)) - send_u_b(i, 1, b) = du(i, 1, b) + send_u_s(i, 1, b) = du(i, 1, b) end subroutine der_univ_dist - attributes(global) subroutine der_univ_subs(du, recv_u_b, recv_u_e, & + attributes(global) subroutine der_univ_subs(du, recv_u_s, recv_u_e, & n, dist_sa, dist_sc) implicit none ! Arguments real(dp), device, intent(out), dimension(:, :, :) :: du - real(dp), device, intent(in), dimension(:, :, :) :: recv_u_b, recv_u_e + real(dp), device, intent(in), dimension(:, :, :) :: recv_u_s, recv_u_e real(dp), device, intent(in), dimension(:) :: dist_sa, dist_sc integer, value, intent(in) :: n @@ -135,13 +167,11 @@ attributes(global) subroutine der_univ_subs(du, recv_u_b, recv_u_e, & ur = dist_sc(n) recp = 1._dp/(1._dp - ur*bl) - !du(i, 1, b) = recp*(du(i, 1, b) - bl*recv_u_b(i, 1, b)) - !du(i, n, b) = recp*(du(i, n, b) - ur*recv_u_e(i, 1, b)) - du_1 = recp*(du(i, 1, b) - bl*recv_u_b(i, 1, b)) + du_1 = recp*(du(i, 1, b) - bl*recv_u_s(i, 1, b)) du_n = recp*(du(i, n, b) - ur*recv_u_e(i, 1, b)) du(i, 1, b) = du_1 - do j = 2, n-1 + do j = 2, n - 1 du(i, j, b) = (du(i, j, b) - dist_sa(j)*du_1 - dist_sc(j)*du_n) end do du(i, n, b) = du_n diff --git a/src/derparams.f90 b/src/derparams.f90 index 2a79e6b6..da5790ec 100644 --- a/src/derparams.f90 +++ b/src/derparams.f90 @@ -9,14 +9,14 @@ subroutine der_1_vv() implicit none end subroutine der_1_vv - subroutine der_2_vv(coeffs, coeffs_b, coeffs_e, & + subroutine der_2_vv(coeffs, coeffs_s, coeffs_e, & dist_fr, dist_bc, dist_af, dist_sa, dist_sc, & n_halo, dx2, n, bcond) implicit none real(dp), allocatable, dimension(:), intent(out) :: coeffs, & dist_fr, dist_bc, dist_af, dist_sa, dist_sc - real(dp), allocatable, dimension(:,:), intent(out) :: coeffs_b, coeffs_e + real(dp), allocatable, dimension(:,:), intent(out) :: coeffs_s, coeffs_e integer, intent(out) :: n_halo real(dp), intent(in) :: dx2 integer, intent(in) :: n @@ -44,11 +44,11 @@ subroutine der_2_vv(coeffs, coeffs_b, coeffs_e, & select case (bcond) case ('periodic') dist_sa(:) = alfa; dist_sc(:) = alfa; dist_b(:) = 1._dp - allocate(coeffs_b(n_halo, n_stencil)) - allocate(coeffs_e(n_halo, n_stencil)) + allocate(coeffs_s(n_stencil, n_halo)) + allocate(coeffs_e(n_stencil, n_halo)) do i = 1, n_halo - coeffs_b(i,:) = coeffs(:) - coeffs_e(i,:) = coeffs(:) + coeffs_s(:, i) = coeffs(:) + coeffs_e(:, i) = coeffs(:) end do case default print*, 'Boundary condition is not recognized :', bcond diff --git a/src/omp/kernels_dist.f90 b/src/omp/kernels_dist.f90 index b6baf5ac..9786fbe9 100644 --- a/src/omp/kernels_dist.f90 +++ b/src/omp/kernels_dist.f90 @@ -9,15 +9,15 @@ module m_omp_kernels_dist contains subroutine der_univ_dist_omp( & - du, send_u_b, send_u_e, u, u_b, u_e, coeffs_b, coeffs_e, coeffs, n, & + du, send_u_s, send_u_e, u, u_s, u_e, coeffs_s, coeffs_e, coeffs, n, & ffr, fbc, faf & ) implicit none ! Arguments - real(dp), intent(out), dimension(:, :) :: du, send_u_b, send_u_e - real(dp), intent(in), dimension(:, :) :: u, u_b, u_e - real(dp), intent(in), dimension(:, :) :: coeffs_b, coeffs_e + real(dp), intent(out), dimension(:, :) :: du, send_u_s, send_u_e + real(dp), intent(in), dimension(:, :) :: u, u_s, u_e + real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e ! start/end real(dp), intent(in), dimension(:) :: coeffs integer, intent(in) :: n real(dp), intent(in), dimension(:) :: ffr, fbc, faf @@ -37,29 +37,29 @@ subroutine der_univ_dist_omp( & !$omp simd do i = 1, SZ - du(i, 1) = coeffs(1)*u_b(i, 1) + coeffs(2)*u_b(i, 2) & - + coeffs(3)*u_b(i, 3) + coeffs(4)*u_b(i, 4) & - + coeffs(5)*u(i, 1) & - + coeffs(6)*u(i, 2) + coeffs(7)*u(i, 3) & - + coeffs(8)*u(i, 4) + coeffs(9)*u(i, 5) + du(i, 1) = coeffs_s(1, 1)*u_s(i, 1) + coeffs_s(2, 1)*u_s(i, 2) & + + coeffs_s(3, 1)*u_s(i, 3) + coeffs_s(4, 1)*u_s(i, 4) & + + coeffs_s(5, 1)*u(i, 1) & + + coeffs_s(6, 1)*u(i, 2) + coeffs_s(7, 1)*u(i, 3) & + + coeffs_s(8, 1)*u(i, 4) + coeffs_s(9, 1)*u(i, 5) du(i, 1) = du(i, 1)*faf(1) - du(i, 2) = coeffs(1)*u_b(i, 2) + coeffs(2)*u_b(i, 3) & - + coeffs(3)*u_b(i, 4) + coeffs(4)*u(i, 1) & - + coeffs(5)*u(i, 2) & - + coeffs(6)*u(i, 3) + coeffs(7)*u(i, 4) & - + coeffs(8)*u(i, 5) + coeffs(9)*u(i, 6) + du(i, 2) = coeffs_s(1, 2)*u_s(i, 2) + coeffs_s(2, 2)*u_s(i, 3) & + + coeffs_s(3, 2)*u_s(i, 4) + coeffs_s(4, 2)*u(i, 1) & + + coeffs_s(5, 2)*u(i, 2) & + + coeffs_s(6, 2)*u(i, 3) + coeffs_s(7, 2)*u(i, 4) & + + coeffs_s(8, 2)*u(i, 5) + coeffs_s(9, 2)*u(i, 6) du(i, 2) = du(i, 2)*faf(2) - du(i, 3) = coeffs(1)*u_b(i, 3) + coeffs(2)*u_b(i, 4) & - + coeffs(3)*u(i, 1) + coeffs(4)*u(i, 2) & - + coeffs(5)*u(i, 3) & - + coeffs(6)*u(i, 4) + coeffs(7)*u(i, 5) & - + coeffs(8)*u(i, 6) + coeffs(9)*u(i, 7) + du(i, 3) = coeffs_s(1, 3)*u_s(i, 3) + coeffs_s(2, 3)*u_s(i, 4) & + + coeffs_s(3, 3)*u(i, 1) + coeffs_s(4, 3)*u(i, 2) & + + coeffs_s(5, 3)*u(i, 3) & + + coeffs_s(6, 3)*u(i, 4) + coeffs_s(7, 3)*u(i, 5) & + + coeffs_s(8, 3)*u(i, 6) + coeffs_s(9, 3)*u(i, 7) du(i, 3) = ffr(3)*(du(i, 3) - faf(3)*du(i, 2)) - du(i, 4) = coeffs(1)*u_b(i, 4) + coeffs(2)*u(i, 1) & - + coeffs(3)*u(i, 2) + coeffs(4)*u(i, 3) & - + coeffs(5)*u(i, 4) & - + coeffs(6)*u(i, 5) + coeffs(7)*u(i, 6) & - + coeffs(8)*u(i, 7) + coeffs(9)*u(i, 8) + du(i, 4) = coeffs_s(1, 4)*u_s(i, 4) + coeffs_s(2, 4)*u(i, 1) & + + coeffs_s(3, 4)*u(i, 2) + coeffs_s(4, 4)*u(i, 3) & + + coeffs_s(5, 4)*u(i, 4) & + + coeffs_s(6, 4)*u(i, 5) + coeffs_s(7, 4)*u(i, 6) & + + coeffs_s(8, 4)*u(i, 7) + coeffs_s(9, 4)*u(i, 8) du(i, 4) = ffr(4)*(du(i, 4) - faf(4)*du(i, 3)) end do !$omp end simd @@ -67,49 +67,49 @@ subroutine der_univ_dist_omp( & ! alpha is always the same in the bulk region for us alpha = faf(5) - do j = 5, n-4 + do j = 5, n - 4 !$omp simd do i = 1, SZ - temp_du = c_m4*u(i, j-4) + c_m3*u(i, j-3) & - + c_m2*u(i, j-2) + c_m1*u(i, j-1) & - + c_j*u(i, j) & - + c_p1*u(i, j+1) + c_p2*u(i, j+2) & - + c_p3*u(i, j+3) + c_p4*u(i, j+4) - du(i, j) = ffr(j)*(temp_du - alpha*du(i, j-1)) + temp_du = c_m4*u(i, j - 4) + c_m3*u(i, j - 3) & + + c_m2*u(i, j - 2) + c_m1*u(i, j - 1) & + + c_j*u(i, j) & + + c_p1*u(i, j + 1) + c_p2*u(i, j + 2) & + + c_p3*u(i, j + 3) + c_p4*u(i, j + 4) + du(i, j) = ffr(j)*(temp_du - alpha*du(i, j - 1)) end do !$omp end simd end do !$omp simd do i = 1, SZ - j = n-3 - du(i, j) = coeffs(1)*u(i, j-4) + coeffs(2)*u(i, j-3) & - + coeffs(3)*u(i, j-2) + coeffs(4)*u(i, j-1) & - + coeffs(5)*u(i, j) & - + coeffs(6)*u(i, j+1) + coeffs(7)*u(i, j+2) & - + coeffs(8)*u(i, j+3) + coeffs(9)*u_e(i, 1) - du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j-1)) - j = n-2 - du(i, j) = coeffs(1)*u(i, j-4) + coeffs(2)*u(i, j-3) & - + coeffs(3)*u(i, j-2) + coeffs(4)*u(i, j-1) & - + coeffs(5)*u(i, j) & - + coeffs(6)*u(i, j+1) + coeffs(7)*u(i, j+2) & - + coeffs(8)*u_e(i, 1) + coeffs(9)*u_e(i, 2) - du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j-1)) - j = n-1 - du(i, j) = coeffs(1)*u(i, j-4) + coeffs(2)*u(i, j-3) & - + coeffs(3)*u(i, j-2) + coeffs(4)*u(i, j-1) & - + coeffs(5)*u(i, j) & - + coeffs(6)*u(i, j+1) + coeffs(7)*u_e(i, 1) & - + coeffs(8)*u_e(i, 2) + coeffs(9)*u_e(i, 3) - du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j-1)) + j = n - 3 + du(i, j) = coeffs_e(1, 1)*u(i, j - 4) + coeffs_e(2, 1)*u(i, j - 3) & + + coeffs_e(3, 1)*u(i, j - 2) + coeffs_e(4, 1)*u(i, j - 1) & + + coeffs_e(5, 1)*u(i, j) & + + coeffs_e(6, 1)*u(i, j + 1) + coeffs_e(7, 1)*u(i, j + 2) & + + coeffs_e(8, 1)*u(i, j + 3) + coeffs_e(9, 1)*u_e(i, 1) + du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) + j = n - 2 + du(i, j) = coeffs_e(1, 2)*u(i, j - 4) + coeffs_e(2, 2)*u(i, j - 3) & + + coeffs_e(3, 2)*u(i, j - 2) + coeffs_e(4, 2)*u(i, j - 1) & + + coeffs_e(5, 2)*u(i, j) & + + coeffs_e(6, 2)*u(i, j + 1) + coeffs_e(7, 2)*u(i, j + 2) & + + coeffs_e(8, 2)*u_e(i, 1) + coeffs_e(9, 2)*u_e(i, 2) + du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) + j = n - 1 + du(i, j) = coeffs_e(1, 3)*u(i, j - 4) + coeffs_e(2, 3)*u(i, j - 3) & + + coeffs_e(3, 3)*u(i, j - 2) + coeffs_e(4, 3)*u(i, j - 1) & + + coeffs_e(5, 3)*u(i, j) & + + coeffs_e(6, 3)*u(i, j + 1) + coeffs_e(7, 3)*u_e(i, 1) & + + coeffs_e(8, 3)*u_e(i, 2) + coeffs_e(9, 3)*u_e(i, 3) + du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) j = n - du(i, j) = coeffs(1)*u(i, j-4) + coeffs(2)*u(i, j-3) & - + coeffs(3)*u(i, j-2) + coeffs(4)*u(i, j-1) & - + coeffs(5)*u(i, j) & - + coeffs(6)*u_e(i, 1) + coeffs(7)*u_e(i, 2) & - + coeffs(8)*u_e(i, 3) + coeffs(9)*u_e(i, 4) - du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j-1)) + du(i, j) = coeffs_e(1, 4)*u(i, j - 4) + coeffs_e(2, 4)*u(i, j - 3) & + + coeffs_e(3, 4)*u(i, j - 2) + coeffs_e(4, 4)*u(i, j - 1) & + + coeffs_e(5, 4)*u(i, j) & + + coeffs_e(6, 4)*u_e(i, 1) + coeffs_e(7, 4)*u_e(i, 2) & + + coeffs_e(8, 4)*u_e(i, 3) + coeffs_e(9, 4)*u_e(i, 4) + du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) end do !$omp end simd @@ -130,18 +130,18 @@ subroutine der_univ_dist_omp( & !$omp simd do i = 1, SZ du(i, 1) = last_r*(du(i, 1) - fbc(1)*du(i, 2)) - send_u_b(i, 1) = du(i, 1) + send_u_s(i, 1) = du(i, 1) end do !$omp end simd end subroutine der_univ_dist_omp - subroutine der_univ_subs_omp(du, recv_u_b, recv_u_e, n, dist_sa, dist_sc) + subroutine der_univ_subs_omp(du, recv_u_s, recv_u_e, n, dist_sa, dist_sc) implicit none ! Arguments real(dp), intent(out), dimension(:, :) :: du - real(dp), intent(in), dimension(:, :) :: recv_u_b, recv_u_e + real(dp), intent(in), dimension(:, :) :: recv_u_s, recv_u_e real(dp), intent(in), dimension(:) :: dist_sa, dist_sc integer, intent(in) :: n @@ -155,7 +155,7 @@ subroutine der_univ_subs_omp(du, recv_u_b, recv_u_e, n, dist_sa, dist_sc) !$omp simd do i = 1, SZ - du_1 = recp*(du(i, 1) - bl*recv_u_b(i, 1)) + du_1 = recp*(du(i, 1) - bl*recv_u_s(i, 1)) du_n = recp*(du(i, n) - ur*recv_u_e(i, 1)) end do !$omp end simd @@ -165,7 +165,7 @@ subroutine der_univ_subs_omp(du, recv_u_b, recv_u_e, n, dist_sa, dist_sc) du(i, 1) = du_1 end do !$omp end simd - do j = 2, n-1 + do j = 2, n - 1 !$omp simd do i = 1, SZ du(i, j) = (du(i, j) - dist_sa(j)*du_1 - dist_sc(j)*du_n) diff --git a/tests/cuda/test_cuda_tridiag.f90 b/tests/cuda/test_cuda_tridiag.f90 index 66eb50c8..e0129c49 100644 --- a/tests/cuda/test_cuda_tridiag.f90 +++ b/tests/cuda/test_cuda_tridiag.f90 @@ -11,19 +11,19 @@ program test_cuda_tridiag implicit none logical :: allpass = .true. - real(dp), allocatable, dimension(:,:,:) :: u, du, u_b, u_e + real(dp), allocatable, dimension(:,:,:) :: u, du, u_s, u_e real(dp), device, allocatable, dimension(:,:,:) :: u_dev, du_dev - real(dp), device, allocatable, dimension(:,:,:) :: u_recv_b_dev, u_recv_e_dev, & - u_send_b_dev, u_send_e_dev + real(dp), device, allocatable, dimension(:,:,:) :: u_recv_s_dev, u_recv_e_dev, & + u_send_s_dev, u_send_e_dev - real(dp), device, allocatable, dimension(:,:,:) :: send_b_dev, send_e_dev, & - recv_b_dev, recv_e_dev + real(dp), device, allocatable, dimension(:,:,:) :: send_s_dev, send_e_dev, & + recv_s_dev, recv_e_dev - real(dp), allocatable, dimension(:,:) :: coeffs_b, coeffs_e + real(dp), allocatable, dimension(:,:) :: coeffs_s, coeffs_e real(dp), allocatable, dimension(:) :: coeffs, dist_fr, dist_bc, dist_af, & dist_sa, dist_sc - real(dp), device, allocatable, dimension(:,:) :: coeffs_b_dev, coeffs_e_dev + real(dp), device, allocatable, dimension(:,:) :: coeffs_s_dev, coeffs_e_dev real(dp), device, allocatable, dimension(:) :: coeffs_dev, & dist_fr_dev, dist_bc_dev, & dist_af_dev, & @@ -77,14 +77,14 @@ program test_cuda_tridiag u_dev = u ! set up the tridiagonal solver coeffs - call der_2_vv(coeffs, coeffs_b, coeffs_e, dist_fr, dist_bc, dist_af, & + call der_2_vv(coeffs, coeffs_s, coeffs_e, dist_fr, dist_bc, dist_af, & dist_sa, dist_sc, n_halo, dx2, n, 'periodic') n_stencil = n_halo*2 + 1 - allocate(coeffs_b_dev(n_halo, n_stencil), coeffs_e_dev(n_halo, n_stencil)) + allocate(coeffs_s_dev(n_stencil, n_halo), coeffs_e_dev(n_stencil, n_halo)) allocate(coeffs_dev(n_stencil)) - coeffs_b_dev(:, :) = coeffs_b(:, :); coeffs_e_dev(:, :) = coeffs_e(:, :) + coeffs_s_dev(:, :) = coeffs_s(:, :); coeffs_e_dev(:, :) = coeffs_e(:, :) coeffs_dev(:) = coeffs(:) allocate(dist_fr_dev(n), dist_bc_dev(n), dist_af_dev(n), & @@ -94,29 +94,29 @@ program test_cuda_tridiag dist_sa_dev(:) = dist_sa(:); dist_sc_dev(:) = dist_sc(:) ! arrays for exchanging data between ranks - allocate(u_send_b_dev(SZ, n_halo, n_block)) + allocate(u_send_s_dev(SZ, n_halo, n_block)) allocate(u_send_e_dev(SZ, n_halo, n_block)) - allocate(u_recv_b_dev(SZ, n_halo, n_block)) + allocate(u_recv_s_dev(SZ, n_halo, n_block)) allocate(u_recv_e_dev(SZ, n_halo, n_block)) - allocate(send_b_dev(SZ, 1, n_block), send_e_dev(SZ, 1, n_block)) - allocate(recv_b_dev(SZ, 1, n_block), recv_e_dev(SZ, 1, n_block)) + allocate(send_s_dev(SZ, 1, n_block), send_e_dev(SZ, 1, n_block)) + allocate(recv_s_dev(SZ, 1, n_block), recv_e_dev(SZ, 1, n_block)) blocks = dim3(n_block, 1, 1) threads = dim3(SZ, 1, 1) call cpu_time(tstart) do i = 1, n_iters - u_send_b_dev(:,:,:) = u_dev(:,1:4,:) + u_send_s_dev(:,:,:) = u_dev(:,1:4,:) u_send_e_dev(:,:,:) = u_dev(:,n-n_halo+1:n,:) ! halo exchange if (nproc == 1) then - u_recv_b_dev = u_send_e_dev - u_recv_e_dev = u_send_b_dev + u_recv_s_dev = u_send_e_dev + u_recv_e_dev = u_send_s_dev else ! MPI send/recv for multi-rank simulations - call MPI_Isend(u_send_b_dev, SZ*n_halo*n_block, & + call MPI_Isend(u_send_s_dev, SZ*n_halo*n_block, & MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, & mpireq(1), srerr(1)) call MPI_Irecv(u_recv_e_dev, SZ*n_halo*n_block, & @@ -125,7 +125,7 @@ program test_cuda_tridiag call MPI_Isend(u_send_e_dev, SZ*n_halo*n_block, & MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, & mpireq(3), srerr(3)) - call MPI_Irecv(u_recv_b_dev, SZ*n_halo*n_block, & + call MPI_Irecv(u_recv_s_dev, SZ*n_halo*n_block, & MPI_DOUBLE_PRECISION, pprev, tag2, MPI_COMM_WORLD, & mpireq(4), srerr(4)) @@ -134,18 +134,18 @@ program test_cuda_tridiag call der_univ_dist<<>>( & - du_dev, send_b_dev, send_e_dev, u_dev, u_recv_b_dev, u_recv_e_dev, & - coeffs_b_dev, coeffs_e_dev, coeffs_dev, & + du_dev, send_s_dev, send_e_dev, u_dev, u_recv_s_dev, u_recv_e_dev, & + coeffs_s_dev, coeffs_e_dev, coeffs_dev, & n, dist_fr_dev, dist_bc_dev, dist_af_dev & ) ! halo exchange for 2x2 systems if (nproc == 1) then - recv_b_dev = send_e_dev - recv_e_dev = send_b_dev + recv_s_dev = send_e_dev + recv_e_dev = send_s_dev else ! MPI send/recv for multi-rank simulations - call MPI_Isend(send_b_dev, SZ*n_block, & + call MPI_Isend(send_s_dev, SZ*n_block, & MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, & mpireq(1), srerr(1)) call MPI_Irecv(recv_e_dev, SZ*n_block, & @@ -154,7 +154,7 @@ program test_cuda_tridiag call MPI_Isend(send_e_dev, SZ*n_block, & MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, & mpireq(3), srerr(3)) - call MPI_Irecv(recv_b_dev, SZ*n_block, & + call MPI_Irecv(recv_s_dev, SZ*n_block, & MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, & mpireq(4), srerr(4)) @@ -162,7 +162,7 @@ program test_cuda_tridiag end if call der_univ_subs<<>>( & - du_dev, recv_b_dev, recv_e_dev, & + du_dev, recv_s_dev, recv_e_dev, & n, dist_sa_dev, dist_sc_dev & ) end do diff --git a/tests/omp/test_omp_tridiag.f90 b/tests/omp/test_omp_tridiag.f90 index c2a77d66..1ac2f16e 100644 --- a/tests/omp/test_omp_tridiag.f90 +++ b/tests/omp/test_omp_tridiag.f90 @@ -18,7 +18,7 @@ program test_dist_tridiag real(dp), allocatable, dimension(:,:,:) :: send_b, send_e, & recv_b, recv_e - real(dp), allocatable, dimension(:,:) :: coeffs_b, coeffs_e + real(dp), allocatable, dimension(:,:) :: coeffs_s, coeffs_e real(dp), allocatable, dimension(:) :: coeffs, dist_fr, dist_bc, dist_af, & dist_sa, dist_sc @@ -62,7 +62,7 @@ program test_dist_tridiag end do ! set up the tridiagonal solver coeffs - call der_2_vv(coeffs, coeffs_b, coeffs_e, dist_fr, dist_bc, dist_af, & + call der_2_vv(coeffs, coeffs_s, coeffs_e, dist_fr, dist_bc, dist_af, & dist_sa, dist_sc, n_halo, dx2, n, 'periodic') n_stencil = n_halo*2 + 1 @@ -121,7 +121,7 @@ program test_dist_tridiag call der_univ_dist_omp( & du(:, :, k), send_b(:, :, k), send_e(:, :, k), u(:, :, k), & u_recv_b(:, :, k), u_recv_e(:, :, k), & - coeffs_b, coeffs_e, coeffs, n, dist_fr, dist_bc, dist_af & + coeffs_s, coeffs_e, coeffs, n, dist_fr, dist_bc, dist_af & ) end do !$omp end parallel do