From 5290fa4447db37fd2f6db663d77845e837801c52 Mon Sep 17 00:00:00 2001
From: Semih Akkurt
Date: Sat, 9 Sep 2023 14:43:41 +0100
Subject: [PATCH 1/2] feat(cuda): Add a distributed tridiagonal solver.

---
 src/CMakeLists.txt        |   5 ++
 src/common.f90            |   7 ++
 src/cuda/kernels_dist.f90 | 153 ++++++++++++++++++++++++++++++++++++++
 src/derparams.f90         | 107 ++++++++++++++++++++++++++
 4 files changed, 272 insertions(+)
 create mode 100644 src/common.f90
 create mode 100644 src/cuda/kernels_dist.f90
 create mode 100644 src/derparams.f90

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 738ce647..a955aeb3 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,7 @@
 set(SRC
   allocator.f90
+  common.f90
+  derparams.f90
   thomas.f90
   stencil.f90
   stencil_definitions.f90
@@ -9,6 +11,7 @@ set(SRC
 )
 set(CUDASRC
   cuda/cuda_allocator.f90
+  cuda/kernels_dist.f90
 )
 
 if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI")
@@ -20,6 +23,8 @@ target_include_directories(x3d2 INTERFACE ${CMAKE_CURRENT_BINARY_DIR})
 
 if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI")
   target_compile_options(x3d2 PRIVATE "-cuda")
+  target_compile_options(x3d2 PRIVATE "-O3")
+  target_compile_options(x3d2 PRIVATE "-fast")
   target_link_options(x3d2 INTERFACE "-cuda")
 endif()
 
diff --git a/src/common.f90 b/src/common.f90
new file mode 100644
index 00000000..d46e3c49
--- /dev/null
+++ b/src/common.f90
@@ -0,0 +1,7 @@
+module m_common
+  implicit none
+
+  integer, parameter :: dp=kind(0.0d0), SZ=32
+  real(dp), parameter :: pi = 4*atan(1.0_dp)
+
+end module m_common
diff --git a/src/cuda/kernels_dist.f90 b/src/cuda/kernels_dist.f90
new file mode 100644
index 00000000..d757247b
--- /dev/null
+++ b/src/cuda/kernels_dist.f90
@@ -0,0 +1,153 @@
+module m_cuda_kernels_dist
+  use cudafor
+
+  use m_common, only: dp
+
+  implicit none
+
+contains
+
+  attributes(global) subroutine der_univ_dist( &
+    du, send_u_b, send_u_e, u, u_b, u_e, coeffs_b, coeffs_e, coeffs, n, &
+    alfa, ffr, fbc &
+    )
+    implicit none
+
+    ! Arguments
+    real(dp), device, intent(out), dimension(:, :, :) :: du, send_u_b, &
+                                                         send_u_e
+    real(dp), device, intent(in), dimension(:, :, :) :: u, u_b, u_e
+    real(dp), device, intent(in), dimension(:) :: ffr, fbc
+    real(dp), device, intent(in), dimension(:, :) :: coeffs_b, coeffs_e
+    real(dp), device, intent(in), dimension(:) :: coeffs
+    real(dp), value, intent(in) :: alfa
+    integer, value, intent(in) :: n
+
+    ! Local variables
+    integer :: i, j, b, k, lj
+    integer :: jm2, jm1, jp1, jp2
+    integer :: n_s, n_m, n_b, n_e  ! stencil, middle, begin, end
+
+    real(dp) :: temp_du, c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4
+
+    i = threadIdx%x
+    b = blockIdx%x
+
+    n_s = (size(coeffs)-1)/2
+    n_m = size(coeffs)
+    n_b = size(coeffs_b, dim=2)
+    n_e = size(coeffs_e, dim=2)
+
+    ! store bulk coeffs in the registers
+    c_m4 = coeffs(1); c_m3 = coeffs(2); c_m2 = coeffs(3); c_m1 = coeffs(4)
+    c_j = coeffs(5)
+    c_p1 = coeffs(6); c_p2 = coeffs(7); c_p3 = coeffs(8); c_p4 = coeffs(9)
+
+    du(i, 1, b) = coeffs(1)*u_b(i, 1, b) + coeffs(2)*u_b(i, 2, b) &
+                  + coeffs(3)*u_b(i, 3, b) + coeffs(4)*u_b(i, 4, b) &
+                  + coeffs(5)*u(i, 1, b) &
+                  + coeffs(6)*u(i, 2, b) + coeffs(7)*u(i, 3, b) &
+                  + coeffs(8)*u(i, 4, b) + coeffs(9)*u(i, 5, b)
+    du(i, 2, b) = coeffs(1)*u_b(i, 2, b) + coeffs(2)*u_b(i, 3, b) &
+                  + coeffs(3)*u_b(i, 4, b) + coeffs(4)*u(i, 1, b) &
+                  + coeffs(5)*u(i, 2, b) &
+                  + coeffs(6)*u(i, 3, b) + coeffs(7)*u(i, 4, b) &
+                  + coeffs(8)*u(i, 5, b) + coeffs(9)*u(i, 6, b)
+    du(i, 3, b) = coeffs(1)*u_b(i, 3, b) + coeffs(2)*u_b(i, 4, b) &
+                  + coeffs(3)*u(i, 1, b) + coeffs(4)*u(i, 2, b) &
+                  + coeffs(5)*u(i, 3, b) &
+                  + coeffs(6)*u(i, 4, b) + coeffs(7)*u(i, 5, b) &
+                  + coeffs(8)*u(i, 6, b) + coeffs(9)*u(i, 7, b)
+    du(i, 3, b) = ffr(3)*(du(i, 3, b) - alfa*du(i, 2, b))
+    du(i, 4, b) = coeffs(1)*u_b(i, 4, b) + coeffs(2)*u(i, 1, b) &
+                  + coeffs(3)*u(i, 2, b) + coeffs(4)*u(i, 3, b) &
+                  + coeffs(5)*u(i, 4, b) &
+                  + coeffs(6)*u(i, 5, b) + coeffs(7)*u(i, 6, b) &
+                  + coeffs(8)*u(i, 7, b) + coeffs(9)*u(i, 8, b)
+    du(i, 4, b) = ffr(4)*(du(i, 4, b) - alfa*du(i, 3, b))
+
+    do j = n_s+1, n-n_s
+      temp_du = c_m4*u(i, j-4, b) + c_m3*u(i, j-3, b) &
+                + c_m2*u(i, j-2, b) + c_m1*u(i, j-1, b) &
+                + c_j*u(i, j, b) &
+                + c_p1*u(i, j+1, b) + c_p2*u(i, j+2, b) &
+                + c_p3*u(i, j+3, b) + c_p4*u(i, j+4, b)
+      du(i, j, b) = ffr(j)*(temp_du - alfa*du(i, j-1, b))
+    end do
+
+    j = n-3
+    du(i, j, b) = coeffs(1)*u(i, j-4, b) + coeffs(2)*u(i, j-3, b) &
+                  + coeffs(3)*u(i, j-2, b) + coeffs(4)*u(i, j-1, b) &
+                  + coeffs(5)*u(i, j, b) &
+                  + coeffs(6)*u(i, j+1, b) + coeffs(7)*u(i, j+2, b) &
+                  + coeffs(8)*u(i, j+3, b) + coeffs(9)*u_e(i, 1, b)
+    du(i, j, b) = ffr(j)*(du(i, j, b) - alfa*du(i, j-1, b))
+    j = n-2
+    du(i, j, b) = coeffs(1)*u(i, j-4, b) + coeffs(2)*u(i, j-3, b) &
+                  + coeffs(3)*u(i, j-2, b) + coeffs(4)*u(i, j-1, b) &
+                  + coeffs(5)*u(i, j, b) &
+                  + coeffs(6)*u(i, j+1, b) + coeffs(7)*u(i, j+2, b) &
+                  + coeffs(8)*u_e(i, 1, b) + coeffs(9)*u_e(i, 2, b)
+    du(i, j, b) = ffr(j)*(du(i, j, b) - alfa*du(i, j-1, b))
+    j = n-1
+    du(i, j, b) = coeffs(1)*u(i, j-4, b) + coeffs(2)*u(i, j-3, b) &
+                  + coeffs(3)*u(i, j-2, b) + coeffs(4)*u(i, j-1, b) &
+                  + coeffs(5)*u(i, j, b) &
+                  + coeffs(6)*u(i, j+1, b) + coeffs(7)*u_e(i, 1, b) &
+                  + coeffs(8)*u_e(i, 2, b) + coeffs(9)*u_e(i, 3, b)
+    du(i, j, b) = ffr(j)*(du(i, j, b) - alfa*du(i, j-1, b))
+    j = n
+    du(i, j, b) = coeffs(1)*u(i, j-4, b) + coeffs(2)*u(i, j-3, b) &
+                  + coeffs(3)*u(i, j-2, b) + coeffs(4)*u(i, j-1, b) &
+                  + coeffs(5)*u(i, j, b) &
+                  + coeffs(6)*u_e(i, 1, b) + coeffs(7)*u_e(i, 2, b) &
+                  + coeffs(8)*u_e(i, 3, b) + coeffs(9)*u_e(i, 4, b)
+    du(i, j, b) = ffr(j)*(du(i, j, b) - alfa*du(i, j-1, b))
+
+    send_u_e(i, 1, b) = du(i, n, b)
+
+    ! Backward pass of the hybrid algorithm
+    do j = n - 2, 2, -1
+      du(i, j, b) = du(i, j, b) - fbc(j)*du(i, j + 1, b)
+    end do
+    du(i, 1, b) = ffr(1)*(du(i, 1, b) - fbc(1)*du(i, 2, b))
+    send_u_b(i, 1, b) = du(i, 1, b)
+
+  end subroutine der_univ_dist
+
+  attributes(global) subroutine der_univ_subs(du, recv_u_b, recv_u_e, &
+                                              n, alfa, dist_sa, dist_sc)
+    implicit none
+
+    ! Arguments
+    real(dp), device, intent(out), dimension(:, :, :) :: du
+    real(dp), device, intent(in), dimension(:, :, :) :: recv_u_b, recv_u_e
+    real(dp), device, intent(in), dimension(:) :: dist_sa, dist_sc
+    real(dp), value, intent(in) :: alfa
+    integer, value, intent(in) :: n
+
+    ! Local variables
+    integer :: i, j, b
+    real(dp) :: ur, bl, recp, du_1, du_n
+
+    i = threadIdx%x
+    b = blockIdx%x
+
+    bl = dist_sa(1)
+    ur = dist_sc(n)
+    recp = 1._dp/(1._dp - ur*bl)
+
+    !du(i, 1, b) = recp*(du(i, 1, b) - bl*recv_u_b(i, 1, b))
+    !du(i, n, b) = recp*(du(i, n, b) - ur*recv_u_e(i, 1, b))
+    du_1 = recp*(du(i, 1, b) - bl*recv_u_b(i, 1, b))
+    du_n = recp*(du(i, n, b) - ur*recv_u_e(i, 1, b))
+
+    du(i, 1, b) = du_1
+    do j = 2, n-1
+      du(i, j, b) = (du(i, j, b) - dist_sa(j)*du_1 - dist_sc(j)*du_n)
+    end do
+    du(i, n, b) = du_n
+
+  end subroutine der_univ_subs
+
+end module m_cuda_kernels_dist
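
A note on the two kernels above, as read from the code: der_univ_dist evaluates the compact-scheme right-hand side (using the u_b/u_e halo planes near the subdomain ends), performs the rank-local forward elimination with the precomputed factors ffr and the backward pass with fbc, and stores the first and last eliminated values in send_u_b/send_u_e for the neighbour exchange. der_univ_subs then folds the received neighbour values back in. With bl = dist_sa(1), ur = dist_sc(n), and r_b, r_e the received interface values, the substitution step it implements is

\[
du_1 = \frac{du_1 - b_l\, r_b}{1 - u_r b_l}, \qquad
du_n = \frac{du_n - u_r\, r_e}{1 - u_r b_l}, \qquad
du_j \leftarrow du_j - \mathrm{sa}_j\, du_1 - \mathrm{sc}_j\, du_n \quad (j = 2, \dots, n-1).
\]

The MPI side of the exchange is left to a later patch; the test below emulates it with device-to-device copies.
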
diff --git a/src/derparams.f90 b/src/derparams.f90
new file mode 100644
index 00000000..d2216def
--- /dev/null
+++ b/src/derparams.f90
@@ -0,0 +1,107 @@
+module m_derparams
+  use m_common, only: dp
+
+  implicit none
+
+contains
+
+  subroutine der_1_vv()
+    implicit none
+  end subroutine der_1_vv
+
+  subroutine der_2_vv(coeffs, coeffs_b, coeffs_e, &
+                      dist_fr, dist_bc, dist_sa, dist_sc, &
+                      n_halo, alfa, dx2, n, bcond)
+    implicit none
+
+    real(dp), allocatable, dimension(:), intent(out) :: coeffs, &
+      dist_fr, dist_bc, dist_sa, dist_sc
+    real(dp), allocatable, dimension(:,:), intent(out) :: coeffs_b, coeffs_e
+    integer, intent(out) :: n_halo
+    real(dp), intent(out) :: alfa
+    real(dp), intent(in) :: dx2
+    integer, intent(in) :: n
+    character(len=*), intent(in) :: bcond
+
+    real(dp), allocatable :: dist_b(:)
+    real(dp) :: asi, bsi, csi, dsi
+    integer :: i, n_stencil
+
+    allocate(dist_sa(n), dist_sc(n), dist_b(n))
+
+    alfa = 2._dp/11._dp
+    asi = 12._dp/11._dp/dx2
+    bsi = 3._dp/44._dp/dx2
+    csi = 0._dp
+    dsi = 0._dp
+
+    n_halo = 4
+    n_stencil = 2*n_halo+1
+
+    coeffs = [dsi, csi, bsi, asi, &
+              -2._dp*(asi+bsi+csi+dsi), &
+              asi, bsi, csi, dsi]
+
+    select case (bcond)
+    case ('periodic')
+      dist_sa(:) = alfa; dist_sc(:) = alfa; dist_b(:) = 1._dp
+      allocate(coeffs_b(n_halo, n_stencil))
+      allocate(coeffs_e(n_halo, n_stencil))
+      do i = 1, n_halo
+        coeffs_b(i,:) = coeffs(:)
+        coeffs_e(i,:) = coeffs(:)
+      end do
+    case default
+      print*, 'Boundary condition is not recognized: ', bcond
+    end select
+
+    call process_dist(dist_fr, dist_bc, dist_sa, dist_sc, dist_b, n)
+
+  end subroutine der_2_vv
+
+  subroutine process_dist(dist_fr, dist_bc, dist_sa, dist_sc, dist_b, n)
+    implicit none
+
+    real(dp), allocatable, dimension(:), intent(out) :: dist_fr, dist_bc
+    real(dp), dimension(:), intent(inout) :: dist_sa, dist_sc, dist_b
+    integer, intent(in) :: n
+
+    integer :: i, nrank, nproc, m
+
+    m = n
+    nrank = 0; nproc = 1
+
+    allocate(dist_fr(n), dist_bc(n))
+
+    do nrank = 0, nproc-1
+
+      ! start the hybrid algorithm
+      do i = 1+m*nrank, 2+m*nrank
+        dist_sa(i) = dist_sa(i)/dist_b(i)
+        dist_sc(i) = dist_sc(i)/dist_b(i)
+        dist_bc(i) = dist_sc(i)
+      end do
+      do i = 3+m*nrank, m+m*nrank
+        dist_fr(i) = 1.d0/(dist_b(i)-dist_sa(i)*dist_sc(i-1))
+        dist_sa(i) = -dist_fr(i)*dist_sa(i)*dist_sa(i-1)
+        dist_sc(i) = dist_fr(i)*dist_sc(i)
+        !dist_bc(i) = dist_sc(i)
+      end do
+      do i = m-2+m*nrank, 2+m*nrank, -1
+        dist_sa(i) = dist_sa(i)-dist_sc(i)*dist_sa(i+1)
+        dist_bc(i) = dist_sc(i)
+        dist_sc(i) = -dist_sc(i)*dist_sc(i+1)
+      end do
+      ! TODO: revisit this first-row special case
+      dist_fr(1+m*nrank) = 1.d0/(1.d0-dist_sc(1+m*nrank)*dist_sa(2+m*nrank))
+
+      dist_sa(1+m*nrank) = dist_fr(1+m*nrank)*dist_sa(1+m*nrank)
+      dist_sc(1+m*nrank) = -dist_fr(1+m*nrank)*dist_sc(1+m*nrank) &
+                           *dist_sc(2+m*nrank)
+      !dist_bc(1+m*nrank) = dist_sc(1+m*nrank)
+    end do
+
+  end subroutine process_dist
+
+end module m_derparams
+
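
Before the test patch, a brief note on the coefficients set up in der_2_vv: alfa = 2/11 together with the right-hand-side weights 12/11/dx2 and 3/44/dx2 (csi = dsi = 0) appears to correspond to the classical sixth-order compact approximation of the second derivative on a uniform grid; this identification is inferred from the constants rather than stated in the patch:

\[
\tfrac{2}{11} f''_{i-1} + f''_i + \tfrac{2}{11} f''_{i+1}
 = \tfrac{12}{11}\,\frac{f_{i+1} - 2 f_i + f_{i-1}}{h^2}
 + \tfrac{3}{11}\,\frac{f_{i+2} - 2 f_i + f_{i-2}}{4 h^2}.
\]

The nine-entry coeffs array stores the right-hand-side weights for offsets -4 to +4; the entries at offsets +-3 and +-4 (csi, dsi) are zero for this scheme, and n_halo = 4 sizes the u_b/u_e halo buffers for the widest stencil the array can hold.
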
From 85658b2ae5753b2851bda826c7da660bdb21e14f Mon Sep 17 00:00:00 2001
From: Semih Akkurt
Date: Sat, 9 Sep 2023 14:45:33 +0100
Subject: [PATCH 2/2] test(cuda): Add a test for the distributed tridiagonal solver.

---
 tests/CMakeLists.txt             |   5 ++
 tests/cuda/test_cuda_tridiag.f90 | 138 +++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 tests/cuda/test_cuda_tridiag.f90

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index dd57e2b5..7659a869 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -6,6 +6,7 @@ set(TESTSRC
 )
 set(CUDATESTSRC
   cuda/test_cuda_allocator.f90
+  cuda/test_cuda_tridiag.f90
 )
 
 if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI")
@@ -16,6 +17,10 @@ foreach(testfile IN LISTS TESTSRC)
 
   get_filename_component(test_name ${testfile} NAME_WE)
   add_executable(${test_name} ${testfile})
+  target_compile_options(${test_name} PRIVATE "-cuda")
+  target_compile_options(${test_name} PRIVATE "-O3")
+  target_compile_options(${test_name} PRIVATE "-fast")
+  target_link_options(${test_name} INTERFACE "-cuda")
   target_link_libraries(${test_name} PRIVATE x3d2)
 
   add_test(NAME ${test_name} COMMAND ${test_name})
diff --git a/tests/cuda/test_cuda_tridiag.f90 b/tests/cuda/test_cuda_tridiag.f90
new file mode 100644
index 00000000..ce0f9758
--- /dev/null
+++ b/tests/cuda/test_cuda_tridiag.f90
@@ -0,0 +1,138 @@
+program test_cuda_tridiag
+  use iso_fortran_env, only: stderr => error_unit
+  use cudafor
+
+  use m_common, only: dp, SZ, pi
+  use m_cuda_kernels_dist, only: der_univ_dist, der_univ_subs
+  use m_derparams, only: der_1_vv, der_2_vv
+
+  implicit none
+
+  logical :: allpass = .true.
+  real(dp), allocatable, dimension(:,:,:) :: u, du, u_b, u_e
+  real(dp), device, allocatable, dimension(:,:,:) :: u_dev, du_dev
+  real(dp), device, allocatable, dimension(:,:,:) :: u_recv_b_dev, u_recv_e_dev, &
+                                                     u_send_b_dev, u_send_e_dev
+
+  real(dp), device, allocatable, dimension(:,:,:) :: send_b_dev, send_e_dev, &
+                                                     recv_b_dev, recv_e_dev
+
+  real(dp), allocatable, dimension(:,:) :: coeffs_b, coeffs_e
+  real(dp), allocatable, dimension(:) :: coeffs, dist_fr, dist_bc, &
+                                         dist_sa, dist_sc
+
+  real(dp), device, allocatable, dimension(:,:) :: coeffs_b_dev, coeffs_e_dev
+  real(dp), device, allocatable, dimension(:) :: coeffs_dev, &
+                                                 dist_fr_dev, dist_bc_dev, &
+                                                 dist_sa_dev, dist_sc_dev
+
+  integer :: n, n_block, i, j, k, n_halo, n_stencil, n_iters
+  integer :: ierr, memClockRt, memBusWidth
+  type(dim3) :: blocks, threads
+  real(dp) :: dx, dx2, alfa, norm_du, tol = 1d-8, tstart, tend
+  real(dp) :: achievedBW, deviceBW
+
+  n = 512
+  n_block = 512*512/SZ
+  n_iters = 1000
+
+  allocate(u(SZ, n, n_block), du(SZ, n, n_block))
+  allocate(u_dev(SZ, n, n_block), du_dev(SZ, n, n_block))
+
+  dx = 2*pi/n
+  dx2 = dx*dx
+
+  do k = 1, n_block
+    do j = 1, n
+      do i = 1, SZ
+        u(i, j, k) = sin((j-1)*dx)
+      end do
+    end do
+  end do
+
+  ! move data to device
+  u_dev = u
+
+  ! set up the tridiagonal solver coeffs
+  call der_2_vv(coeffs, coeffs_b, coeffs_e, dist_fr, dist_bc, &
+                dist_sa, dist_sc, n_halo, alfa, dx2, n, 'periodic')
+
+  n_stencil = n_halo*2 + 1
+
+  allocate(coeffs_b_dev(n_halo, n_stencil), coeffs_e_dev(n_halo, n_stencil))
+  allocate(coeffs_dev(n_stencil))
+  allocate(dist_fr_dev(n), dist_bc_dev(n), dist_sa_dev(n), dist_sc_dev(n))
+  coeffs_b_dev(:, :) = coeffs_b(:, :); coeffs_e_dev(:, :) = coeffs_e(:, :)
+  coeffs_dev(:) = coeffs(:)
+  dist_fr_dev(:) = dist_fr(:); dist_bc_dev(:) = dist_bc(:)
+  dist_sa_dev(:) = dist_sa(:); dist_sc_dev(:) = dist_sc(:)
+
+  ! arrays for exchanging data between ranks
+  allocate(u_send_b_dev(SZ, n_halo, n_block))
+  allocate(u_send_e_dev(SZ, n_halo, n_block))
+  allocate(u_recv_b_dev(SZ, n_halo, n_block))
+  allocate(u_recv_e_dev(SZ, n_halo, n_block))
+
+  allocate(send_b_dev(SZ, 1, n_block), send_e_dev(SZ, 1, n_block))
+  allocate(recv_b_dev(SZ, 1, n_block), recv_e_dev(SZ, 1, n_block))
+
+  blocks = dim3(n_block, 1, 1)
+  threads = dim3(SZ, 1, 1)
+
+  call cpu_time(tstart)
+  do i = 1, n_iters
+    u_send_b_dev(:,:,:) = u_dev(:,1:4,:)
+    u_send_e_dev(:,:,:) = u_dev(:,n-n_halo+1:n,:)
+
+    ! to be converted to MPI send/recv for multi-rank simulations
+    u_recv_b_dev = u_send_e_dev
+    u_recv_e_dev = u_send_b_dev
+
+    call der_univ_dist<<<blocks, threads>>>( &
+      du_dev, send_b_dev, send_e_dev, u_dev, u_recv_b_dev, u_recv_e_dev, &
+      coeffs_b_dev, coeffs_e_dev, coeffs_dev, &
+      n, alfa, dist_fr_dev, dist_bc_dev &
+      )
+
+    ! to be converted to MPI send/recv for multi-rank simulations
+    recv_b_dev = send_e_dev
+    recv_e_dev = send_b_dev
+
+    call der_univ_subs<<<blocks, threads>>>( &
+      du_dev, recv_b_dev, recv_e_dev, &
+      n, alfa, dist_sa_dev, dist_sc_dev &
+      )
+  end do
+  call cpu_time(tend)
+  print*, 'Total time', tend-tstart
+
+  achievedBW = 6._dp*n_iters*n*n_block*SZ*dp/(tend-tstart)
+  print'(a, f8.3, a)', 'Achieved BW: ', achievedBW/2**30, ' GiB/s'
+
+  ierr = cudaDeviceGetAttribute(memClockRt, cudaDevAttrMemoryClockRate, 0)
+  ierr = cudaDeviceGetAttribute(memBusWidth, cudaDevAttrGlobalMemoryBusWidth, 0)
+  deviceBW = 2*memBusWidth/8._dp*memClockRt*1000
+
+  print'(a, f8.3, a)', 'Device BW: ', deviceBW/2**30, ' GiB/s'
+  print'(a, f5.2)', 'Effective BW utilization: %', achievedBW/deviceBW*100
+
+  ! check error
+  du = du_dev
+  norm_du = norm2(u+du)/n_block
+  print*, 'error norm', norm_du
+
+  if ( norm_du > tol ) then
+    allpass = .false.
+    write(stderr, '(a)') 'Check second derivatives... failed'
+  else
+    write(stderr, '(a)') 'Check second derivatives... passed'
+  end if
+
+  if (allpass) then
+    write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.'
+  else
+    error stop 'SOME TESTS FAILED.'
+  end if
+
+end program test_cuda_tridiag
+
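
A closing note on the communication placeholders in the test: the neighbour exchange of a multi-rank run is emulated with plain device-to-device copies (the lines marked 'to be converted to MPI send/recv for multi-rank simulations'). A minimal sketch of what those copies might become with a CUDA-aware MPI build is given below; prev_rank, next_rank, the tags, and the surrounding MPI setup (use mpi, MPI_Init, the rank arithmetic, an integer ierr) are illustrative assumptions and not part of this patch.

    ! Hypothetical multi-rank exchange replacing the four copies in the test
    ! loop. Assumes use mpi, integer variables prev_rank/next_rank holding the
    ! periodic neighbours along the solve direction, and a CUDA-aware MPI so
    ! that device arrays can be passed to MPI_Sendrecv directly.

    ! u halo planes: the single-rank test does u_recv_b_dev = u_send_e_dev
    ! and u_recv_e_dev = u_send_b_dev; with neighbours this becomes
    call MPI_Sendrecv(u_send_e_dev, SZ*n_halo*n_block, MPI_DOUBLE_PRECISION, &
                      next_rank, 0, &
                      u_recv_b_dev, SZ*n_halo*n_block, MPI_DOUBLE_PRECISION, &
                      prev_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
    call MPI_Sendrecv(u_send_b_dev, SZ*n_halo*n_block, MPI_DOUBLE_PRECISION, &
                      prev_rank, 1, &
                      u_recv_e_dev, SZ*n_halo*n_block, MPI_DOUBLE_PRECISION, &
                      next_rank, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

    ! interface values of du produced by der_univ_dist and consumed by
    ! der_univ_subs (recv_b_dev = send_e_dev, recv_e_dev = send_b_dev in the
    ! single-rank test)
    call MPI_Sendrecv(send_e_dev, SZ*n_block, MPI_DOUBLE_PRECISION, &
                      next_rank, 2, &
                      recv_b_dev, SZ*n_block, MPI_DOUBLE_PRECISION, &
                      prev_rank, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
    call MPI_Sendrecv(send_b_dev, SZ*n_block, MPI_DOUBLE_PRECISION, &
                      prev_rank, 3, &
                      recv_e_dev, SZ*n_block, MPI_DOUBLE_PRECISION, &
                      next_rank, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

With a single rank and prev_rank = next_rank equal to the rank itself, each MPI_Sendrecv reduces to the corresponding copy in the test, so the single-rank result should be unchanged.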