diff --git a/src/backend.f90 b/src/backend.f90
index da0446d8..1ca24cad 100644
--- a/src/backend.f90
+++ b/src/backend.f90
@@ -237,7 +237,7 @@ subroutine get_field_data(self, data, f, dir)
     end if
 
     ! Returns 0 if no reorder required
-    rdr_dir = get_rdr_from_dirs(direction, f%dir)
+    rdr_dir = get_rdr_from_dirs(f%dir, direction)
 
     ! Carry out a reorder if we need, and copy from field to data array
     if (rdr_dir /= 0) then
@@ -269,7 +269,7 @@ subroutine set_field_data(self, f, data, dir)
     end if
 
     ! Returns 0 if no reorder required
-    rdr_dir = get_rdr_from_dirs(f%dir, direction)
+    rdr_dir = get_rdr_from_dirs(direction, f%dir)
 
     ! Carry out a reorder if we need, and copy from data array to field
     if (rdr_dir /= 0) then
diff --git a/src/common.f90 b/src/common.f90
index b0467be0..0a9a33eb 100644
--- a/src/common.f90
+++ b/src/common.f90
@@ -20,10 +20,10 @@ module m_common
                         Z_EDGE = 103, & ! Data on edges along Z
                         none = -1 ! The location of data isn't specified
   integer, protected :: &
-    rdr_map(4, 4) = reshape([0, RDR_X2Y, RDR_X2Z, RDR_X2C, &
-                             RDR_Y2X, 0, RDR_Y2Z, RDR_Y2C, &
-                             RDR_Z2X, RDR_Z2Y, 0, RDR_Z2C, &
-                             RDR_C2X, RDR_C2Y, RDR_C2Z, 0], shape=[4, 4])
+    rdr_map(4, 4) = reshape([0, RDR_Y2X, RDR_Z2X, RDR_C2X, &
+                             RDR_X2Y, 0, RDR_Z2Y, RDR_C2Y, &
+                             RDR_X2Z, RDR_Y2Z, 0, RDR_C2Z, &
+                             RDR_X2C, RDR_Y2C, RDR_Z2C, 0], shape=[4, 4])
 
   type :: globs_t
     real(dp) :: nu, dt
@@ -38,10 +38,10 @@ pure subroutine get_dirs_from_rdr(dir_from, dir_to, rdr_dir)
     integer, intent(out) :: dir_from, dir_to
     integer, intent(in) :: rdr_dir
     integer, dimension(2) :: dirs
-    
+
     dirs = findloc(rdr_map, rdr_dir)
-    dir_from = dirs(2)
-    dir_to = dirs(1)
+    dir_from = dirs(1)
+    dir_to = dirs(2)
 
   end subroutine
 
diff --git a/src/mesh.f90 b/src/mesh.f90
index a5f8e9b4..8fe9e019 100644
--- a/src/mesh.f90
+++ b/src/mesh.f90
@@ -265,9 +265,9 @@ pure function get_dims_dataloc(data_loc, vert_dims, cell_dims) result(dims)
       dims(1:2) = vert_dims(1:2)
       dims(3) = cell_dims(3)
     case (none)
-      error stop
+      error stop "Unknown location in get_dims_dataloc"
     end select
-  end function
+  end function get_dims_dataloc
 
   pure function get_padded_dims_dir(self, dir) result(dims_padded)
   !! Getter for padded dimensions with structure in `dir` direction
@@ -410,9 +410,9 @@ pure function get_n_dir(self, dir, data_loc) result(n)
         n = n_cell
       end if
     case (none)
-      error stop
+      error stop "Unknown direction in get_n_dir"
     end select
-  end function
+  end function get_n_dir
 
   pure function get_coordinates(self, i, j, k) result(xloc)
   !! Get the physical location of a cell center with i,j,k local indices
diff --git a/src/omp/backend.f90 b/src/omp/backend.f90
index 0904e881..ae91b303 100644
--- a/src/omp/backend.f90
+++ b/src/omp/backend.f90
@@ -360,6 +360,8 @@ subroutine sum_yintox_omp(self, u, u_)
     class(field_t), intent(inout) :: u
     class(field_t), intent(in) :: u_
 
+    call sum_intox_omp(self, u, u_, DIR_Y)
+
   end subroutine sum_yintox_omp
 
   subroutine sum_zintox_omp(self, u, u_)
@@ -369,8 +371,39 @@ subroutine sum_zintox_omp(self, u, u_)
     class(field_t), intent(inout) :: u
     class(field_t), intent(in) :: u_
 
+    call sum_intox_omp(self, u, u_, DIR_Z)
+
   end subroutine sum_zintox_omp
 
+  subroutine sum_intox_omp(self, u, u_, dir_to)
+
+    class(omp_backend_t) :: self
+    class(field_t), intent(inout) :: u
+    class(field_t), intent(in) :: u_
+    integer, intent(in) :: dir_to
+
+    integer :: dir_from
+    integer, dimension(3) :: dims
+    integer :: i, j, k    ! Working indices
+    integer :: ii, jj, kk ! Transpose indices
+
+    dir_from = DIR_X
+
+    dims = self%mesh%get_padded_dims(u)
+    !$omp parallel do private(i, ii, jj, kk) collapse(2)
+    do k = 1, dims(3)
+      do j = 1, dims(2)
+        do i = 1, dims(1)
+          call get_index_reordering(ii, jj, kk, i, j, k, &
+                                    dir_from, dir_to, self%mesh)
+          u%data(i, j, k) = u%data(i, j, k) + u_%data(ii, jj, kk)
+        end do
+      end do
+    end do
+    !$omp end parallel do
+
+  end subroutine sum_intox_omp
+
   subroutine vecadd_omp(self, a, x, b, y)
     implicit none
 
@@ -379,18 +412,107 @@ subroutine vecadd_omp(self, a, x, b, y)
     class(field_t), intent(in) :: x
     real(dp), intent(in) :: b
     class(field_t), intent(inout) :: y
+    integer, dimension(3) :: dims
+    integer :: i, j, k, ii
 
-    y%data = a*x%data + b*y%data ! fixme
+    integer :: nvec, remstart
+
+    if (x%dir /= y%dir) then
+      error stop "Called vector add with incompatible fields"
+    end if
+
+    dims = size(x%data)
+    nvec = dims(1)/SZ
+    remstart = nvec*SZ + 1
+
+    !$omp parallel do private(i, ii) collapse(2)
+    do k = 1, dims(3)
+      do j = 1, dims(2)
+        ! Execute inner vectorised loops
+        do ii = 1, nvec
+          !$omp simd
+          do i = 1, SZ
+            y%data(i + (ii - 1)*SZ, j, k) = &
+              a*x%data(i + (ii - 1)*SZ, j, k) + &
+              b*y%data(i + (ii - 1)*SZ, j, k)
+          end do
+          !$omp end simd
+        end do
+
+        ! Remainder loop
+        do i = remstart, dims(1)
+          y%data(i, j, k) = a*x%data(i, j, k) + b*y%data(i, j, k)
+        end do
+      end do
+    end do
+    !$omp end parallel do
 
   end subroutine vecadd_omp
 
   real(dp) function scalar_product_omp(self, x, y) result(s)
+
+    use mpi
+
+    use m_common, only: NONE, get_rdr_from_dirs
+
     implicit none
 
     class(omp_backend_t) :: self
     class(field_t), intent(in) :: x, y
+    class(field_t), pointer :: x_, y_
+    integer, dimension(3) :: dims
+    integer :: i, j, k, ii
+    integer :: nvec, remstart
+    integer :: ierr
+
+    if ((x%data_loc == NONE) .or. (y%data_loc == NONE)) then
+      error stop "You must set the field location before calling scalar product"
+    end if
+    if (x%data_loc /= y%data_loc) then
+      error stop "Called scalar product with incompatible fields"
+    end if
+
+    ! Reorient data into temporary DIR_C storage
+    x_ => self%allocator%get_block(DIR_C, x%data_loc)
+    call self%get_field_data(x_%data, x)
+    y_ => self%allocator%get_block(DIR_C, y%data_loc)
+    call self%get_field_data(y_%data, y)
+
+    dims = self%mesh%get_field_dims(x_)
+
+    nvec = dims(1)/SZ
+    remstart = nvec*SZ + 1
+
+    s = 0.0_dp
+    !$omp parallel do reduction(+:s) private(i, ii) collapse(2)
+    do k = 1, dims(3)
+      do j = 1, dims(2)
+        ! Execute inner vectorised loops
+        do ii = 1, nvec
+          !$omp simd reduction(+:s)
+          do i = 1, SZ
+            s = s + x_%data(i + (ii - 1)*SZ, j, k)* &
+                y_%data(i + (ii - 1)*SZ, j, k)
+          end do
+          !$omp end simd
+        end do
+
+        ! Remainder loop
+        do i = remstart, dims(1)
+          s = s + x_%data(i, j, k)*y_%data(i, j, k)
+        end do
+      end do
+    end do
+    !$omp end parallel do
+
+    ! Release temporary storage
+    call self%allocator%release_block(x_)
+    call self%allocator%release_block(y_)
 
-    s = 0._dp
+    ! Reduce the result
+    call MPI_Allreduce(MPI_IN_PLACE, s, 1, MPI_DOUBLE_PRECISION, &
+                       MPI_SUM, MPI_COMM_WORLD, &
+                       ierr)
 
   end function scalar_product_omp
 
diff --git a/src/ordering.f90 b/src/ordering.f90
index 5a07bdeb..0d2527c7 100644
--- a/src/ordering.f90
+++ b/src/ordering.f90
@@ -75,7 +75,7 @@ pure subroutine get_index_dir(dir_i, dir_j, dir_k, i, j, k, dir, &
   end subroutine get_index_dir
 
   pure subroutine get_index_reordering_dirs(out_i, out_j, out_k, in_i, in_j, in_k, &
-                                       dir_from, dir_to, mesh)
+                                            dir_from, dir_to, mesh)
       !! Converts a set of application storage directional index to an other direction.
       !! The two directions are defined by the reorder_dir variable, RDR_X2Y will go from storage in X to Y etc.
     integer, intent(out) :: out_i, out_j, out_k         ! new indices in the application storage
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b951a3d2..f8a11e6c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -18,6 +18,7 @@ function(define_test testfile np backend)
   else()
     target_compile_options(${test_name} PRIVATE "-cpp")
     find_package(OpenMP REQUIRED)
+    target_compile_options(${test_name} PRIVATE "-cpp")
     target_link_libraries(${test_name} PRIVATE OpenMP::OpenMP_Fortran)
   endif()
   target_link_libraries(${test_name} PRIVATE x3d2)
@@ -33,12 +34,16 @@ define_test(test_mesh.f90 4 omp)
 define_test(test_reordering.f90 1 omp)
 define_test(test_reordering.f90 2 omp)
 define_test(test_reordering.f90 4 omp)
+define_test(test_reorder_map.f90 1 omp)
 define_test(test_time_integrator.f90 1 omp)
 define_test(test_hostside_io.f90 ${CMAKE_CTEST_NPROCS} omp)
 define_test(omp/test_omp_tridiag.f90 ${CMAKE_CTEST_NPROCS} omp)
 define_test(omp/test_omp_transeq.f90 ${CMAKE_CTEST_NPROCS} omp)
 define_test(omp/test_omp_dist_transeq.f90 ${CMAKE_CTEST_NPROCS} omp)
 
+define_test(test_scalar_product.f90 ${CMAKE_CTEST_NPROCS} omp)
+define_test(test_sum_intox.f90 ${CMAKE_CTEST_NPROCS} omp)
+
 if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI" OR
     ${CMAKE_Fortran_COMPILER_ID} STREQUAL "NVHPC")
   define_test(test_reordering.f90 1 cuda)
diff --git a/tests/test_reorder_map.f90 b/tests/test_reorder_map.f90
new file mode 100644
index 00000000..c09331e1
--- /dev/null
+++ b/tests/test_reorder_map.f90
@@ -0,0 +1,93 @@
+program test_reorder_map
+  !! Check that reorder mappings return the expected values
+
+  use m_common
+  
+  implicit none
+
+  logical :: test_pass
+
+  test_pass = .true.
+  
+  call test_get_dirs_from_rdr()
+  call test_get_rdr_from_dirs()
+
+  if (.not. test_pass) then
+    error stop "FAIL"
+  end if
+  
+contains
+
+  subroutine test_get_dirs_from_rdr()
+    
+    call test_one_get_dirs_from_rdr("RDR_X2Y", RDR_X2Y, DIR_X, DIR_Y)
+    call test_one_get_dirs_from_rdr("RDR_X2Z", RDR_X2Z, DIR_X, DIR_Z)
+    call test_one_get_dirs_from_rdr("RDR_X2C", RDR_X2C, DIR_X, DIR_C)
+    
+    call test_one_get_dirs_from_rdr("RDR_Y2X", RDR_Y2X, DIR_Y, DIR_X)
+    call test_one_get_dirs_from_rdr("RDR_Y2Z", RDR_Y2Z, DIR_Y, DIR_Z)
+    call test_one_get_dirs_from_rdr("RDR_Y2C", RDR_Y2C, DIR_Y, DIR_C)
+    
+    call test_one_get_dirs_from_rdr("RDR_Z2X", RDR_Z2X, DIR_Z, DIR_X)
+    call test_one_get_dirs_from_rdr("RDR_Z2Y", RDR_Z2Y, DIR_Z, DIR_Y)
+    call test_one_get_dirs_from_rdr("RDR_Z2C", RDR_Z2C, DIR_Z, DIR_C)
+    
+    call test_one_get_dirs_from_rdr("RDR_C2X", RDR_C2X, DIR_C, DIR_X)
+    call test_one_get_dirs_from_rdr("RDR_C2Y", RDR_C2Y, DIR_C, DIR_Y)
+    call test_one_get_dirs_from_rdr("RDR_C2Z", RDR_C2Z, DIR_C, DIR_Z)
+    
+  end subroutine test_get_dirs_from_rdr
+
+  subroutine test_one_get_dirs_from_rdr(test, rdr, expect_from, expect_to)
+    character(len=*), intent(in) :: test
+    integer, intent(in) :: rdr, expect_from, expect_to
+
+    integer :: dir_from, dir_to
+
+    call get_dirs_from_rdr(dir_from, dir_to, rdr)
+    print *, test
+    print *, "- Expect: ", expect_from, expect_to
+    print *, "- Got: ", dir_from, dir_to
+    if ((dir_from /= expect_from) .or. (dir_to /= expect_to)) then
+      test_pass = .false.
+    end if
+
+  end subroutine test_one_get_dirs_from_rdr
+
+  subroutine test_get_rdr_from_dirs()
+
+    call test_one_get_rdr_from_dirs("X->Y", DIR_X, DIR_Y, RDR_X2Y)
+    call test_one_get_rdr_from_dirs("X->Z", DIR_X, DIR_Z, RDR_X2Z)
+    call test_one_get_rdr_from_dirs("X->C", DIR_X, DIR_C, RDR_X2C)
+
+    call test_one_get_rdr_from_dirs("Y->X", DIR_Y, DIR_X, RDR_Y2X)
+    call test_one_get_rdr_from_dirs("Y->Z", DIR_Y, DIR_Z, RDR_Y2Z)
+    call test_one_get_rdr_from_dirs("Y->C", DIR_Y, DIR_C, RDR_Y2C)
+
+    call test_one_get_rdr_from_dirs("Z->X", DIR_Z, DIR_X, RDR_Z2X)
+    call test_one_get_rdr_from_dirs("Z->Y", DIR_Z, DIR_Y, RDR_Z2Y)
+    call test_one_get_rdr_from_dirs("Z->C", DIR_Z, DIR_C, RDR_Z2C)
+
+    call test_one_get_rdr_from_dirs("C->X", DIR_C, DIR_X, RDR_C2X)
+    call test_one_get_rdr_from_dirs("C->Y", DIR_C, DIR_Y, RDR_C2Y)
+    call test_one_get_rdr_from_dirs("C->Z", DIR_C, DIR_Z, RDR_C2Z)
+    
+  end subroutine test_get_rdr_from_dirs
+
+  subroutine test_one_get_rdr_from_dirs(test, from, to, expect_rdr)
+
+    character(len=*), intent(in) :: test
+    integer, intent(in) :: from, to, expect_rdr
+    integer :: rdr
+
+    rdr = get_rdr_from_dirs(from, to)
+    print *, test
+    print *, "- Expect: ", expect_rdr
+    print *, "- Got: ", rdr
+    if (rdr /= expect_rdr) then
+      test_pass = .false.
+    end if
+
+  end subroutine test_one_get_rdr_from_dirs
+  
+end program test_reorder_map
diff --git a/tests/test_scalar_product.f90 b/tests/test_scalar_product.f90
new file mode 100644
index 00000000..dcd79a7e
--- /dev/null
+++ b/tests/test_scalar_product.f90
@@ -0,0 +1,157 @@
+program test_scalar_product
+  !! Tests the implementation of the scalar product function.
+  !
+  !! Given two fields, a and b, computes s = a_i * b_i where repeated indices
+  !! imply summation.
+
+  use m_common, only: DIR_X, DIR_Y, DIR_Z, DIR_C
+
+  use m_allocator
+  use m_base_backend
+#ifdef CUDA
+#else
+  use m_omp_backend
+  use m_omp_common, only: SZ
+#endif
+
+  implicit none
+
+  integer, parameter :: nx = 17, ny = 32, nz = 59
+  real(dp), parameter :: lx = 1.618, ly = 3.141529, lz = 1.729
+
+  class(base_backend_t), pointer :: backend
+  class(allocator_t), pointer :: allocator
+#ifdef CUDA
+#else
+  type(omp_backend_t), target :: omp_backend
+  type(allocator_t), target :: omp_allocator
+#endif
+
+  type(mesh_t) :: mesh
+
+  character(len=5), dimension(4), parameter :: test = &
+    ["DIR_X", "DIR_Y", "DIR_Z", "DIR_C"]
+  integer, dimension(4), parameter :: dir = [DIR_X, DIR_Y, DIR_Z, DIR_C]
+  integer :: i
+
+  integer :: nrank, nproc
+  integer :: ierr
+
+  logical :: test_pass = .true.
+
+  call MPI_Init(ierr)
+  call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr)
+  call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr)
+
+  mesh = mesh_t([nx, ny, nz], &
+                [1, 1, nproc], &
+                [lx, ly, lz])
+
+#ifdef CUDA
+#else
+  omp_allocator = allocator_t(mesh, SZ)
+  allocator => omp_allocator
+  omp_backend = omp_backend_t(mesh, allocator)
+  backend => omp_backend
+#endif
+
+  do i = 1, 4
+    call runtest(test(i), dir(i))
+  end do
+
+  if (nrank == 0) then
+    if (.not. test_pass) then
+      error stop "Test failed"
+    end if
+  end if
+  call MPI_Barrier(MPI_COMM_WORLD, ierr)
+
+  call MPI_Finalize(ierr)
+
+contains
+
+  subroutine runtest(test, dir)
+
+    character(len=*), intent(in) :: test
+    integer, intent(in) :: dir
+
+    class(field_t), pointer :: a, b, c
+    real(dp) :: s
+
+    integer :: n
+    integer :: expt
+    logical :: check_pass
+
+    if (nrank == 0) then
+      print *, "Testing ", test
+    end if
+    
+    a => backend%allocator%get_block(dir, VERT)
+    b => backend%allocator%get_block(dir, VERT)
+
+    if (nrank == 0) then
+      print *, "Simplest check: dot(0, 0) = 0"
+    end if
+    a%data = 0
+    b%data = 0
+    s = backend%scalar_product(a, b)
+    if (s /= 0) then
+      check_pass = .false.
+    else
+      check_pass = .true.
+    end if
+    call MPI_Allreduce(MPI_IN_PLACE, check_pass, 1, &
+      MPI_LOGICAL, MPI_LAND, MPI_COMM_WORLD, &
+      ierr)
+    if (nrank == 0) then
+      print *, "- Got: ", s
+      print *, "- Expected: ", 0
+      if (.not. check_pass) then
+        print *, "- FAIL"
+      else
+        print *, "- PASS"
+      end if
+    end if
+    test_pass = test_pass .and. check_pass
+      
+    if (nrank == 0) then
+      print *, "Check: dot(nrank, nrank) = sum^{nrank-1}_i=0 sum_n(i) i**2"
+    end if
+    a%data = (nrank + 1)
+    s = backend%scalar_product(a, a)
+
+    ! Determine number of interior points, using a temporary DIR_C field
+    c => backend%allocator%get_block(DIR_C)
+    call c%set_data_loc(a%data_loc)
+    n = product(mesh%get_field_dims(c))
+    call backend%allocator%release_block(c)
+    
+    expt = n*(nrank + 1)**2
+    call MPI_Allreduce(MPI_IN_PLACE, expt, 1, &
+                       MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, &
+                       ierr)
+    if (s /= real(expt, kind(s))) then
+      check_pass = .false.
+    else
+      check_pass = .true.
+    end if
+    call MPI_Allreduce(MPI_IN_PLACE, check_pass, 1, &
+      MPI_LOGICAL, MPI_LAND, MPI_COMM_WORLD, &
+      ierr)
+    if (nrank == 0) then
+      print *, "- Got: ", s
+      print *, "- Expected: ", expt
+      if (.not. check_pass) then
+        print *, "- FAIL"
+      else
+        print *, "- PASS"
+      end if
+    end if
+    test_pass = test_pass .and. check_pass
+
+    call backend%allocator%release_block(a)
+    call backend%allocator%release_block(b)
+
+  end subroutine runtest
+
+end program test_scalar_product
diff --git a/tests/test_sum_intox.f90 b/tests/test_sum_intox.f90
new file mode 100644
index 00000000..a2257d94
--- /dev/null
+++ b/tests/test_sum_intox.f90
@@ -0,0 +1,129 @@
+program test_sum_intox
+  !! Tests the implementation of summing a Y-oriented field into an X-oriented
+  !! one.
+
+  use m_common, only: DIR_X, DIR_Y, DIR_Z
+
+  use m_allocator
+  use m_base_backend
+#ifdef CUDA
+#else
+  use m_omp_backend
+  use m_omp_common, only: SZ
+#endif
+
+  implicit none
+
+  integer, parameter :: nx = 17, ny = 32, nz = 59
+  real(dp), parameter :: lx = 1.618, ly = 3.141529, lz = 1.729
+
+  class(base_backend_t), pointer :: backend
+  class(allocator_t), pointer :: allocator
+#ifdef CUDA
+#else
+  type(omp_backend_t), target :: omp_backend
+  type(allocator_t), target :: omp_allocator
+#endif
+
+  type(mesh_t) :: mesh
+
+  integer :: nrank, nproc
+  integer :: ierr
+
+  logical :: test_pass = .true.
+
+  call MPI_Init(ierr)
+  call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr)
+  call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr)
+
+  mesh = mesh_t([nx, ny, nz], &
+                [1, 1, nproc], &
+                [lx, ly, lz])
+
+#ifdef CUDA
+#else
+  omp_allocator = allocator_t(mesh, SZ)
+  allocator => omp_allocator
+  omp_backend = omp_backend_t(mesh, allocator)
+  backend => omp_backend
+#endif
+
+  call runtest("YintoX", DIR_Y)
+  call runtest("ZintoX", DIR_Z)
+
+  if (nrank == 0) then
+    if (.not. test_pass) then
+      error stop "Test failed"
+    end if
+  end if
+
+  call MPI_Finalize(ierr)
+
+contains
+
+  subroutine runtest(test, dir_from)
+
+    use m_ordering, only: get_index_dir
+
+    character(len=*), intent(in) :: test
+    integer, intent(in) :: dir_from
+    
+    class(field_t), pointer :: a, b
+    integer :: ctr
+    integer :: i, j, k
+    integer :: ii, jj, kk
+
+    integer, dimension(3) :: dims
+    logical :: check_pass
+
+    if (nrank == 0) then
+      print *, "Test ", test
+    end if
+
+    a => backend%allocator%get_block(DIR_X)
+    b => backend%allocator%get_block(dir_from)
+
+    dims = mesh%get_padded_dims(DIR_C)
+
+    ! Initialise fields so that b = -a
+    ctr = 0
+    do k = 1, dims(3)
+      do j = 1, dims(2)
+        do i = 1, dims(1)
+          call get_index_dir(ii, jj, kk, i, j, k, DIR_X, SZ, &
+                             dims(1), dims(2), dims(3))
+          a%data(ii, jj, kk) = ctr
+          call get_index_dir(ii, jj, kk, i, j, k, dir_from, SZ, &
+                             dims(1), dims(2), dims(3))
+          b%data(ii, jj, kk) = -ctr
+          ctr = ctr + 1
+        end do
+      end do
+    end do
+
+    if (dir_from == DIR_Y) then
+      call backend%sum_yintox(a, b)
+    else
+      call backend%sum_zintox(a, b)
+    end if
+    
+    check_pass = .not. ((minval(a%data) /= 0) .or. (maxval(a%data) /= 0))
+    call MPI_Allreduce(MPI_IN_PLACE, check_pass, 1, &
+                       MPI_LOGICAL, MPI_LAND, MPI_COMM_WORLD, &
+                       ierr)
+    test_pass = test_pass .and. check_pass
+
+    if (nrank == 0) then
+      if (check_pass) then
+        print *, "- PASS"
+      else
+        print *, "- FAIL"
+      end if
+    end if
+
+    call backend%allocator%release_block(a)
+    call backend%allocator%release_block(b)
+
+  end subroutine runtest
+
+end program test_sum_intox