Merge pull request #623 from nR3D/sycl

[SYCL] restructured cell linked list and other optimizations
Xiangyu-Hu · Jul 13, 2024 · e55c877 · e55c877
2 parents 80a5d08 + 8981ddd
commit e55c877
Show file tree

Hide file tree

Showing 20 changed files with 308 additions and 191 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -101,7 +101,8 @@ endif()
 
 if(SPHINXSYS_USE_SYCL)
     if(NOT SPHINXSYS_SYCL_TARGETS)
-        set(SPHINXSYS_SYCL_TARGETS nvptx64-nvidia-cuda)
+        message("-- Using default SYCL device target, select specific target with option SPHINXSYS_SYCL_TARGETS")
+        set(SPHINXSYS_SYCL_TARGETS spir64)
     endif()
     message("-- SPHinXsys use SYCL with targets ${SPHINXSYS_SYCL_TARGETS}")
     target_compile_options(sphinxsys_core INTERFACE -fsycl -fsycl-targets=${SPHINXSYS_SYCL_TARGETS}  -Wno-unknown-cuda-version)

diff --git a/src/shared/common/base_data_type.h b/src/shared/common/base_data_type.h
@@ -93,6 +93,7 @@ using Rotation2d = Eigen::Rotation2D<Real>;
 using Rotation3d = Eigen::AngleAxis<Real>;
 /** Device data types. */
 using DeviceReal = Real;
+using DeviceInt = u_int32_t;
 using DeviceVec2d = Vec2d;
 using DeviceVec3d = Vec3d;
 using DeviceArray2i = Array2i;

diff --git a/src/shared/common/vector_functions.h b/src/shared/common/vector_functions.h
@@ -202,6 +202,10 @@ inline execution::ExecutionEvent copyDataToDevice(const Real* host, DeviceReal*
     return transformAndCopyDataToDevice(host, device, size, [](Real val) { return static_cast<DeviceReal>(val); });
 }
 
+inline execution::ExecutionEvent copyDataToDevice(const size_t* host, DeviceInt* device, std::size_t size) {
+    return transformAndCopyDataToDevice(host, device, size, [](size_t val) { return static_cast<DeviceInt>(val); });
+}
+
 template<class T>
 inline execution::ExecutionEvent copyDataToDevice(const T& value, T* device, std::size_t size) {
     return execution::executionQueue.getQueue().fill(device, value, size);
@@ -234,5 +238,9 @@ inline execution::ExecutionEvent copyDataFromDevice(Vec3d* host, const DeviceVec
 inline execution::ExecutionEvent copyDataFromDevice(Real* host, const DeviceReal* device, std::size_t size) {
     return transformAndCopyDataFromDevice(host, device, size, [](DeviceReal val) { return static_cast<Real>(val); });
 }
+
+inline execution::ExecutionEvent copyDataFromDevice(size_t* host, const DeviceInt* device, std::size_t size) {
+    return transformAndCopyDataFromDevice(host, device, size, [](DeviceInt val) { return static_cast<size_t>(val); });
+}
 } // namespace SPH
 #endif // VECTOR_FUNCTIONS_H
diff --git a/src/shared/materials/riemann_solver.h b/src/shared/materials/riemann_solver.h
@@ -48,6 +48,23 @@ struct FluidStateOut
     FluidStateOut(Real rho, Vecd vel, Real p) : vel_(vel), rho_(rho), p_(p){};
 };
 
+class NoRiemannSolverKernel
+{
+  public:
+    NoRiemannSolverKernel(DeviceReal rho0c0_i, DeviceReal rho0c0_j, DeviceReal inv_rho0c0_sum)
+        : rho0c0_i_(rho0c0_i), rho0c0_j_(rho0c0_j), inv_rho0c0_sum_(inv_rho0c0_sum) {}
+
+    DeviceReal DissipativePJump(const DeviceReal &) const { return DeviceReal(0.0); }
+    DeviceReal DissipativeUJump(const DeviceReal &) const { return DeviceReal(0.0); }
+
+    DeviceReal AverageP(const DeviceReal &p_i, const DeviceReal &p_j)
+    {
+        return (p_i * rho0c0_j_ + p_j * rho0c0_i_) * inv_rho0c0_sum_;
+    };
+  protected:
+    DeviceReal rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_;
+};
+
 /**
  * @struct NoRiemannSolver
  * @brief  Central difference scheme without Riemann flux.
@@ -60,7 +77,8 @@ class NoRiemannSolver
         : rho0_i_(fluid_i.ReferenceDensity()), rho0_j_(fluid_j.ReferenceDensity()),
           c0_i_(fluid_i.ReferenceSoundSpeed()), c0_j_(fluid_j.ReferenceSoundSpeed()),
           rho0c0_i_(rho0_i_ * c0_i_), rho0c0_j_(rho0_j_ * c0_j_),
-          inv_rho0c0_sum_(1.0 / (rho0c0_i_ + rho0c0_j_)){};
+          inv_rho0c0_sum_(1.0 / (rho0c0_i_ + rho0c0_j_)),
+          device_kernel(rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_) {};
     Real DissipativePJump(const Real &u_jump) const { return 0.0; }
     Real DissipativeUJump(const Real &p_jump) const { return 0.0; }
 
@@ -77,6 +95,32 @@ class NoRiemannSolver
     Real rho0_i_, rho0_j_;
     Real c0_i_, c0_j_;
     Real rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_;
+
+  public:
+    execution::DeviceImplementation<NoRiemannSolverKernel> device_kernel;
+};
+
+class AcousticRiemannSolverKernel : public NoRiemannSolverKernel
+{
+  public:
+    template<class... Args>
+    AcousticRiemannSolverKernel(DeviceReal inv_rho0c0_ave, DeviceReal rho0c0_geo_ave,
+                                DeviceReal inv_c_ave, DeviceReal limiter_coeff, Args&& ...baseArgs)
+        : NoRiemannSolverKernel(std::forward<Args>(baseArgs)...), inv_rho0c0_ave_(inv_rho0c0_ave),
+          rho0c0_geo_ave_(rho0c0_geo_ave), inv_c_ave_(inv_c_ave), limiter_coeff_(limiter_coeff) {}
+
+    DeviceReal DissipativePJump(const DeviceReal &u_jump) const {
+        return rho0c0_geo_ave_ * u_jump * sycl::min(limiter_coeff_ * sycl::max(u_jump * inv_c_ave_, DeviceReal(0)), DeviceReal(1));
+    }
+
+    DeviceReal DissipativeUJump(const DeviceReal &p_jump) const  {
+        return p_jump * inv_rho0c0_ave_;
+    }
+
+  protected:
+    DeviceReal inv_rho0c0_ave_, rho0c0_geo_ave_;
+    DeviceReal inv_c_ave_;
+    DeviceReal limiter_coeff_;
 };
 
 class AcousticRiemannSolver : public NoRiemannSolver
@@ -88,7 +132,9 @@ class AcousticRiemannSolver : public NoRiemannSolver
           inv_rho0c0_ave_(2.0 * inv_rho0c0_sum_),
           rho0c0_geo_ave_(2.0 * rho0c0_i_ * rho0c0_j_ * inv_rho0c0_sum_),
           inv_c_ave_(0.5 * (rho0_i_ + rho0_j_) * inv_rho0c0_ave_),
-          limiter_coeff_(limiter_coeff){};
+          limiter_coeff_(limiter_coeff),
+          device_kernel(inv_rho0c0_ave_, rho0c0_geo_ave_, inv_c_ave_, limiter_coeff_,
+                        rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_) {};
     Real DissipativePJump(const Real &u_jump) const {
         return rho0c0_geo_ave_ * u_jump * SMIN(limiter_coeff_ * SMAX(u_jump * inv_c_ave_, Real(0)), Real(1));
     }
@@ -101,6 +147,9 @@ class AcousticRiemannSolver : public NoRiemannSolver
     Real inv_rho0c0_ave_, rho0c0_geo_ave_;
     Real inv_c_ave_;
     Real limiter_coeff_;
+
+  public:
+    execution::DeviceImplementation<AcousticRiemannSolverKernel> device_kernel;
 };
 
 class DissipativeRiemannSolver : public AcousticRiemannSolver

diff --git a/src/shared/materials/weakly_compressible_fluid.h b/src/shared/materials/weakly_compressible_fluid.h
@@ -44,10 +44,10 @@ class WeaklyCompressibleFluidKernel
     inline DeviceReal getPressure(DeviceReal rho) const { return p0_ * (rho / rho0_ - static_cast<DeviceReal>(1.0)); }
     inline DeviceReal getSoundSpeed(DeviceReal p = 0.0, DeviceReal rho = 1.0) const { return c0_; }
     inline DeviceReal ReferenceDensity() const { return rho0_; }
-    inline DeviceReal ReferenceSoundSpeed() { return c0_; };
+    inline DeviceReal ReferenceSoundSpeed() const { return c0_; };
 
   private:
-    DeviceReal p0_, c0_, rho0_;
+    const DeviceReal p0_, c0_, rho0_;
 };
 
 /**

diff --git a/src/shared/meshes/cell_linked_list.cpp b/src/shared/meshes/cell_linked_list.cpp
@@ -39,8 +39,8 @@ CellLinkedListKernel::CellLinkedListKernel(const DeviceVecd &meshLowerBound, Dev
     : total_real_particles_(0), list_data_pos_(nullptr),  // will be initialized at first UpdateCellLists execution
       mesh_lower_bound_(allocateDeviceData<DeviceVecd>(1)), grid_spacing_(allocateDeviceData<DeviceReal>(1)),
       all_grid_points_(allocateDeviceData<DeviceArrayi>(1)), all_cells_(allocateDeviceData<DeviceArrayi>(1)),
-      index_list_(nullptr), index_head_list_(allocateDeviceData<size_t>(VecdFoldProduct(allCells))),
-      index_head_list_size_(VecdFoldProduct(allCells))
+      linear_cell_size_(VecdFoldProduct(allCells)), offset_cell_size_(allocateDeviceData<DeviceInt>(linear_cell_size_+1)),
+      curr_cell_size_(allocateDeviceData<DeviceInt>(linear_cell_size_))
 {
     copyDataToDevice(&meshLowerBound, mesh_lower_bound_, 1)
         .add(copyDataToDevice(&gridSpacing, grid_spacing_, 1))
@@ -58,64 +58,91 @@ CellLinkedListKernel::~CellLinkedListKernel()
 
 execution::ExecutionEvent CellLinkedListKernel::clearCellLists()
 {
-    // Only clear head list, since index list does not depend on its previous values
-    return std::move(copyDataToDevice(static_cast<size_t>(0), index_head_list_, index_head_list_size_));
+    return std::move(copyDataToDevice(static_cast<DeviceInt>(0), offset_cell_size_, linear_cell_size_+1)
+                .add(copyDataToDevice(static_cast<DeviceInt>(0), curr_cell_size_, linear_cell_size_)));
 }
 
 execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseParticles &base_particles)
 {
-    if(!index_list_)  // initialize list data with base_particles at first execution
+    if(!list_data_pos_)  // initialize list data with base_particles at first execution
     {
         total_real_particles_ = base_particles.total_real_particles_;
         list_data_pos_ = base_particles.getDeviceVariableByName<DeviceVecd>("Position");
-        index_list_ = allocateDeviceData<size_t>(total_real_particles_);
+        particle_id_list_ = allocateDeviceData<DeviceInt>(total_real_particles_);
     }
     auto clear_event = clearCellLists();
-    auto *pos_n = base_particles.getDeviceVariableByName<DeviceVecd>("Position");
-    size_t total_real_particles = base_particles.total_real_particles_;
+
+    DeviceInt total_real_particles = base_particles.total_real_particles_;
+    auto determine_cells_size_event =  executionQueue.getQueue().submit(
+        [&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_,
+         all_cells = all_cells_, offset_cell_size = offset_cell_size_, pos_n = list_data_pos_](sycl::handler &cgh)
+                      {
+                             cgh.depends_on(clear_event.getEventList());
+                             cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles),
+                                              [=](sycl::nd_item<1> item) {
+                                                  if(item.get_global_id() < total_real_particles)
+                                                  {
+                                                        const DeviceInt index_i = item.get_global_id();
+                                                        const auto cell_index = CellIndexFromPosition(pos_n[index_i], *mesh_lower_bound,
+                                                                                                     *grid_spacing, *all_grid_points);
+                                                        const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells);
+                                                        sycl::atomic_ref<DeviceInt, sycl::memory_order_relaxed, sycl::memory_scope_device,
+                                                                        sycl::access::address_space::global_space>
+                                                           atomic_cell_size(offset_cell_size[linear_cell_index]);
+                                                        ++atomic_cell_size;
+                                                  }
+                                         }); });
+
+    auto exclusive_scan_cells_size_event = executionQueue.getQueue().submit(
+        [&, offset_cell_size = offset_cell_size_, num_cells = linear_cell_size_ + 1](sycl::handler &cgh) {
+            cgh.depends_on(determine_cells_size_event);
+            cgh.parallel_for(executionQueue.getUniformNdRange(executionQueue.getWorkGroupSize()),
+                             [=](sycl::nd_item<1> item) {
+                                          if(item.get_group_linear_id() == 0) {
+                                              sycl::joint_exclusive_scan(item.get_group(), offset_cell_size,
+                                                                         offset_cell_size + num_cells,
+                                                                         offset_cell_size, sycl::plus<>{});
+                                          }
+                                      });
+                 });
+
     return executionQueue.getQueue().submit(
-                                 [&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_,
-                                  all_cells = all_cells_, index_list = index_list_, index_head_list = index_head_list_](sycl::handler &cgh)
-                                 {
-                                     cgh.depends_on(clear_event.getEventList());
-                                     cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), [=](sycl::nd_item<1> item)
-                                                      {
-                                 const size_t index_i = item.get_global_id();
-                                 if(index_i < total_real_particles)
+        [&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_,
+        all_cells = all_cells_, particle_id_list = particle_id_list_, offset_cell_size = offset_cell_size_,
+        curr_cell_size = curr_cell_size_, pos_n = list_data_pos_] (sycl::handler &cgh)
+        {
+            cgh.depends_on(exclusive_scan_cells_size_event);
+            cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles),
+                             [=](sycl::nd_item<1> item)
+                             {
+                                 if(item.get_global_id() < total_real_particles)
                                  {
+                                     const DeviceInt index_i = item.get_global_id();
                                      const auto cell_index = CellIndexFromPosition(pos_n[index_i], *mesh_lower_bound,
                                                                                    *grid_spacing, *all_grid_points);
                                      const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells);
-                                     sycl::atomic_ref<size_t, sycl::memory_order_relaxed, sycl::memory_scope_device,
+                                     sycl::atomic_ref<DeviceInt, sycl::memory_order_relaxed, sycl::memory_scope_device,
                                                       sycl::access::address_space::global_space>
-                                         atomic_head_list(index_head_list[linear_cell_index]);
-                                     /*
-                                      * Insert index at the head of the list, the index previously at the top is then
-                                      * used as the next one pointed by the new index.
-                                      * Indices values are increased by 1 to let 0 be the value that indicates list termination.
-                                      * If the cell list is empty (i.e. head == 0) then head will point to the new index and the
-                                      * new index will point to 0 (i.e. the new index will be the first and last element of the cell list).
-                                      *     index_head_list[linear_cell_index] = index_i+1  ---> index_list[index_i] = 0
-                                      * Since the cell list order is not relevant, memory_order_relaxed will only ensure that each cell of
-                                      * index_head_list gets a new index one at a time.
-                                      */
-                                     index_list[index_i] = atomic_head_list.exchange(index_i + 1);
-                                 } });
-                                 });
+                                         atomic_current_cell_size(curr_cell_size[linear_cell_index]);
+
+                                     particle_id_list[offset_cell_size[linear_cell_index] + atomic_current_cell_size++] = index_i;
+                                 }
+                             });
+        });
 }
 //=================================================================================================//
-size_t *CellLinkedListKernel::computingSequence(BaseParticles &baseParticles)
+DeviceInt *CellLinkedListKernel::computingSequence(BaseParticles &baseParticles)
 {
     auto *pos = baseParticles.getDeviceVariableByName<DeviceVecd>("Position");
     auto *sequence = baseParticles.sequence_device_;
-    size_t total_real_particles = baseParticles.total_real_particles_;
+    DeviceInt total_real_particles = baseParticles.total_real_particles_;
     executionQueue.getQueue().submit(
                                  [&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_,
                                   all_grid_points = all_grid_points_](sycl::handler &cgh)
                                  {
                                      cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), [=](sycl::nd_item<1> item)
                                                       {
-                                                          size_t i = item.get_global_id();
+                                                          DeviceInt i = item.get_global_id();
                                                           if(i < total_real_particles)
                                                               sequence[i] = BaseMesh::transferMeshIndexToMortonOrder(
                                                                   CellIndexFromPosition(pos[i], *mesh_lower_bound,