From e11c19fb656d1ed49a7bb1c9e9e6ecfba5637787 Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Fri, 24 May 2024 12:01:36 +0200 Subject: [PATCH 1/8] Added device-specific u_int32 type (DeviceInt) that replaces size_t for kernel executions --- src/shared/common/base_data_type.h | 1 + src/shared/common/vector_functions.h | 8 ++ src/shared/materials/riemann_solver.h | 53 ++++++++++- .../materials/weakly_compressible_fluid.h | 4 +- src/shared/meshes/cell_linked_list.cpp | 14 +-- src/shared/meshes/cell_linked_list.h | 24 ++--- .../external_force/external_force.h | 2 +- .../fluid_dynamics/density_summation.h | 12 +-- .../fluid_dynamics/fluid_integration.h | 88 +++++++++++-------- .../fluid_dynamics/fluid_integration.hpp | 18 ++-- .../fluid_dynamics/fluid_time_step.h | 2 +- .../particle_dynamics/particle_functors.h | 2 +- src/shared/particles/base_particles.cpp | 6 +- src/shared/particles/base_particles.h | 6 +- src/shared/particles/base_particles.hpp | 2 +- 15 files changed, 156 insertions(+), 86 deletions(-) diff --git a/src/shared/common/base_data_type.h b/src/shared/common/base_data_type.h index c776dfb440..6b864371fd 100644 --- a/src/shared/common/base_data_type.h +++ b/src/shared/common/base_data_type.h @@ -93,6 +93,7 @@ using Rotation2d = Eigen::Rotation2D; using Rotation3d = Eigen::AngleAxis; /** Device data types. */ using DeviceReal = Real; +using DeviceInt = u_int32_t; using DeviceVec2d = Vec2d; using DeviceVec3d = Vec3d; using DeviceArray2i = Array2i; diff --git a/src/shared/common/vector_functions.h b/src/shared/common/vector_functions.h index c995838918..5f34c33081 100644 --- a/src/shared/common/vector_functions.h +++ b/src/shared/common/vector_functions.h @@ -202,6 +202,10 @@ inline execution::ExecutionEvent copyDataToDevice(const Real* host, DeviceReal* return transformAndCopyDataToDevice(host, device, size, [](Real val) { return static_cast(val); }); } +inline execution::ExecutionEvent copyDataToDevice(const size_t* host, DeviceInt* device, std::size_t size) { + return transformAndCopyDataToDevice(host, device, size, [](size_t val) { return static_cast(val); }); +} + template inline execution::ExecutionEvent copyDataToDevice(const T& value, T* device, std::size_t size) { return execution::executionQueue.getQueue().fill(device, value, size); @@ -234,5 +238,9 @@ inline execution::ExecutionEvent copyDataFromDevice(Vec3d* host, const DeviceVec inline execution::ExecutionEvent copyDataFromDevice(Real* host, const DeviceReal* device, std::size_t size) { return transformAndCopyDataFromDevice(host, device, size, [](DeviceReal val) { return static_cast(val); }); } + +inline execution::ExecutionEvent copyDataFromDevice(size_t* host, const DeviceInt* device, std::size_t size) { + return transformAndCopyDataFromDevice(host, device, size, [](DeviceInt val) { return static_cast(val); }); +} } // namespace SPH #endif // VECTOR_FUNCTIONS_H diff --git a/src/shared/materials/riemann_solver.h b/src/shared/materials/riemann_solver.h index 2b81a312b0..94582d96ff 100644 --- a/src/shared/materials/riemann_solver.h +++ b/src/shared/materials/riemann_solver.h @@ -48,6 +48,23 @@ struct FluidStateOut FluidStateOut(Real rho, Vecd vel, Real p) : vel_(vel), rho_(rho), p_(p){}; }; +class NoRiemannSolverKernel +{ + public: + NoRiemannSolverKernel(DeviceReal rho0c0_i, DeviceReal rho0c0_j, DeviceReal inv_rho0c0_sum) + : rho0c0_i_(rho0c0_i), rho0c0_j_(rho0c0_j), inv_rho0c0_sum_(inv_rho0c0_sum) {} + + DeviceReal DissipativePJump(const DeviceReal &) const { return DeviceReal(0.0); } + DeviceReal 
DissipativeUJump(const DeviceReal &) const { return DeviceReal(0.0); } + + DeviceReal AverageP(const DeviceReal &p_i, const DeviceReal &p_j) + { + return (p_i * rho0c0_j_ + p_j * rho0c0_i_) * inv_rho0c0_sum_; + }; + protected: + DeviceReal rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_; +}; + /** * @struct NoRiemannSolver * @brief Central difference scheme without Riemann flux. @@ -60,7 +77,8 @@ class NoRiemannSolver : rho0_i_(fluid_i.ReferenceDensity()), rho0_j_(fluid_j.ReferenceDensity()), c0_i_(fluid_i.ReferenceSoundSpeed()), c0_j_(fluid_j.ReferenceSoundSpeed()), rho0c0_i_(rho0_i_ * c0_i_), rho0c0_j_(rho0_j_ * c0_j_), - inv_rho0c0_sum_(1.0 / (rho0c0_i_ + rho0c0_j_)){}; + inv_rho0c0_sum_(1.0 / (rho0c0_i_ + rho0c0_j_)), + device_kernel(rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_) {}; Real DissipativePJump(const Real &u_jump) const { return 0.0; } Real DissipativeUJump(const Real &p_jump) const { return 0.0; } @@ -77,6 +95,32 @@ class NoRiemannSolver Real rho0_i_, rho0_j_; Real c0_i_, c0_j_; Real rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_; + + public: + execution::DeviceImplementation device_kernel; +}; + +class AcousticRiemannSolverKernel : public NoRiemannSolverKernel +{ + public: + template + AcousticRiemannSolverKernel(DeviceReal inv_rho0c0_ave, DeviceReal rho0c0_geo_ave, + DeviceReal inv_c_ave, DeviceReal limiter_coeff, Args&& ...baseArgs) + : NoRiemannSolverKernel(std::forward(baseArgs)...), inv_rho0c0_ave_(inv_rho0c0_ave), + rho0c0_geo_ave_(rho0c0_geo_ave), inv_c_ave_(inv_c_ave), limiter_coeff_(limiter_coeff) {} + + DeviceReal DissipativePJump(const DeviceReal &u_jump) const { + return rho0c0_geo_ave_ * u_jump * sycl::min(limiter_coeff_ * sycl::max(u_jump * inv_c_ave_, DeviceReal(0)), DeviceReal(1)); + } + + DeviceReal DissipativeUJump(const DeviceReal &p_jump) const { + return p_jump * inv_rho0c0_ave_; + } + + protected: + DeviceReal inv_rho0c0_ave_, rho0c0_geo_ave_; + DeviceReal inv_c_ave_; + DeviceReal limiter_coeff_; }; class AcousticRiemannSolver : public NoRiemannSolver @@ -88,7 +132,9 @@ class AcousticRiemannSolver : public NoRiemannSolver inv_rho0c0_ave_(2.0 * inv_rho0c0_sum_), rho0c0_geo_ave_(2.0 * rho0c0_i_ * rho0c0_j_ * inv_rho0c0_sum_), inv_c_ave_(0.5 * (rho0_i_ + rho0_j_) * inv_rho0c0_ave_), - limiter_coeff_(limiter_coeff){}; + limiter_coeff_(limiter_coeff), + device_kernel(inv_rho0c0_ave_, rho0c0_geo_ave_, inv_c_ave_, limiter_coeff_, + rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_) {}; Real DissipativePJump(const Real &u_jump) const { return rho0c0_geo_ave_ * u_jump * SMIN(limiter_coeff_ * SMAX(u_jump * inv_c_ave_, Real(0)), Real(1)); } @@ -101,6 +147,9 @@ class AcousticRiemannSolver : public NoRiemannSolver Real inv_rho0c0_ave_, rho0c0_geo_ave_; Real inv_c_ave_; Real limiter_coeff_; + + public: + execution::DeviceImplementation device_kernel; }; class DissipativeRiemannSolver : public AcousticRiemannSolver diff --git a/src/shared/materials/weakly_compressible_fluid.h b/src/shared/materials/weakly_compressible_fluid.h index b3418df974..d1543db2d9 100644 --- a/src/shared/materials/weakly_compressible_fluid.h +++ b/src/shared/materials/weakly_compressible_fluid.h @@ -44,10 +44,10 @@ class WeaklyCompressibleFluidKernel inline DeviceReal getPressure(DeviceReal rho) const { return p0_ * (rho / rho0_ - static_cast(1.0)); } inline DeviceReal getSoundSpeed(DeviceReal p = 0.0, DeviceReal rho = 1.0) const { return c0_; } inline DeviceReal ReferenceDensity() const { return rho0_; } - inline DeviceReal ReferenceSoundSpeed() { return c0_; }; + inline DeviceReal ReferenceSoundSpeed() const { return c0_; }; 
private: - DeviceReal p0_, c0_, rho0_; + const DeviceReal p0_, c0_, rho0_; }; /** diff --git a/src/shared/meshes/cell_linked_list.cpp b/src/shared/meshes/cell_linked_list.cpp index ee6eeab92b..0fb29d4d60 100644 --- a/src/shared/meshes/cell_linked_list.cpp +++ b/src/shared/meshes/cell_linked_list.cpp @@ -39,7 +39,7 @@ CellLinkedListKernel::CellLinkedListKernel(const DeviceVecd &meshLowerBound, Dev : total_real_particles_(0), list_data_pos_(nullptr), // will be initialized at first UpdateCellLists execution mesh_lower_bound_(allocateDeviceData(1)), grid_spacing_(allocateDeviceData(1)), all_grid_points_(allocateDeviceData(1)), all_cells_(allocateDeviceData(1)), - index_list_(nullptr), index_head_list_(allocateDeviceData(VecdFoldProduct(allCells))), + index_list_(nullptr), index_head_list_(allocateDeviceData(VecdFoldProduct(allCells))), index_head_list_size_(VecdFoldProduct(allCells)) { copyDataToDevice(&meshLowerBound, mesh_lower_bound_, 1) @@ -59,7 +59,7 @@ CellLinkedListKernel::~CellLinkedListKernel() execution::ExecutionEvent CellLinkedListKernel::clearCellLists() { // Only clear head list, since index list does not depend on its previous values - return std::move(copyDataToDevice(static_cast(0), index_head_list_, index_head_list_size_)); + return std::move(copyDataToDevice(static_cast(0), index_head_list_, index_head_list_size_)); } execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseParticles &base_particles) @@ -68,7 +68,7 @@ execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseP { total_real_particles_ = base_particles.total_real_particles_; list_data_pos_ = base_particles.getDeviceVariableByName("Position"); - index_list_ = allocateDeviceData(total_real_particles_); + index_list_ = allocateDeviceData(total_real_particles_); } auto clear_event = clearCellLists(); auto *pos_n = base_particles.getDeviceVariableByName("Position"); @@ -86,7 +86,7 @@ execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseP const auto cell_index = CellIndexFromPosition(pos_n[index_i], *mesh_lower_bound, *grid_spacing, *all_grid_points); const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells); - sycl::atomic_ref atomic_head_list(index_head_list[linear_cell_index]); /* @@ -104,18 +104,18 @@ execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseP }); } //=================================================================================================// -size_t *CellLinkedListKernel::computingSequence(BaseParticles &baseParticles) +DeviceInt *CellLinkedListKernel::computingSequence(BaseParticles &baseParticles) { auto *pos = baseParticles.getDeviceVariableByName("Position"); auto *sequence = baseParticles.sequence_device_; - size_t total_real_particles = baseParticles.total_real_particles_; + DeviceInt total_real_particles = baseParticles.total_real_particles_; executionQueue.getQueue().submit( [&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_](sycl::handler &cgh) { cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), [=](sycl::nd_item<1> item) { - size_t i = item.get_global_id(); + DeviceInt i = item.get_global_id(); if(i < total_real_particles) sequence[i] = BaseMesh::transferMeshIndexToMortonOrder( CellIndexFromPosition(pos[i], *mesh_lower_bound, diff --git a/src/shared/meshes/cell_linked_list.h b/src/shared/meshes/cell_linked_list.h index 1527030f4f..244b62174a 100644 --- a/src/shared/meshes/cell_linked_list.h 
+++ b/src/shared/meshes/cell_linked_list.h @@ -93,19 +93,19 @@ class CellLinkedListKernel { execution::ExecutionEvent UpdateCellLists(const BaseParticles &base_particles); template - void forEachNeighbor(size_t index_i, const DeviceVecd *self_position, + void forEachNeighbor(DeviceInt index_i, const DeviceVecd *self_position, const FunctionOnEach &function) const { const DeviceVecd pos_i = self_position[index_i]; - const size_t search_depth = 1; + const DeviceInt search_depth = 1; const auto target_cell_index = CellIndexFromPosition(pos_i, *mesh_lower_bound_, *grid_spacing_, *all_grid_points_); mesh_for_each_array( VecdMax(DeviceArrayi{0}, DeviceArrayi{target_cell_index - search_depth}), - VecdMin(*all_cells_, DeviceArrayi{target_cell_index + search_depth + 1}), + VecdMin(*all_cells_, DeviceArrayi{target_cell_index + search_depth + DeviceInt{1}}), [&](DeviceArrayi&& cell_index) { const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells_); - size_t index_j = index_head_list_[linear_cell_index]; + DeviceInt index_j = index_head_list_[linear_cell_index]; // Cell list ends when index_j == 0, if index_j is already zero then cell is empty. while(index_j--) { // abbreviates while(index_j != 0) { index_j -= 1; ... } function(pos_i, index_j, list_data_pos_[index_j]); @@ -121,7 +121,7 @@ class CellLinkedListKernel { } - size_t* computingSequence(BaseParticles &baseParticles); + DeviceInt* computingSequence(BaseParticles &baseParticles); template static inline DeviceArrayi CellIndexFromPosition(const sycl::vec &position, const sycl::vec& mesh_lower_bound, @@ -130,7 +130,7 @@ class CellLinkedListKernel { return sycl::min( sycl::max( sycl::floor((position - mesh_lower_bound) / grid_spacing) - .template convert(), DeviceArrayi{0}), + .template convert(), DeviceArrayi{0}), all_grid_points - DeviceArrayi{2}); } @@ -143,12 +143,12 @@ class CellLinkedListKernel { return VecdMin(VecdMax(pos_floor, DeviceArrayi{0}), DeviceArrayi{all_grid_points - DeviceArrayi{2}}); } - static inline size_t transferCellIndexTo1D(const DeviceArray2i &cell_index, const DeviceArray2i &all_cells) + static inline DeviceInt transferCellIndexTo1D(const DeviceArray2i &cell_index, const DeviceArray2i &all_cells) { return cell_index[0] * all_cells[1] + cell_index[1]; } - static inline size_t transferCellIndexTo1D(const DeviceArray3i &cell_index, const DeviceArray3i &all_cells) + static inline DeviceInt transferCellIndexTo1D(const DeviceArray3i &cell_index, const DeviceArray3i &all_cells) { return cell_index[0] * all_cells[1] * all_cells[2] + cell_index[1] * all_cells[2] + @@ -156,16 +156,16 @@ class CellLinkedListKernel { } private: - size_t total_real_particles_; + DeviceInt total_real_particles_; DeviceVecd* list_data_pos_; DeviceVecd *mesh_lower_bound_; DeviceReal *grid_spacing_; DeviceArrayi *all_grid_points_, *all_cells_; - size_t* index_list_; - size_t* index_head_list_; - size_t index_head_list_size_; + DeviceInt* index_list_; + DeviceInt* index_head_list_; + DeviceInt index_head_list_size_; }; diff --git a/src/shared/particle_dynamics/external_force/external_force.h b/src/shared/particle_dynamics/external_force/external_force.h index 173b25acbd..0e30aaecf1 100644 --- a/src/shared/particle_dynamics/external_force/external_force.h +++ b/src/shared/particle_dynamics/external_force/external_force.h @@ -75,7 +75,7 @@ class Gravity template>, std::is_same, std::true_type>>> - DeviceVecd InducedAcceleration(const Vecd &position = VecdZero()) const { + DeviceVecd InducedAcceleration(const Vec &position = VecdZero()) 
const { return global_acceleration_device_; } template> : public DensitySummationKernel>(std::forward(baseArgs)...) {} - void interaction(size_t index_i, Real dt = 0.0) { + void interaction(DeviceInt index_i, DeviceReal dt = 0.0) { DeviceReal sigma = W0_; const auto &neighbor_builder = *inner_neighbor_builder_; cell_linked_list_->forEachInnerNeighbor(index_i, [&](const DeviceVecd &pos_i, size_t index_j, const DeviceVecd &pos_j) @@ -118,7 +118,7 @@ class DensitySummationKernel> : public DensitySummationKernel> : public DensitySummationKernelforEachNeighbor(index_i, particles_position_, - [&](const DeviceVecd &pos_i, size_t index_j, const DeviceVecd &pos_j) + [&](const DeviceVecd &pos_i, DeviceInt index_j, const DeviceVecd &pos_j) { if(neighbor_builder.isWithinCutoff(pos_i, pos_j)) sigma += neighbor_builder.W_ij(pos_i, pos_j) * @@ -194,7 +194,7 @@ class DensitySummationKernel> : public DensitySummationKernel diff --git a/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.h b/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.h index b1b65148bb..1b1848b83a 100644 --- a/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.h +++ b/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.h @@ -84,7 +84,7 @@ class BaseIntegrationKernel { force_prior_(particles->getDeviceVariableByName("ForcePrior")) {} protected: - MaterialTypeKernel fluid_; + const MaterialTypeKernel fluid_; DeviceReal *rho_, *p_, *drho_dt_, *mass_, *Vol_; DeviceVecd *pos_, *vel_, *force_, *force_prior_; }; @@ -113,23 +113,24 @@ template class Integration1stHalfKernel, RiemannSolverType, MaterialType> : public BaseIntegrationKernel { public: - Integration1stHalfKernel(BaseInnerRelation& inner_relation, BaseParticles* particles) - : BaseIntegrationKernel(particles), - correction_(particles), riemann_solver_(this->fluid_, this->fluid_), + Integration1stHalfKernel(BaseInnerRelation& inner_relation, BaseParticles* particles, + const RiemannSolverType &riemann_solver) + : BaseIntegrationKernel(particles), correction_(particles), + riemann_solver_(*riemann_solver.device_kernel.get_ptr()), cell_linked_list_(inner_relation.getInnerCellLinkedListDevice()), inner_neighbor_builder_(inner_relation.getInnerNeighborBuilderDevice()) {} - void initialization(size_t index_i, Real dt = 0.0) { - this->rho_[index_i] += this->drho_dt_[index_i] * dt * 0.5; + void initialization(DeviceInt index_i, DeviceReal dt = 0.0) { + this->rho_[index_i] += this->drho_dt_[index_i] * dt * DeviceReal(0.5); this->p_[index_i] = this->fluid_.getPressure(this->rho_[index_i]); - this->pos_[index_i] += this->vel_[index_i] * dt * 0.5; + this->pos_[index_i] += this->vel_[index_i] * dt * DeviceReal(0.5); } - void update(size_t index_i, Real dt = 0.0) { + void update(DeviceInt index_i, DeviceReal dt = 0.0) { this->vel_[index_i] += (this->force_prior_[index_i] + this->force_[index_i]) / this->mass_[index_i] * dt; } - void interaction(size_t index_i, Real dt = 0.0) { + void interaction(DeviceInt index_i, DeviceReal dt = 0.0) { auto force = VecdZero(); DeviceReal rho_dissipation(0); const auto &pressure_i = this->p_[index_i]; @@ -152,10 +153,12 @@ class Integration1stHalfKernel, RiemannSolverType, MaterialType> : publi } protected: - RiemannSolverType riemann_solver_; - NoKernelCorrection correction_; - CellLinkedListKernel* cell_linked_list_; - NeighborBuilderInnerKernel* inner_neighbor_builder_; + using RiemannSolverTypeKernel = typename decltype(RiemannSolverType::device_kernel)::KernelType; + + const RiemannSolverTypeKernel 
riemann_solver_; + const NoKernelCorrection correction_; + const CellLinkedListKernel* cell_linked_list_; + const NeighborBuilderInnerKernel* inner_neighbor_builder_; }; @@ -208,16 +211,16 @@ class Integration1stHalfKernel, RiemannSolverType, MaterialType> { public: template - Integration1stHalfKernel(const BaseContactRelation& contact_relation, - BaseParticles* particles, BaseArgs&& ...args) + Integration1stHalfKernel(const BaseContactRelation& contact_relation, BaseParticles* particles, + const RiemannSolverType& riemann_solver, BaseArgs&& ...args) : BaseIntegrationWithWallKernel(particles, std::forward(args)...), - correction_(particles), riemann_solver_(this->fluid_, this->fluid_), + correction_(particles), riemann_solver_(*riemann_solver.device_kernel.get_ptr()), contact_bodies_size_(contact_relation.contact_bodies_.size()), contact_cell_linked_lists_(contact_relation.getContactCellLinkedListsDevice()), contact_neighbor_builders_(contact_relation.getContactNeighborBuilderDevice()), particles_position_(contact_relation.base_particles_.getDeviceVariableByName("Position")) {} - void interaction(size_t index_i, Real dt = 0.0) { + void interaction(DeviceInt index_i, DeviceReal dt = 0.0) { DeviceVecd force = VecdZero(); DeviceReal rho_dissipation{0}; const DeviceVecd &force_prior_i = this->force_prior_[index_i]; @@ -226,14 +229,14 @@ class Integration1stHalfKernel, RiemannSolverType, MaterialType> &pressure_i{this->p_[index_i]}, &rho_i{this->rho_[index_i]}; const auto correction_i = correction_(index_i); - for (size_t k = 0; k < contact_bodies_size_; ++k) + for (auto k = 0; k < contact_bodies_size_; ++k) { const DeviceVecd* force_ave_k = this->wall_force_ave_[k]; const DeviceReal* wall_mass_k = this->wall_mass_[k]; const DeviceReal* wall_Vol_k = this->wall_Vol_[k]; const auto& neighbor_builder = *contact_neighbor_builders_[k]; contact_cell_linked_lists_[k]->forEachNeighbor(index_i, particles_position_, - [&](const DeviceVecd &pos_i, size_t index_j, const DeviceVecd &pos_j) + [&](const DeviceVecd &pos_i, DeviceInt index_j, const DeviceVecd &pos_j) { if (neighbor_builder.isWithinCutoff(pos_i, pos_j)) { @@ -253,12 +256,14 @@ class Integration1stHalfKernel, RiemannSolverType, MaterialType> } protected: - NoKernelCorrection correction_; - RiemannSolverType riemann_solver_; - size_t contact_bodies_size_; + using RiemannSolverTypeKernel = typename decltype(RiemannSolverType::device_kernel)::KernelType; + + const NoKernelCorrection correction_; + const RiemannSolverTypeKernel riemann_solver_; + const DeviceInt contact_bodies_size_; CellLinkedListKernel **contact_cell_linked_lists_; NeighborBuilderContactKernel **contact_neighbor_builders_; - DeviceVecd *particles_position_; + const DeviceVecd *particles_position_; }; template @@ -316,21 +321,22 @@ template class Integration2ndHalfKernel, RiemannSolverType, MaterialType> : public BaseIntegrationKernel { public: - Integration2ndHalfKernel(BaseInnerRelation& inner_relation, BaseParticles* particles) + Integration2ndHalfKernel(BaseInnerRelation& inner_relation, BaseParticles* particles, + const RiemannSolverType &riemann_solver) : BaseIntegrationKernel(particles), - riemann_solver_(this->fluid_, this->fluid_), + riemann_solver_(*riemann_solver.device_kernel.get_ptr()), cell_linked_list_(inner_relation.getInnerCellLinkedListDevice()), inner_neighbor_builder_(inner_relation.getInnerNeighborBuilderDevice()) {} - void initialization(size_t index_i, Real dt = 0.0) { - this->pos_[index_i] += this->vel_[index_i] * dt * 0.5; + void initialization(DeviceInt 
index_i, DeviceReal dt = 0.0) { + this->pos_[index_i] += this->vel_[index_i] * dt * DeviceReal(0.5); } - void update(size_t index_i, Real dt = 0.0) { - this->rho_[index_i] += this->drho_dt_[index_i] * dt * 0.5; + void update(DeviceInt index_i, DeviceReal dt = 0.0) { + this->rho_[index_i] += this->drho_dt_[index_i] * dt * DeviceReal(0.5); } - void interaction(size_t index_i, Real dt = 0.0) + void interaction(DeviceInt index_i, DeviceReal dt = 0.0) { DeviceReal density_change_rate(0); auto p_dissipation = VecdZero(); @@ -353,7 +359,9 @@ class Integration2ndHalfKernel, RiemannSolverType, MaterialType> : publi } protected: - RiemannSolverType riemann_solver_; + using RiemannSolverTypeKernel = typename decltype(RiemannSolverType::device_kernel)::KernelType; + + const RiemannSolverTypeKernel riemann_solver_; CellLinkedListKernel* cell_linked_list_; NeighborBuilderInnerKernel* inner_neighbor_builder_; }; @@ -388,28 +396,28 @@ class Integration2ndHalfKernel, RiemannSolverType, MaterialType> { public: template - Integration2ndHalfKernel(const BaseContactRelation& contact_relation, - BaseParticles* particles, BaseArgs&& ...args) + Integration2ndHalfKernel(const BaseContactRelation& contact_relation, BaseParticles* particles, + const RiemannSolverType &riemann_solver, BaseArgs&& ...args) : BaseIntegrationWithWallKernel(particles, std::forward(args)...), - riemann_solver_(this->fluid_, this->fluid_), + riemann_solver_(*riemann_solver.device_kernel.get_ptr()), contact_bodies_size_(contact_relation.contact_bodies_.size()), contact_cell_linked_lists_(contact_relation.getContactCellLinkedListsDevice()), contact_neighbor_builders_(contact_relation.getContactNeighborBuilderDevice()), particles_position_(contact_relation.base_particles_.getDeviceVariableByName("Position")) {} - void interaction(size_t index_i, Real dt = 0.0) { + void interaction(DeviceInt index_i, DeviceReal dt = 0.0) { DeviceReal density_change_rate{0}; auto p_dissipation = VecdZero(); const DeviceVecd vel_i = this->vel_[index_i]; const DeviceReal mass_i = this->mass_[index_i]; - for (size_t k = 0; k < contact_bodies_size_; ++k) + for (auto k = 0; k < contact_bodies_size_; ++k) { const DeviceReal *wall_Vol_k = this->wall_Vol_[k]; const DeviceVecd *vel_ave_k = this->wall_vel_ave_[k]; const DeviceVecd *n_k = this->wall_n_[k]; const auto& neighbor_builder = *contact_neighbor_builders_[k]; contact_cell_linked_lists_[k]->forEachNeighbor(index_i, particles_position_, - [&](const DeviceVecd &pos_i, size_t index_j, const DeviceVecd &pos_j) + [&](const DeviceVecd &pos_i, DeviceInt index_j, const DeviceVecd &pos_j) { if (neighbor_builder.isWithinCutoff(pos_i, pos_j)) { @@ -428,8 +436,10 @@ class Integration2ndHalfKernel, RiemannSolverType, MaterialType> } protected: - RiemannSolverType riemann_solver_; - size_t contact_bodies_size_; + using RiemannSolverTypeKernel = typename decltype(RiemannSolverType::device_kernel)::KernelType; + + const RiemannSolverTypeKernel riemann_solver_; + const DeviceInt contact_bodies_size_; CellLinkedListKernel **contact_cell_linked_lists_; NeighborBuilderContactKernel **contact_neighbor_builders_; DeviceVecd *particles_position_; diff --git a/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.hpp b/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.hpp index 2c568a7102..2bd28e5364 100644 --- a/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.hpp +++ b/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.hpp @@ -23,7 +23,7 @@ Integration1stHalf, RiemannSolverType, 
KernelCorrectionType>:: Integration1stHalf(BaseInnerRelation &inner_relation) : BaseIntegration(inner_relation), correction_(particles_), riemann_solver_(fluid_, fluid_), - device_kernel(inner_relation, particles_) + device_kernel(inner_relation, particles_, riemann_solver_) { static_assert(std::is_base_of::value, "KernelCorrection is not the base of KernelCorrectionType!"); @@ -84,9 +84,10 @@ Integration1stHalf, RiemannSolverType, KernelCorrectionType>:: Integration1stHalf(BaseContactRelation &wall_contact_relation) : BaseIntegrationWithWall(wall_contact_relation), correction_(particles_), riemann_solver_(fluid_, fluid_), - device_kernel(wall_contact_relation, particles_, wall_mass_device_.data(), - wall_Vol_device_.data(), wall_vel_ave_device_.data(), - wall_force_ave_device_.data(), wall_n_device_.data()) {} + device_kernel(wall_contact_relation, particles_, riemann_solver_, + wall_mass_device_.data(), wall_Vol_device_.data(), + wall_vel_ave_device_.data(), wall_force_ave_device_.data(), + wall_n_device_.data()) {} //=================================================================================================// template void Integration1stHalf, RiemannSolverType, KernelCorrectionType>::interaction(size_t index_i, Real dt) @@ -165,7 +166,7 @@ Integration2ndHalf, RiemannSolverType>:: Integration2ndHalf(BaseInnerRelation &inner_relation) : BaseIntegration(inner_relation), riemann_solver_(this->fluid_, this->fluid_), mass_(particles_->mass_), Vol_(particles_->Vol_), - device_kernel(inner_relation, particles_) {} + device_kernel(inner_relation, particles_, riemann_solver_) {} //=================================================================================================// template void Integration2ndHalf, RiemannSolverType>::initialization(size_t index_i, Real dt) @@ -204,9 +205,10 @@ Integration2ndHalf, RiemannSolverType>:: Integration2ndHalf(BaseContactRelation &wall_contact_relation) : BaseIntegrationWithWall(wall_contact_relation), riemann_solver_(this->fluid_, this->fluid_), - device_kernel(wall_contact_relation, particles_, wall_mass_device_.data(), - wall_Vol_device_.data(), wall_vel_ave_device_.data(), - wall_force_ave_device_.data(), wall_n_device_.data()) {} + device_kernel(wall_contact_relation, particles_, riemann_solver_, + wall_mass_device_.data(), wall_Vol_device_.data(), + wall_vel_ave_device_.data(), wall_force_ave_device_.data(), + wall_n_device_.data()) {} //=================================================================================================// template void Integration2ndHalf, RiemannSolverType>::interaction(size_t index_i, Real dt) diff --git a/src/shared/particle_dynamics/fluid_dynamics/fluid_time_step.h b/src/shared/particle_dynamics/fluid_dynamics/fluid_time_step.h index dcef86335d..61f8f356f5 100644 --- a/src/shared/particle_dynamics/fluid_dynamics/fluid_time_step.h +++ b/src/shared/particle_dynamics/fluid_dynamics/fluid_time_step.h @@ -55,7 +55,7 @@ class AcousticTimeStepSizeKernel { DeviceReal reduce(size_t index_i, Real dt = 0.0) const { DeviceReal acceleration_scale = 4.0 * smoothing_length_min_ * - (force_[index_i] + force_prior_[index_i]).norm() / mass_[index_i]; + VecdNorm(DeviceVecd(force_[index_i] + force_prior_[index_i])) / mass_[index_i]; return sycl::fmax(fluid_.getSoundSpeed(p_[index_i], rho_[index_i]) + VecdNorm(vel_[index_i]), acceleration_scale); } diff --git a/src/shared/particle_dynamics/particle_functors.h b/src/shared/particle_dynamics/particle_functors.h index aa239991aa..20b48e0461 100644 --- 
a/src/shared/particle_dynamics/particle_functors.h +++ b/src/shared/particle_dynamics/particle_functors.h @@ -206,7 +206,7 @@ class NoKernelCorrection : public KernelCorrection { public: NoKernelCorrection(BaseParticles *particles) : KernelCorrection(){}; - Real operator()(size_t index_i) + Real operator()(size_t index_i) const { return 1.0; }; diff --git a/src/shared/particles/base_particles.cpp b/src/shared/particles/base_particles.cpp index 375bf6140d..459ab7e888 100644 --- a/src/shared/particles/base_particles.cpp +++ b/src/shared/particles/base_particles.cpp @@ -344,9 +344,9 @@ void BaseParticles::readFromXmlForReloadParticle(std::string &filefullpath) } //=================================================================================================// void BaseParticles::registerDeviceMemory() { - unsorted_id_device_ = allocateDeviceData(total_real_particles_); - sorted_id_device_ = allocateDeviceData(total_real_particles_); - sequence_device_ = allocateDeviceData(total_real_particles_); + unsorted_id_device_ = allocateDeviceData(total_real_particles_); + sorted_id_device_ = allocateDeviceData(total_real_particles_); + sequence_device_ = allocateDeviceData(total_real_particles_); registerDeviceVariable("Position", total_real_particles_, pos_.data()); registerDeviceVariable("Velocity", total_real_particles_, vel_.data()); registerDeviceVariable("Force", total_real_particles_, force_.data()); diff --git a/src/shared/particles/base_particles.h b/src/shared/particles/base_particles.h index 3e91d5d889..1604efbc07 100644 --- a/src/shared/particles/base_particles.h +++ b/src/shared/particles/base_particles.h @@ -189,9 +189,9 @@ class BaseParticles ParticleData sortable_data_; ParticleVariables sortable_variables_; ParticleSorting particle_sorting_; - size_t* unsorted_id_device_ = nullptr; /**< copy of unsorted ids stored inside device. */ - size_t* sorted_id_device_ = nullptr; /**< copy of sorted ids stored inside device. */ - size_t* sequence_device_ = nullptr; /**< the sequence referred for sorting within device execution. */ + DeviceInt* unsorted_id_device_ = nullptr; /**< copy of unsorted ids stored inside device. */ + DeviceInt* sorted_id_device_ = nullptr; /**< copy of sorted ids stored inside device. */ + DeviceInt* sequence_device_ = nullptr; /**< the sequence referred for sorting within device execution. 
*/ DeviceVariables sortable_device_variables_; template diff --git a/src/shared/particles/base_particles.hpp b/src/shared/particles/base_particles.hpp index a3396a4257..4ffb585777 100644 --- a/src/shared/particles/base_particles.hpp +++ b/src/shared/particles/base_particles.hpp @@ -215,7 +215,7 @@ void BaseParticles::registerSortableVariable(const std::string &variable_name) template void BaseParticles::sortParticles(SequenceMethod &sequence_method, ExecutionPolicy execution_policy) { - size_t* sequence = sequence_method.computingSequence(*this); + auto* sequence = sequence_method.computingSequence(*this); particle_sorting_.sortingParticleData(sequence, total_real_particles_, execution_policy); } //=================================================================================================// From 858736bff5b27f7233c09d3085cde76c6bd95d59 Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Fri, 24 May 2024 12:16:56 +0200 Subject: [PATCH 2/8] Use DeviceInt for device sorting methods --- src/shared/particles/particle_sorting.cpp | 88 +++++++++++------------ src/shared/particles/particle_sorting.h | 44 ++++++------ 2 files changed, 66 insertions(+), 66 deletions(-) diff --git a/src/shared/particles/particle_sorting.cpp b/src/shared/particles/particle_sorting.cpp index 01b8bab46f..67d57967db 100644 --- a/src/shared/particles/particle_sorting.cpp +++ b/src/shared/particles/particle_sorting.cpp @@ -27,7 +27,7 @@ void SwapSortableParticleData::operator()(size_t *a, size_t *b) MoveSortableParticleDeviceData::MoveSortableParticleDeviceData(BaseParticles &base_particles) : base_particles_(base_particles), initialized_swap_variables_(false) {} //=================================================================================================// -void MoveSortableParticleDeviceData::operator()(size_t *index_permutation, size_t size) +void MoveSortableParticleDeviceData::operator()(DeviceInt *index_permutation, DeviceInt size) { if (!initialized_swap_variables_) { @@ -73,25 +73,24 @@ void ParticleSorting::updateSortedId() //=================================================================================================// void ParticleSorting::updateSortedDeviceId() const { - size_t *unsorted_id_device = base_particles_.unsorted_id_device_; - size_t *sorted_id_device = base_particles_.sorted_id_device_; - size_t total_real_particles = base_particles_.total_real_particles_; + DeviceInt *unsorted_id_device = base_particles_.unsorted_id_device_; + DeviceInt *sorted_id_device = base_particles_.sorted_id_device_; + DeviceInt total_real_particles = base_particles_.total_real_particles_; execution::executionQueue.getQueue() .parallel_for(execution::executionQueue.getUniformNdRange(total_real_particles), [=](sycl::nd_item<1> item) { - size_t i = item.get_global_id(); + DeviceInt i = item.get_global_id(); if (i < total_real_particles) sorted_id_device[unsorted_id_device[i]] = i; }) .wait(); } //=================================================================================================// -template <> -void ParticleSorting::sortingParticleData(size_t *begin, size_t size, execution::ParallelSYCLDevicePolicy execution_policy) +void ParticleSorting::sortingParticleData(DeviceInt *begin, DeviceInt size, execution::ParallelSYCLDevicePolicy execution_policy) { if (!index_sorting_device_variables_) - index_sorting_device_variables_ = allocateDeviceData(size); + index_sorting_device_variables_ = allocateDeviceData(size); device_radix_sorting.sort_by_key(begin, index_sorting_device_variables_, size, 
execution::executionQueue.getQueue(), 512, 4).wait(); @@ -101,17 +100,17 @@ void ParticleSorting::sortingParticleData(size_t *begin, size_t size, execution: } //=================================================================================================// template -SYCL_EXTERNAL size_t DeviceRadixSort::split_count(bool bit, sycl::nd_item<1> &item) +SYCL_EXTERNAL DeviceInt DeviceRadixSort::split_count(bool bit, sycl::nd_item<1> &item) { const auto group_range = item.get_local_range().size(); - const size_t id = item.get_local_id(); + const DeviceInt id = item.get_local_id(); // Count 1s held by lower-numbered work-items and current work-item - size_t true_before = sycl::inclusive_scan_over_group(item.get_group(), static_cast(bit), - sycl::plus{}); // prefix sum over group + DeviceInt true_before = sycl::inclusive_scan_over_group(item.get_group(), static_cast(bit), + sycl::plus<>{}); // prefix sum over group // Total number of 0s - size_t false_totals = sycl::group_broadcast(item.get_group(), group_range - true_before, + DeviceInt false_totals = sycl::group_broadcast(item.get_group(), group_range - true_before, group_range - 1); // Return work-item's rank @@ -119,24 +118,24 @@ SYCL_EXTERNAL size_t DeviceRadixSort::split_count(bool bit, sycl::nd_ } //=================================================================================================// template -size_t DeviceRadixSort::get_digit(size_t key, size_t d, size_t radix_bits) +DeviceInt DeviceRadixSort::get_digit(DeviceInt key, DeviceInt d, DeviceInt radix_bits) { - return (key >> d * radix_bits) & ((1ul << radix_bits) - 1); + return (key >> d * radix_bits) & ((static_cast(1) << radix_bits) - 1); } //=================================================================================================// template -size_t DeviceRadixSort::get_bit(size_t key, size_t b) +DeviceInt DeviceRadixSort::get_bit(DeviceInt key, DeviceInt b) { return (key >> b) & 1; } //=================================================================================================// template -size_t DeviceRadixSort::find_max_element(const size_t *data, size_t size, size_t identity) +DeviceInt DeviceRadixSort::find_max_element(const DeviceInt *data, DeviceInt size, DeviceInt identity) { - size_t result = identity; + DeviceInt result = identity; auto &sycl_queue = execution::executionQueue.getQueue(); { - sycl::buffer buffer_result(&result, 1); + sycl::buffer buffer_result(&result, 1); sycl_queue.submit([&](sycl::handler &cgh) { auto reduction_operator = sycl::reduction(buffer_result, cgh, sycl::maximum<>()); @@ -151,7 +150,7 @@ size_t DeviceRadixSort::find_max_element(const size_t *data, size_t s } //=================================================================================================// template -void DeviceRadixSort::resize(size_t data_size, size_t radix_bits, size_t workgroup_size) +void DeviceRadixSort::resize(DeviceInt data_size, DeviceInt radix_bits, DeviceInt workgroup_size) { data_size_ = data_size; radix_bits_ = radix_bits; @@ -161,31 +160,32 @@ void DeviceRadixSort::resize(size_t data_size, size_t radix_bits, siz kernel_range_ = {uniform_global_size_, workgroup_size}; workgroups_ = kernel_range_.get_group_range().size(); - radix_ = 1ul << radix_bits; // radix = 2^b + radix_ = static_cast(1) << radix_bits; // radix = 2^b - sycl::range<2> buckets_column_major_range = {radix_, workgroups_}, buckets_row_major_range = {workgroups_, radix_}; + sycl::range<2> buckets_column_major_range = {static_cast(radix_), static_cast(workgroups_)}, + 
buckets_row_major_range = {static_cast(workgroups_), static_cast(radix_)}; // Each entry contains global number of digits with the same value // Column-major, so buckets offsets can be computed by just applying a scan over it - global_buckets_ = std::make_unique>(buckets_column_major_range); + global_buckets_ = std::make_unique>(buckets_column_major_range); // Each entry contains global number of digits with the same and lower values - global_buckets_offsets_ = std::make_unique>(buckets_column_major_range); - local_buckets_offsets_buffer_ = std::make_unique>(buckets_row_major_range); // save state of local accessor + global_buckets_offsets_ = std::make_unique>(buckets_column_major_range); + local_buckets_offsets_buffer_ = std::make_unique>(buckets_row_major_range); // save state of local accessor data_swap_buffer_ = std::make_unique>(uniform_global_size_); // temporary memory for swapping // Keep extra values to be swapped when kernel range has been made uniform uniform_extra_swap_buffer_ = std::make_unique>(uniform_global_size_ - data_size); } //=================================================================================================// template -sycl::event DeviceRadixSort::sort_by_key(size_t *keys, ValueType *data, size_t data_size, sycl::queue &queue, size_t workgroup_size, size_t radix_bits) +sycl::event DeviceRadixSort::sort_by_key(DeviceInt *keys, ValueType *data, DeviceInt data_size, sycl::queue &queue, DeviceInt workgroup_size, DeviceInt radix_bits) { if(data_size_ != data_size || radix_bits_ != radix_bits || workgroup_size_ != workgroup_size) resize(data_size, radix_bits, workgroup_size); // Largest key, increased by 1 if the workgroup is not homogeneous with the data vector, // the new maximum will be used for those work-items out of data range, that will then be excluded once sorted - const size_t max_key = find_max_element(keys, data_size, 0ul) + (uniform_case_masking_ ? 1 : 0); - const size_t bits_max_key = std::floor(std::log2(max_key)) + 1.0; // bits needed to represent max_key - const size_t length = max_key ? bits_max_key / radix_bits + (bits_max_key % radix_bits ? 1 : 0) : 1; // max number of radix digits + const DeviceInt max_key = find_max_element(keys, data_size, 0ul) + (uniform_case_masking_ ? 1 : 0); + const DeviceInt bits_max_key = std::floor(std::log2(max_key)) + 1.0; // bits needed to represent max_key + const DeviceInt length = max_key ? bits_max_key / radix_bits + (bits_max_key % radix_bits ? 
1 : 0) : 1; // max number of radix digits sycl::event sort_event{}; for (int digit = 0; digit < length; ++digit) @@ -195,14 +195,14 @@ sycl::event DeviceRadixSort::sort_by_key(size_t *keys, ValueType *dat { cgh.depends_on(sort_event); auto data_swap_acc = data_swap_buffer_->get_access(cgh, sycl::write_only, sycl::no_init); - auto local_buckets = sycl::local_accessor(radix_, cgh); + auto local_buckets = sycl::local_accessor(radix_, cgh); auto local_output = sycl::local_accessor(kernel_range_.get_local_range(), cgh); auto global_buckets_accessor = global_buckets_->get_access(cgh, sycl::read_write, sycl::no_init); auto local_buckets_offsets_accessor = local_buckets_offsets_buffer_->get_access(cgh, sycl::write_only, sycl::no_init); cgh.parallel_for(kernel_range_, [=, radix=radix_](sycl::nd_item<1> item) { - const size_t workgroup = item.get_group_linear_id(), + const DeviceInt workgroup = item.get_group_linear_id(), global_id = item.get_global_id(); SortablePair number; @@ -221,7 +221,7 @@ sycl::event DeviceRadixSort::sort_by_key(size_t *keys, ValueType *dat auto radix_digit = get_digit(number.first, digit, radix_bits); auto rank = split_count(get_bit(radix_digit, 0), item); // sorting first bit local_output[rank] = number; - for (size_t b = 1; b < radix_bits; ++b) { // sorting remaining bits + for (DeviceInt b = 1; b < radix_bits; ++b) { // sorting remaining bits item.barrier(sycl::access::fence_space::local_space); number = local_output[item.get_local_id()]; radix_digit = get_digit(number.first, digit, radix_bits); @@ -231,28 +231,28 @@ sycl::event DeviceRadixSort::sort_by_key(size_t *keys, ValueType *dat } // Initialize local buckets to zero, since they are uninitialized by default - for (size_t r = 0; r < radix; ++r) + for (DeviceInt r = 0; r < radix; ++r) local_buckets[r] = 0; item.barrier(sycl::access::fence_space::local_space); { - sycl::atomic_ref bucket_r{local_buckets[radix_digit]}; ++bucket_r; item.barrier(sycl::access::fence_space::local_space); } // Save local buckets to global memory, with one row per work-group (in column-major order) - for (size_t r = 0; r < radix; ++r) + for (DeviceInt r = 0; r < radix; ++r) global_buckets_accessor[r][workgroup] = local_buckets[r]; if(global_id < data_size) data_swap_acc[workgroup_size * workgroup + rank] = number; // save local sorting back to data // Compute local buckets offsets - size_t *begin = local_buckets.get_pointer(), *end = begin + radix, + DeviceInt *begin = local_buckets.get_pointer(), *end = begin + radix, *outBegin = local_buckets_offsets_accessor.get_pointer().get() + workgroup * radix; - sycl::joint_exclusive_scan(item.get_group(), begin, end, outBegin, sycl::plus{}); + sycl::joint_exclusive_scan(item.get_group(), begin, end, outBegin, sycl::plus<>{}); }); }); // Global synchronization to make sure that all locally computed buckets have been copied to global memory @@ -264,9 +264,9 @@ sycl::event DeviceRadixSort::sort_by_key(size_t *keys, ValueType *dat cgh.parallel_for(kernel_range_, [=](sycl::nd_item<1> item) { // Compute global buckets offsets if(item.get_group_linear_id() == 0) { - size_t *begin = global_buckets_accessor.get_pointer(), *end = begin + global_buckets_accessor.size(); + DeviceInt *begin = global_buckets_accessor.get_pointer(), *end = begin + global_buckets_accessor.size(); sycl::joint_exclusive_scan(item.get_group(), begin, end, - global_buckets_offsets_accessor.get_pointer(), sycl::plus{}); + global_buckets_offsets_accessor.get_pointer(), sycl::plus<>{}); } }); }); @@ -280,20 +280,20 @@ sycl::event 
DeviceRadixSort::sort_by_key(size_t *keys, ValueType *dat auto local_buckets_offsets_accessor = local_buckets_offsets_buffer_->get_access(cgh, sycl::read_only); cgh.parallel_for(kernel_range_, [=](sycl::nd_item<1> item) { // Compute global buckets offsets - size_t *begin = global_buckets_accessor.get_pointer(), *end = begin + global_buckets_accessor.size(); + DeviceInt *begin = global_buckets_accessor.get_pointer(), *end = begin + global_buckets_accessor.size(); sycl::joint_exclusive_scan(item.get_group(), begin, end, - global_buckets_offsets_accessor.get_pointer(), sycl::plus{}); + global_buckets_offsets_accessor.get_pointer(), sycl::plus<>{}); // Mask only relevant indexes. All max_keys added to homogenize the computations // should be owned by work-items with global_id >= data_size if(item.get_global_id() < data_size) { // Retrieve position and sorted data from swap memory - const size_t rank = item.get_local_id(), workgroup = item.get_group_linear_id(); + const DeviceInt rank = item.get_local_id(), workgroup = item.get_group_linear_id(); const SortablePair number = data_swap_acc[workgroup_size * workgroup + rank]; - const size_t radix_digit = get_digit(number.first, digit, radix_bits); + const DeviceInt radix_digit = get_digit(number.first, digit, radix_bits); // Compute sorted position based on global and local buckets - const size_t data_offset = global_buckets_offsets_accessor[radix_digit][workgroup] + rank - + const DeviceInt data_offset = global_buckets_offsets_accessor[radix_digit][workgroup] + rank - local_buckets_offsets_accessor[workgroup][radix_digit]; // Copy to original data pointers diff --git a/src/shared/particles/particle_sorting.h b/src/shared/particles/particle_sorting.h index c822d3864b..4e9c1e79e3 100644 --- a/src/shared/particles/particle_sorting.h +++ b/src/shared/particles/particle_sorting.h @@ -202,34 +202,34 @@ namespace SPH template class DeviceRadixSort { - using SortablePair = std::pair; + using SortablePair = std::pair; public: /** Get the number of bits corresponding to the d-th digit of key, * with each digit composed of a number of bits equal to radix_bits */ - static inline size_t get_digit(size_t key, size_t d, size_t radix_bits); + static inline DeviceInt get_digit(DeviceInt key, DeviceInt d, DeviceInt radix_bits); /** Get the b-th bit of key */ - static inline size_t get_bit(size_t key, size_t b); + static inline DeviceInt get_bit(DeviceInt key, DeviceInt b); /** Group operation to compute rank, i.e. sorted position, of each work-item based on one bit. * All work-items with bit = 0 will be on the first half of the ranking, while work-items with * bit = 1 will be placed on the second half. 
*/ - static SYCL_EXTERNAL size_t split_count(bool bit, sycl::nd_item<1> &item); + static SYCL_EXTERNAL DeviceInt split_count(bool bit, sycl::nd_item<1> &item); - size_t find_max_element(const size_t *data, size_t size, size_t identity); + DeviceInt find_max_element(const DeviceInt *data, DeviceInt size, DeviceInt identity); - void resize(size_t data_size, size_t radix_bits, size_t workgroup_size); + void resize(DeviceInt data_size, DeviceInt radix_bits, DeviceInt workgroup_size); sycl::event sort_by_key( - size_t *keys, ValueType *data, size_t data_size, sycl::queue &queue, - size_t workgroup_size = 256, size_t radix_bits = 4); + DeviceInt *keys, ValueType *data, DeviceInt data_size, sycl::queue &queue, + DeviceInt workgroup_size = 256, DeviceInt radix_bits = 4); private: bool uniform_case_masking_; - size_t data_size_ = 0, radix_bits_, workgroup_size_, + DeviceInt data_size_ = 0, radix_bits_, workgroup_size_, uniform_global_size_, workgroups_, radix_; sycl::nd_range<1> kernel_range_{0,0}; - std::unique_ptr> global_buckets_, global_buckets_offsets_, local_buckets_offsets_buffer_; + std::unique_ptr> global_buckets_, global_buckets_offsets_, local_buckets_offsets_buffer_; std::unique_ptr> data_swap_buffer_, uniform_extra_swap_buffer_; }; @@ -261,14 +261,14 @@ struct swapParticleDeviceDataValue : 0; StdVec *> variables_ptr = std::get::value>(device_variables); - for (size_t i = 0; i != variables_ptr.size(); ++i) + for (DeviceInt i = 0; i != variables_ptr.size(); ++i) variables_.push_back(variables_ptr[i]->VariableAddress()); tmp_variable_ = allocateDeviceData(size_single_variable_); } /** Initialize swapping for extra variable not registered in DeviceVariables */ - void init(VariableType *single_variable, size_t size_variable) + void init(VariableType *single_variable, DeviceInt size_variable) { size_variables_ = 1; size_single_variable_ = size_variable; @@ -279,20 +279,20 @@ struct swapParticleDeviceDataValue ~swapParticleDeviceDataValue() { freeDeviceData(tmp_variable_); } /** Reorder initialized variables based on a permutation indicating where each element should be placed */ - sycl::event operator()(const size_t *index_permutation, size_t size) const + sycl::event operator()(const DeviceInt *index_permutation, DeviceInt size) const { if (size_variables_) assert(size == size_single_variable_ && "Provided index permutation has different size than device variables"); sycl::event sort_event{}; - for (size_t var = 0; var < size_variables_; ++var) + for (DeviceInt var = 0; var < size_variables_; ++var) { auto *sortable_device_variable = variables_[var]; auto tmp_copy_event = execution::executionQueue.getQueue().copy(sortable_device_variable, tmp_variable_, size, sort_event); sort_event = execution::executionQueue.getQueue().parallel_for(execution::executionQueue.getUniformNdRange(size), tmp_copy_event, [=, tmp_variable = tmp_variable_](sycl::nd_item<1> item) { - size_t i = item.get_global_id(); + DeviceInt i = item.get_global_id(); if (i < size) sortable_device_variable[i] = tmp_variable[index_permutation[i]]; }); @@ -301,7 +301,7 @@ struct swapParticleDeviceDataValue } private: - size_t size_variables_, // number of device variables registered, + DeviceInt size_variables_, // number of device variables registered, size_single_variable_; // size of each variable std::vector variables_; // variables to be sorted VariableType *tmp_variable_ = nullptr; // temporary memory to execute sorting in parallel @@ -347,12 +347,12 @@ class MoveSortableParticleDeviceData BaseParticles &base_particles_; bool 
initialized_swap_variables_; DeviceDataAssembleOperation swap_particle_device_data_value_; - swapParticleDeviceDataValue swap_unsorted_id_; + swapParticleDeviceDataValue swap_unsorted_id_; public: explicit MoveSortableParticleDeviceData(BaseParticles &base_particles); /** Reorder sortable device variables based on sorting permutation*/ - void operator()(size_t *index_permutation, size_t size); + void operator()(DeviceInt *index_permutation, DeviceInt size); }; /** @@ -363,12 +363,12 @@ class ParticleSorting { protected: BaseParticles &base_particles_; - size_t *index_sorting_device_variables_; + DeviceInt *index_sorting_device_variables_; /** using pointer because it is constructed after particles. */ SwapSortableParticleData swap_sortable_particle_data_; MoveSortableParticleDeviceData move_sortable_particle_device_data_; - DeviceRadixSort device_radix_sorting; + DeviceRadixSort device_radix_sorting; CompareParticleSequence compare_; tbb::interface9::QuickSortParticleRange< size_t *, CompareParticleSequence, SwapSortableParticleData> @@ -389,8 +389,8 @@ class ParticleSorting { this->sortingParticleData(begin, size); } - template <> - void sortingParticleData(size_t *begin, size_t size, execution::ParallelSYCLDevicePolicy execution_policy); + + void sortingParticleData(DeviceInt *begin, DeviceInt size, execution::ParallelSYCLDevicePolicy execution_policy); /** update the reference of sorted data from unsorted data */ virtual void updateSortedId(); From 497dfb150d237f30fab10b1938b5349e5a255b3f Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Fri, 24 May 2024 12:27:41 +0200 Subject: [PATCH 3/8] Set generic spir64 target for SYCL execution when specific target is not specified --- CMakeLists.txt | 3 ++- .../particle_dynamics/execution_unit/execution_queue.hpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e0aaa51624..f625fc83d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,7 +101,8 @@ endif() if(SPHINXSYS_USE_SYCL) if(NOT SPHINXSYS_SYCL_TARGETS) - set(SPHINXSYS_SYCL_TARGETS nvptx64-nvidia-cuda) + message("-- Using default SYCL device target, select specific target with option SPHINXSYS_SYCL_TARGETS") + set(SPHINXSYS_SYCL_TARGETS spir64) endif() message("-- SPHinXsys use SYCL with targets ${SPHINXSYS_SYCL_TARGETS}") target_compile_options(sphinxsys_core INTERFACE -fsycl -fsycl-targets=${SPHINXSYS_SYCL_TARGETS} -Wno-unknown-cuda-version) diff --git a/src/shared/particle_dynamics/execution_unit/execution_queue.hpp b/src/shared/particle_dynamics/execution_unit/execution_queue.hpp index 32eb449a00..513d055b38 100644 --- a/src/shared/particle_dynamics/execution_unit/execution_queue.hpp +++ b/src/shared/particle_dynamics/execution_unit/execution_queue.hpp @@ -16,7 +16,7 @@ namespace SPH::execution { sycl::queue &getQueue() { if(!sycl_queue) - sycl_queue = std::make_unique(sycl::gpu_selector_v); + sycl_queue = std::make_unique(sycl::default_selector_v); return *sycl_queue; } From 96a70c09b5982d4bb672709362f6366b532303b6 Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Fri, 24 May 2024 12:28:12 +0200 Subject: [PATCH 4/8] Increased default workgroup size --- src/shared/particle_dynamics/execution_unit/execution_queue.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shared/particle_dynamics/execution_unit/execution_queue.hpp b/src/shared/particle_dynamics/execution_unit/execution_queue.hpp index 513d055b38..3fe7994a1e 100644 --- a/src/shared/particle_dynamics/execution_unit/execution_queue.hpp +++ 
b/src/shared/particle_dynamics/execution_unit/execution_queue.hpp @@ -38,7 +38,7 @@ namespace SPH::execution { } private: - ExecutionQueue() : work_group_size(32), sycl_queue() {} + ExecutionQueue() : work_group_size(128), sycl_queue() {} std::size_t work_group_size; std::unique_ptr sycl_queue; From 6fbb22942f8cee53bf3cc66ee328994f100a5488 Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Sat, 13 Jul 2024 19:58:19 +0200 Subject: [PATCH 5/8] get_pointer substituted with get_multi_pointer for better compliance with SYCL 2020 --- src/shared/particles/particle_sorting.cpp | 35 +++++++++++++++++------ src/shared/particles/particle_sorting.h | 3 +- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/shared/particles/particle_sorting.cpp b/src/shared/particles/particle_sorting.cpp index 67d57967db..b5032878ce 100644 --- a/src/shared/particles/particle_sorting.cpp +++ b/src/shared/particles/particle_sorting.cpp @@ -5,6 +5,9 @@ #include "base_particles.h" #include "cell_linked_list.h" +#include +#include + namespace SPH { //=================================================================================================// @@ -94,6 +97,16 @@ void ParticleSorting::sortingParticleData(DeviceInt *begin, DeviceInt size, exec device_radix_sorting.sort_by_key(begin, index_sorting_device_variables_, size, execution::executionQueue.getQueue(), 512, 4).wait(); + /*execution::executionQueue.getQueue().parallel_for(execution::executionQueue.getUniformNdRange(size), + [=, index_sorting = index_sorting_device_variables_](sycl::nd_item<1> it) { + DeviceInt i = it.get_global_id(0); + if(i < size) + index_sorting[i] = i; + }).wait(); + + oneapi::dpl::sort_by_key(oneapi::dpl::execution::make_device_policy(execution::executionQueue.getQueue()), + begin, begin + size, index_sorting_device_variables_);*/ + move_sortable_particle_device_data_(index_sorting_device_variables_, size); updateSortedDeviceId(); @@ -169,7 +182,7 @@ void DeviceRadixSort::resize(DeviceInt data_size, DeviceInt radix_bit global_buckets_ = std::make_unique>(buckets_column_major_range); // Each entry contains global number of digits with the same and lower values global_buckets_offsets_ = std::make_unique>(buckets_column_major_range); - local_buckets_offsets_buffer_ = std::make_unique>(buckets_row_major_range); // save state of local accessor + local_buckets_offsets_buffer_ = std::make_unique>(buckets_row_major_range.size()); // save state of local accessor data_swap_buffer_ = std::make_unique>(uniform_global_size_); // temporary memory for swapping // Keep extra values to be swapped when kernel range has been made uniform uniform_extra_swap_buffer_ = std::make_unique>(uniform_global_size_ - data_size); @@ -250,8 +263,8 @@ sycl::event DeviceRadixSort::sort_by_key(DeviceInt *keys, ValueType * data_swap_acc[workgroup_size * workgroup + rank] = number; // save local sorting back to data // Compute local buckets offsets - DeviceInt *begin = local_buckets.get_pointer(), *end = begin + radix, - *outBegin = local_buckets_offsets_accessor.get_pointer().get() + workgroup * radix; + DeviceInt *begin = local_buckets.get_multi_ptr().get(), *end = begin + radix, + *outBegin = local_buckets_offsets_accessor.get_multi_ptr().get() + workgroup * radix; sycl::joint_exclusive_scan(item.get_group(), begin, end, outBegin, sycl::plus<>{}); }); }); @@ -264,9 +277,11 @@ sycl::event DeviceRadixSort::sort_by_key(DeviceInt *keys, ValueType * cgh.parallel_for(kernel_range_, [=](sycl::nd_item<1> item) { // Compute global buckets offsets 
if(item.get_group_linear_id() == 0) { - DeviceInt *begin = global_buckets_accessor.get_pointer(), *end = begin + global_buckets_accessor.size(); + const DeviceInt *begin = global_buckets_accessor.get_multi_ptr().get(), + *end = begin + global_buckets_accessor.size(); sycl::joint_exclusive_scan(item.get_group(), begin, end, - global_buckets_offsets_accessor.get_pointer(), sycl::plus<>{}); + global_buckets_offsets_accessor.get_multi_ptr().get(), + sycl::plus<>{}); } }); }); @@ -278,11 +293,13 @@ sycl::event DeviceRadixSort::sort_by_key(DeviceInt *keys, ValueType * auto global_buckets_accessor = global_buckets_->get_access(cgh, sycl::read_only); auto global_buckets_offsets_accessor = global_buckets_offsets_->get_access(cgh, sycl::read_write); auto local_buckets_offsets_accessor = local_buckets_offsets_buffer_->get_access(cgh, sycl::read_only); - cgh.parallel_for(kernel_range_, [=](sycl::nd_item<1> item) { + cgh.parallel_for(kernel_range_, [=, radix = radix_](sycl::nd_item<1> item) { // Compute global buckets offsets - DeviceInt *begin = global_buckets_accessor.get_pointer(), *end = begin + global_buckets_accessor.size(); + const DeviceInt *begin = global_buckets_accessor.get_multi_ptr().get(), + *end = begin + global_buckets_accessor.size(); sycl::joint_exclusive_scan(item.get_group(), begin, end, - global_buckets_offsets_accessor.get_pointer(), sycl::plus<>{}); + global_buckets_offsets_accessor.get_multi_ptr().get(), + sycl::plus<>{}); // Mask only relevant indexes. All max_keys added to homogenize the computations // should be owned by work-items with global_id >= data_size @@ -294,7 +311,7 @@ sycl::event DeviceRadixSort::sort_by_key(DeviceInt *keys, ValueType * // Compute sorted position based on global and local buckets const DeviceInt data_offset = global_buckets_offsets_accessor[radix_digit][workgroup] + rank - - local_buckets_offsets_accessor[workgroup][radix_digit]; + local_buckets_offsets_accessor[workgroup * radix + radix_digit]; // Copy to original data pointers keys[data_offset] = number.first; diff --git a/src/shared/particles/particle_sorting.h b/src/shared/particles/particle_sorting.h index 4e9c1e79e3..18ffce506b 100644 --- a/src/shared/particles/particle_sorting.h +++ b/src/shared/particles/particle_sorting.h @@ -229,7 +229,8 @@ class DeviceRadixSort DeviceInt data_size_ = 0, radix_bits_, workgroup_size_, uniform_global_size_, workgroups_, radix_; sycl::nd_range<1> kernel_range_{0,0}; - std::unique_ptr> global_buckets_, global_buckets_offsets_, local_buckets_offsets_buffer_; + std::unique_ptr> global_buckets_, global_buckets_offsets_; + std::unique_ptr> local_buckets_offsets_buffer_; std::unique_ptr> data_swap_buffer_, uniform_extra_swap_buffer_; }; From 73a16cbc781a52574a8105505f3480e652d8a1ac Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Sat, 13 Jul 2024 20:01:12 +0200 Subject: [PATCH 6/8] Improved CellLinkedListKernel data structure, with better memory access for contiguous cells --- src/shared/meshes/cell_linked_list.cpp | 87 +++++++++++++++++--------- src/shared/meshes/cell_linked_list.h | 17 +++-- 2 files changed, 65 insertions(+), 39 deletions(-) diff --git a/src/shared/meshes/cell_linked_list.cpp b/src/shared/meshes/cell_linked_list.cpp index 0fb29d4d60..32b704f1d2 100644 --- a/src/shared/meshes/cell_linked_list.cpp +++ b/src/shared/meshes/cell_linked_list.cpp @@ -39,8 +39,8 @@ CellLinkedListKernel::CellLinkedListKernel(const DeviceVecd &meshLowerBound, Dev : total_real_particles_(0), list_data_pos_(nullptr), // will be initialized at first UpdateCellLists 
execution mesh_lower_bound_(allocateDeviceData(1)), grid_spacing_(allocateDeviceData(1)), all_grid_points_(allocateDeviceData(1)), all_cells_(allocateDeviceData(1)), - index_list_(nullptr), index_head_list_(allocateDeviceData(VecdFoldProduct(allCells))), - index_head_list_size_(VecdFoldProduct(allCells)) + linear_cell_size_(VecdFoldProduct(allCells)), offset_cell_size_(allocateDeviceData(linear_cell_size_+1)), + curr_cell_size_(allocateDeviceData(linear_cell_size_)) { copyDataToDevice(&meshLowerBound, mesh_lower_bound_, 1) .add(copyDataToDevice(&gridSpacing, grid_spacing_, 1)) @@ -58,50 +58,77 @@ CellLinkedListKernel::~CellLinkedListKernel() execution::ExecutionEvent CellLinkedListKernel::clearCellLists() { - // Only clear head list, since index list does not depend on its previous values - return std::move(copyDataToDevice(static_cast(0), index_head_list_, index_head_list_size_)); + return std::move(copyDataToDevice(static_cast(0), offset_cell_size_, linear_cell_size_+1) + .add(copyDataToDevice(static_cast(0), curr_cell_size_, linear_cell_size_))); } execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseParticles &base_particles) { - if(!index_list_) // initialize list data with base_particles at first execution + if(!list_data_pos_) // initialize list data with base_particles at first execution { total_real_particles_ = base_particles.total_real_particles_; list_data_pos_ = base_particles.getDeviceVariableByName("Position"); - index_list_ = allocateDeviceData(total_real_particles_); + particle_id_list_ = allocateDeviceData(total_real_particles_); } auto clear_event = clearCellLists(); - auto *pos_n = base_particles.getDeviceVariableByName("Position"); - size_t total_real_particles = base_particles.total_real_particles_; + + DeviceInt total_real_particles = base_particles.total_real_particles_; + auto determine_cells_size_event = executionQueue.getQueue().submit( + [&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_, + all_cells = all_cells_, offset_cell_size = offset_cell_size_, pos_n = list_data_pos_](sycl::handler &cgh) + { + cgh.depends_on(clear_event.getEventList()); + cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), + [=](sycl::nd_item<1> item) { + if(item.get_global_id() < total_real_particles) + { + const DeviceInt index_i = item.get_global_id(); + const auto cell_index = CellIndexFromPosition(pos_n[index_i], *mesh_lower_bound, + *grid_spacing, *all_grid_points); + const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells); + sycl::atomic_ref + atomic_cell_size(offset_cell_size[linear_cell_index]); + ++atomic_cell_size; + } + }); }); + + auto exclusive_scan_cells_size_event = executionQueue.getQueue().submit( + [&, offset_cell_size = offset_cell_size_, num_cells = linear_cell_size_ + 1](sycl::handler &cgh) { + cgh.depends_on(determine_cells_size_event); + cgh.parallel_for(executionQueue.getUniformNdRange(executionQueue.getWorkGroupSize()), + [=](sycl::nd_item<1> item) { + if(item.get_group_linear_id() == 0) { + sycl::joint_exclusive_scan(item.get_group(), offset_cell_size, + offset_cell_size + num_cells, + offset_cell_size, sycl::plus<>{}); + } + }); + }); + return executionQueue.getQueue().submit( - [&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_, - all_cells = all_cells_, index_list = index_list_, index_head_list = index_head_list_](sycl::handler &cgh) - { - cgh.depends_on(clear_event.getEventList()); 
- cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), [=](sycl::nd_item<1> item) - { - const size_t index_i = item.get_global_id(); - if(index_i < total_real_particles) + [&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_, + all_cells = all_cells_, particle_id_list = particle_id_list_, offset_cell_size = offset_cell_size_, + curr_cell_size = curr_cell_size_, pos_n = list_data_pos_] (sycl::handler &cgh) + { + cgh.depends_on(exclusive_scan_cells_size_event); + cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), + [=](sycl::nd_item<1> item) + { + if(item.get_global_id() < total_real_particles) { + const DeviceInt index_i = item.get_global_id(); const auto cell_index = CellIndexFromPosition(pos_n[index_i], *mesh_lower_bound, *grid_spacing, *all_grid_points); const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells); sycl::atomic_ref - atomic_head_list(index_head_list[linear_cell_index]); - /* - * Insert index at the head of the list, the index previously at the top is then - * used as the next one pointed by the new index. - * Indices values are increased by 1 to let 0 be the value that indicates list termination. - * If the cell list is empty (i.e. head == 0) then head will point to the new index and the - * new index will point to 0 (i.e. the new index will be the first and last element of the cell list). - * index_head_list[linear_cell_index] = index_i+1 ---> index_list[index_i] = 0 - * Since the cell list order is not relevant, memory_order_relaxed will only ensure that each cell of - * index_head_list gets a new index one at a time. - */ - index_list[index_i] = atomic_head_list.exchange(index_i + 1); - } }); - }); + atomic_current_cell_size(curr_cell_size[linear_cell_index]); + + particle_id_list[offset_cell_size[linear_cell_index] + atomic_current_cell_size++] = index_i; + } + }); + }); } //=================================================================================================// DeviceInt *CellLinkedListKernel::computingSequence(BaseParticles &baseParticles) diff --git a/src/shared/meshes/cell_linked_list.h b/src/shared/meshes/cell_linked_list.h index 244b62174a..359a32b4ae 100644 --- a/src/shared/meshes/cell_linked_list.h +++ b/src/shared/meshes/cell_linked_list.h @@ -105,12 +105,10 @@ class CellLinkedListKernel { VecdMin(*all_cells_, DeviceArrayi{target_cell_index + search_depth + DeviceInt{1}}), [&](DeviceArrayi&& cell_index) { const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells_); - DeviceInt index_j = index_head_list_[linear_cell_index]; - // Cell list ends when index_j == 0, if index_j is already zero then cell is empty. - while(index_j--) { // abbreviates while(index_j != 0) { index_j -= 1; ... } - function(pos_i, index_j, list_data_pos_[index_j]); - index_j = index_list_[index_j]; - } + // Since offset_cell_size_ has linear_cell_size_+1 elements, no boundary checks are needed. 
+ // offset_cell_size_[0] == 0 && offset_cell_size_[linear_cell_size_] == total_real_particles_ + for(DeviceInt j = offset_cell_size_[linear_cell_index]; j < offset_cell_size_[linear_cell_index+1]; ++j) + function(pos_i, particle_id_list_[j], list_data_pos_[particle_id_list_[j]]); }); } @@ -163,9 +161,10 @@ class CellLinkedListKernel { DeviceReal *grid_spacing_; DeviceArrayi *all_grid_points_, *all_cells_; - DeviceInt* index_list_; - DeviceInt* index_head_list_; - DeviceInt index_head_list_size_; + DeviceInt linear_cell_size_; // Total number of cells + DeviceInt *offset_cell_size_, // Identify begin and end offsets of each cell in particle_id_list_ + *curr_cell_size_; // Auxiliary memory to compute particle_id_list_ starting from offset_cell_size_ + DeviceInt *particle_id_list_; // List of particles ordered based on their cells }; From bcb7192447033dd7a1026f6d48e43842e2826f26 Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Sat, 13 Jul 2024 20:23:17 +0200 Subject: [PATCH 7/8] Use default workgroup size in test_2d_dambreak_sycl --- tests/2d_examples/test_2d_dambreak_sycl/dambreak_sycl.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/2d_examples/test_2d_dambreak_sycl/dambreak_sycl.cpp b/tests/2d_examples/test_2d_dambreak_sycl/dambreak_sycl.cpp index bfbad58768..a396d80c70 100644 --- a/tests/2d_examples/test_2d_dambreak_sycl/dambreak_sycl.cpp +++ b/tests/2d_examples/test_2d_dambreak_sycl/dambreak_sycl.cpp @@ -103,8 +103,6 @@ int main(int ac, char *av[]) ReduceDynamics fluid_acoustic_time_step(water_block); water_block.getBaseParticles().copyToDeviceMemory().wait(); - executionQueue.setWorkGroupSize(16); - //---------------------------------------------------------------------- // Define the methods for I/O operations, observations // and regression tests of the simulation. 
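[Editor's note] The cell-linked-list rework in patch 6/8 above is effectively a counting sort over cells: first count how many particles fall in each cell (atomic increments into offset_cell_size_), then turn the counts into begin offsets with an exclusive scan, and finally scatter each particle id into its cell's contiguous slice of particle_id_list_ using a per-cell cursor (curr_cell_size_). The following host-side sketch is illustrative only: the member names mirror the patch, but the function itself is hypothetical and the plain loops stand in for the three SYCL kernels (atomic count, joint_exclusive_scan, atomic scatter).

    // Host-side sketch of the layout built on the device by patch 6/8.
    #include <cstdint>
    #include <vector>

    using DeviceInt = std::uint32_t;

    void build_cell_lists(const std::vector<DeviceInt> &cell_of_particle, // linear cell index per particle
                          DeviceInt linear_cell_size,                     // total number of cells
                          std::vector<DeviceInt> &offset_cell_size,       // size: linear_cell_size + 1
                          std::vector<DeviceInt> &particle_id_list)       // size: number of particles
    {
        const DeviceInt n = static_cast<DeviceInt>(cell_of_particle.size());
        offset_cell_size.assign(linear_cell_size + 1, 0);
        particle_id_list.assign(n, 0);

        // 1) Count particles per cell (atomic_ref increment on the device).
        for (DeviceInt i = 0; i < n; ++i)
            ++offset_cell_size[cell_of_particle[i]];

        // 2) Exclusive scan: offset_cell_size[c] becomes the first slot of cell c,
        //    and offset_cell_size[linear_cell_size] ends up equal to n.
        DeviceInt running = 0;
        for (DeviceInt c = 0; c <= linear_cell_size; ++c)
        {
            const DeviceInt count = offset_cell_size[c];
            offset_cell_size[c] = running;
            running += count;
        }

        // 3) Scatter: place each particle id in its cell's slice, advancing a per-cell
        //    cursor (curr_cell_size in the patch, incremented atomically on the device).
        std::vector<DeviceInt> curr_cell_size(linear_cell_size, 0);
        for (DeviceInt i = 0; i < n; ++i)
        {
            const DeviceInt c = cell_of_particle[i];
            particle_id_list[offset_cell_size[c] + curr_cell_size[c]++] = i;
        }
        // Neighbour iteration then reads the contiguous range
        // [offset_cell_size[c], offset_cell_size[c + 1]) of particle_id_list,
        // which is why no boundary checks are needed in forEachNeighbor.
    }
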
From 8981ddd7c303e07c81205ae54407e11633d5c4d5 Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Sat, 13 Jul 2024 20:39:00 +0200 Subject: [PATCH 8/8] Option to use oneDPL sorting with macro SPHINXSYS_USE_ONEDPL_SORTING --- src/shared/particles/particle_sorting.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/shared/particles/particle_sorting.cpp b/src/shared/particles/particle_sorting.cpp index b5032878ce..3464fa3fd9 100644 --- a/src/shared/particles/particle_sorting.cpp +++ b/src/shared/particles/particle_sorting.cpp @@ -5,8 +5,10 @@ #include "base_particles.h" #include "cell_linked_list.h" +#ifdef SPHINXSYS_USE_ONEDPL_SORTING #include #include +#endif namespace SPH { @@ -95,9 +97,10 @@ void ParticleSorting::sortingParticleData(DeviceInt *begin, DeviceInt size, exec if (!index_sorting_device_variables_) index_sorting_device_variables_ = allocateDeviceData(size); +#ifndef SPHINXSYS_USE_ONEDPL_SORTING device_radix_sorting.sort_by_key(begin, index_sorting_device_variables_, size, execution::executionQueue.getQueue(), 512, 4).wait(); - - /*execution::executionQueue.getQueue().parallel_for(execution::executionQueue.getUniformNdRange(size), +#else + execution::executionQueue.getQueue().parallel_for(execution::executionQueue.getUniformNdRange(size), [=, index_sorting = index_sorting_device_variables_](sycl::nd_item<1> it) { DeviceInt i = it.get_global_id(0); if(i < size) @@ -105,7 +108,8 @@ void ParticleSorting::sortingParticleData(DeviceInt *begin, DeviceInt size, exec }).wait(); oneapi::dpl::sort_by_key(oneapi::dpl::execution::make_device_policy(execution::executionQueue.getQueue()), - begin, begin + size, index_sorting_device_variables_);*/ + begin, begin + size, index_sorting_device_variables_); +#endif move_sortable_particle_device_data_(index_sorting_device_variables_, size);
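[Editor's note] For the macro-guarded oneDPL path enabled by SPHINXSYS_USE_ONEDPL_SORTING, the following standalone sketch shows the intended usage pattern: the permutation array is initialized to the identity, then sort_by_key reorders keys and permutation together so the permutation can be applied to all other sortable particle data. The function name is illustrative and the code assumes keys and permutation are USM allocations accessible from the queue.

    // Minimal sketch of the oneDPL sort-by-key path (assumptions noted above).
    #include <oneapi/dpl/execution>
    #include <oneapi/dpl/algorithm>
    #include <sycl/sycl.hpp>
    #include <cstdint>

    using DeviceInt = std::uint32_t;

    void sort_keys_with_permutation(sycl::queue &queue, DeviceInt *keys,
                                    DeviceInt *permutation, DeviceInt size)
    {
        // Fill the permutation with 0 .. size-1, as the patch does in its parallel_for.
        queue.parallel_for(sycl::range<1>(size), [=](sycl::id<1> i) {
            permutation[i] = static_cast<DeviceInt>(i[0]);
        }).wait();

        // Sort the keys in place and apply the same reordering to the permutation.
        oneapi::dpl::sort_by_key(oneapi::dpl::execution::make_device_policy(queue),
                                 keys, keys + size, permutation);
    }
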