From e11c19fb656d1ed49a7bb1c9e9e6ecfba5637787 Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Fri, 24 May 2024 12:01:36 +0200 Subject: [PATCH 1/8] Added device-specific u_int32 type (DeviceInt) that replaces size_t for kernel executions --- src/shared/common/base_data_type.h | 1 + src/shared/common/vector_functions.h | 8 ++ src/shared/materials/riemann_solver.h | 53 ++++++++++- .../materials/weakly_compressible_fluid.h | 4 +- src/shared/meshes/cell_linked_list.cpp | 14 +-- src/shared/meshes/cell_linked_list.h | 24 ++--- .../external_force/external_force.h | 2 +- .../fluid_dynamics/density_summation.h | 12 +-- .../fluid_dynamics/fluid_integration.h | 88 +++++++++++-------- .../fluid_dynamics/fluid_integration.hpp | 18 ++-- .../fluid_dynamics/fluid_time_step.h | 2 +- .../particle_dynamics/particle_functors.h | 2 +- src/shared/particles/base_particles.cpp | 6 +- src/shared/particles/base_particles.h | 6 +- src/shared/particles/base_particles.hpp | 2 +- 15 files changed, 156 insertions(+), 86 deletions(-) diff --git a/src/shared/common/base_data_type.h b/src/shared/common/base_data_type.h index c776dfb440..6b864371fd 100644 --- a/src/shared/common/base_data_type.h +++ b/src/shared/common/base_data_type.h @@ -93,6 +93,7 @@ using Rotation2d = Eigen::Rotation2D; using Rotation3d = Eigen::AngleAxis; /** Device data types. */ using DeviceReal = Real; +using DeviceInt = u_int32_t; using DeviceVec2d = Vec2d; using DeviceVec3d = Vec3d; using DeviceArray2i = Array2i; diff --git a/src/shared/common/vector_functions.h b/src/shared/common/vector_functions.h index c995838918..5f34c33081 100644 --- a/src/shared/common/vector_functions.h +++ b/src/shared/common/vector_functions.h @@ -202,6 +202,10 @@ inline execution::ExecutionEvent copyDataToDevice(const Real* host, DeviceReal* return transformAndCopyDataToDevice(host, device, size, [](Real val) { return static_cast(val); }); } +inline execution::ExecutionEvent copyDataToDevice(const size_t* host, DeviceInt* device, std::size_t size) { + return transformAndCopyDataToDevice(host, device, size, [](size_t val) { return static_cast(val); }); +} + template inline execution::ExecutionEvent copyDataToDevice(const T& value, T* device, std::size_t size) { return execution::executionQueue.getQueue().fill(device, value, size); @@ -234,5 +238,9 @@ inline execution::ExecutionEvent copyDataFromDevice(Vec3d* host, const DeviceVec inline execution::ExecutionEvent copyDataFromDevice(Real* host, const DeviceReal* device, std::size_t size) { return transformAndCopyDataFromDevice(host, device, size, [](DeviceReal val) { return static_cast(val); }); } + +inline execution::ExecutionEvent copyDataFromDevice(size_t* host, const DeviceInt* device, std::size_t size) { + return transformAndCopyDataFromDevice(host, device, size, [](DeviceInt val) { return static_cast(val); }); +} } // namespace SPH #endif // VECTOR_FUNCTIONS_H diff --git a/src/shared/materials/riemann_solver.h b/src/shared/materials/riemann_solver.h index 2b81a312b0..94582d96ff 100644 --- a/src/shared/materials/riemann_solver.h +++ b/src/shared/materials/riemann_solver.h @@ -48,6 +48,23 @@ struct FluidStateOut FluidStateOut(Real rho, Vecd vel, Real p) : vel_(vel), rho_(rho), p_(p){}; }; +class NoRiemannSolverKernel +{ + public: + NoRiemannSolverKernel(DeviceReal rho0c0_i, DeviceReal rho0c0_j, DeviceReal inv_rho0c0_sum) + : rho0c0_i_(rho0c0_i), rho0c0_j_(rho0c0_j), inv_rho0c0_sum_(inv_rho0c0_sum) {} + + DeviceReal DissipativePJump(const DeviceReal &) const { return DeviceReal(0.0); } + DeviceReal 
DissipativeUJump(const DeviceReal &) const { return DeviceReal(0.0); } + + DeviceReal AverageP(const DeviceReal &p_i, const DeviceReal &p_j) + { + return (p_i * rho0c0_j_ + p_j * rho0c0_i_) * inv_rho0c0_sum_; + }; + protected: + DeviceReal rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_; +}; + /** * @struct NoRiemannSolver * @brief Central difference scheme without Riemann flux. @@ -60,7 +77,8 @@ class NoRiemannSolver : rho0_i_(fluid_i.ReferenceDensity()), rho0_j_(fluid_j.ReferenceDensity()), c0_i_(fluid_i.ReferenceSoundSpeed()), c0_j_(fluid_j.ReferenceSoundSpeed()), rho0c0_i_(rho0_i_ * c0_i_), rho0c0_j_(rho0_j_ * c0_j_), - inv_rho0c0_sum_(1.0 / (rho0c0_i_ + rho0c0_j_)){}; + inv_rho0c0_sum_(1.0 / (rho0c0_i_ + rho0c0_j_)), + device_kernel(rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_) {}; Real DissipativePJump(const Real &u_jump) const { return 0.0; } Real DissipativeUJump(const Real &p_jump) const { return 0.0; } @@ -77,6 +95,32 @@ class NoRiemannSolver Real rho0_i_, rho0_j_; Real c0_i_, c0_j_; Real rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_; + + public: + execution::DeviceImplementation device_kernel; +}; + +class AcousticRiemannSolverKernel : public NoRiemannSolverKernel +{ + public: + template + AcousticRiemannSolverKernel(DeviceReal inv_rho0c0_ave, DeviceReal rho0c0_geo_ave, + DeviceReal inv_c_ave, DeviceReal limiter_coeff, Args&& ...baseArgs) + : NoRiemannSolverKernel(std::forward(baseArgs)...), inv_rho0c0_ave_(inv_rho0c0_ave), + rho0c0_geo_ave_(rho0c0_geo_ave), inv_c_ave_(inv_c_ave), limiter_coeff_(limiter_coeff) {} + + DeviceReal DissipativePJump(const DeviceReal &u_jump) const { + return rho0c0_geo_ave_ * u_jump * sycl::min(limiter_coeff_ * sycl::max(u_jump * inv_c_ave_, DeviceReal(0)), DeviceReal(1)); + } + + DeviceReal DissipativeUJump(const DeviceReal &p_jump) const { + return p_jump * inv_rho0c0_ave_; + } + + protected: + DeviceReal inv_rho0c0_ave_, rho0c0_geo_ave_; + DeviceReal inv_c_ave_; + DeviceReal limiter_coeff_; }; class AcousticRiemannSolver : public NoRiemannSolver @@ -88,7 +132,9 @@ class AcousticRiemannSolver : public NoRiemannSolver inv_rho0c0_ave_(2.0 * inv_rho0c0_sum_), rho0c0_geo_ave_(2.0 * rho0c0_i_ * rho0c0_j_ * inv_rho0c0_sum_), inv_c_ave_(0.5 * (rho0_i_ + rho0_j_) * inv_rho0c0_ave_), - limiter_coeff_(limiter_coeff){}; + limiter_coeff_(limiter_coeff), + device_kernel(inv_rho0c0_ave_, rho0c0_geo_ave_, inv_c_ave_, limiter_coeff_, + rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_) {}; Real DissipativePJump(const Real &u_jump) const { return rho0c0_geo_ave_ * u_jump * SMIN(limiter_coeff_ * SMAX(u_jump * inv_c_ave_, Real(0)), Real(1)); } @@ -101,6 +147,9 @@ class AcousticRiemannSolver : public NoRiemannSolver Real inv_rho0c0_ave_, rho0c0_geo_ave_; Real inv_c_ave_; Real limiter_coeff_; + + public: + execution::DeviceImplementation device_kernel; }; class DissipativeRiemannSolver : public AcousticRiemannSolver diff --git a/src/shared/materials/weakly_compressible_fluid.h b/src/shared/materials/weakly_compressible_fluid.h index b3418df974..d1543db2d9 100644 --- a/src/shared/materials/weakly_compressible_fluid.h +++ b/src/shared/materials/weakly_compressible_fluid.h @@ -44,10 +44,10 @@ class WeaklyCompressibleFluidKernel inline DeviceReal getPressure(DeviceReal rho) const { return p0_ * (rho / rho0_ - static_cast(1.0)); } inline DeviceReal getSoundSpeed(DeviceReal p = 0.0, DeviceReal rho = 1.0) const { return c0_; } inline DeviceReal ReferenceDensity() const { return rho0_; } - inline DeviceReal ReferenceSoundSpeed() { return c0_; }; + inline DeviceReal ReferenceSoundSpeed() const { return c0_; }; 
private: - DeviceReal p0_, c0_, rho0_; + const DeviceReal p0_, c0_, rho0_; }; /** diff --git a/src/shared/meshes/cell_linked_list.cpp b/src/shared/meshes/cell_linked_list.cpp index ee6eeab92b..0fb29d4d60 100644 --- a/src/shared/meshes/cell_linked_list.cpp +++ b/src/shared/meshes/cell_linked_list.cpp @@ -39,7 +39,7 @@ CellLinkedListKernel::CellLinkedListKernel(const DeviceVecd &meshLowerBound, Dev : total_real_particles_(0), list_data_pos_(nullptr), // will be initialized at first UpdateCellLists execution mesh_lower_bound_(allocateDeviceData(1)), grid_spacing_(allocateDeviceData(1)), all_grid_points_(allocateDeviceData(1)), all_cells_(allocateDeviceData(1)), - index_list_(nullptr), index_head_list_(allocateDeviceData(VecdFoldProduct(allCells))), + index_list_(nullptr), index_head_list_(allocateDeviceData(VecdFoldProduct(allCells))), index_head_list_size_(VecdFoldProduct(allCells)) { copyDataToDevice(&meshLowerBound, mesh_lower_bound_, 1) @@ -59,7 +59,7 @@ CellLinkedListKernel::~CellLinkedListKernel() execution::ExecutionEvent CellLinkedListKernel::clearCellLists() { // Only clear head list, since index list does not depend on its previous values - return std::move(copyDataToDevice(static_cast(0), index_head_list_, index_head_list_size_)); + return std::move(copyDataToDevice(static_cast(0), index_head_list_, index_head_list_size_)); } execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseParticles &base_particles) @@ -68,7 +68,7 @@ execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseP { total_real_particles_ = base_particles.total_real_particles_; list_data_pos_ = base_particles.getDeviceVariableByName("Position"); - index_list_ = allocateDeviceData(total_real_particles_); + index_list_ = allocateDeviceData(total_real_particles_); } auto clear_event = clearCellLists(); auto *pos_n = base_particles.getDeviceVariableByName("Position"); @@ -86,7 +86,7 @@ execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseP const auto cell_index = CellIndexFromPosition(pos_n[index_i], *mesh_lower_bound, *grid_spacing, *all_grid_points); const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells); - sycl::atomic_ref atomic_head_list(index_head_list[linear_cell_index]); /* @@ -104,18 +104,18 @@ execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseP }); } //=================================================================================================// -size_t *CellLinkedListKernel::computingSequence(BaseParticles &baseParticles) +DeviceInt *CellLinkedListKernel::computingSequence(BaseParticles &baseParticles) { auto *pos = baseParticles.getDeviceVariableByName("Position"); auto *sequence = baseParticles.sequence_device_; - size_t total_real_particles = baseParticles.total_real_particles_; + DeviceInt total_real_particles = baseParticles.total_real_particles_; executionQueue.getQueue().submit( [&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_](sycl::handler &cgh) { cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), [=](sycl::nd_item<1> item) { - size_t i = item.get_global_id(); + DeviceInt i = item.get_global_id(); if(i < total_real_particles) sequence[i] = BaseMesh::transferMeshIndexToMortonOrder( CellIndexFromPosition(pos[i], *mesh_lower_bound, diff --git a/src/shared/meshes/cell_linked_list.h b/src/shared/meshes/cell_linked_list.h index 1527030f4f..244b62174a 100644 --- a/src/shared/meshes/cell_linked_list.h 
+++ b/src/shared/meshes/cell_linked_list.h @@ -93,19 +93,19 @@ class CellLinkedListKernel { execution::ExecutionEvent UpdateCellLists(const BaseParticles &base_particles); template - void forEachNeighbor(size_t index_i, const DeviceVecd *self_position, + void forEachNeighbor(DeviceInt index_i, const DeviceVecd *self_position, const FunctionOnEach &function) const { const DeviceVecd pos_i = self_position[index_i]; - const size_t search_depth = 1; + const DeviceInt search_depth = 1; const auto target_cell_index = CellIndexFromPosition(pos_i, *mesh_lower_bound_, *grid_spacing_, *all_grid_points_); mesh_for_each_array( VecdMax(DeviceArrayi{0}, DeviceArrayi{target_cell_index - search_depth}), - VecdMin(*all_cells_, DeviceArrayi{target_cell_index + search_depth + 1}), + VecdMin(*all_cells_, DeviceArrayi{target_cell_index + search_depth + DeviceInt{1}}), [&](DeviceArrayi&& cell_index) { const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells_); - size_t index_j = index_head_list_[linear_cell_index]; + DeviceInt index_j = index_head_list_[linear_cell_index]; // Cell list ends when index_j == 0, if index_j is already zero then cell is empty. while(index_j--) { // abbreviates while(index_j != 0) { index_j -= 1; ... } function(pos_i, index_j, list_data_pos_[index_j]); @@ -121,7 +121,7 @@ class CellLinkedListKernel { } - size_t* computingSequence(BaseParticles &baseParticles); + DeviceInt* computingSequence(BaseParticles &baseParticles); template static inline DeviceArrayi CellIndexFromPosition(const sycl::vec &position, const sycl::vec& mesh_lower_bound, @@ -130,7 +130,7 @@ class CellLinkedListKernel { return sycl::min( sycl::max( sycl::floor((position - mesh_lower_bound) / grid_spacing) - .template convert(), DeviceArrayi{0}), + .template convert(), DeviceArrayi{0}), all_grid_points - DeviceArrayi{2}); } @@ -143,12 +143,12 @@ class CellLinkedListKernel { return VecdMin(VecdMax(pos_floor, DeviceArrayi{0}), DeviceArrayi{all_grid_points - DeviceArrayi{2}}); } - static inline size_t transferCellIndexTo1D(const DeviceArray2i &cell_index, const DeviceArray2i &all_cells) + static inline DeviceInt transferCellIndexTo1D(const DeviceArray2i &cell_index, const DeviceArray2i &all_cells) { return cell_index[0] * all_cells[1] + cell_index[1]; } - static inline size_t transferCellIndexTo1D(const DeviceArray3i &cell_index, const DeviceArray3i &all_cells) + static inline DeviceInt transferCellIndexTo1D(const DeviceArray3i &cell_index, const DeviceArray3i &all_cells) { return cell_index[0] * all_cells[1] * all_cells[2] + cell_index[1] * all_cells[2] + @@ -156,16 +156,16 @@ class CellLinkedListKernel { } private: - size_t total_real_particles_; + DeviceInt total_real_particles_; DeviceVecd* list_data_pos_; DeviceVecd *mesh_lower_bound_; DeviceReal *grid_spacing_; DeviceArrayi *all_grid_points_, *all_cells_; - size_t* index_list_; - size_t* index_head_list_; - size_t index_head_list_size_; + DeviceInt* index_list_; + DeviceInt* index_head_list_; + DeviceInt index_head_list_size_; }; diff --git a/src/shared/particle_dynamics/external_force/external_force.h b/src/shared/particle_dynamics/external_force/external_force.h index 173b25acbd..0e30aaecf1 100644 --- a/src/shared/particle_dynamics/external_force/external_force.h +++ b/src/shared/particle_dynamics/external_force/external_force.h @@ -75,7 +75,7 @@ class Gravity template>, std::is_same, std::true_type>>> - DeviceVecd InducedAcceleration(const Vecd &position = VecdZero()) const { + DeviceVecd InducedAcceleration(const Vec &position = VecdZero()) 
const { return global_acceleration_device_; } template> : public DensitySummationKernel>(std::forward(baseArgs)...) {} - void interaction(size_t index_i, Real dt = 0.0) { + void interaction(DeviceInt index_i, DeviceReal dt = 0.0) { DeviceReal sigma = W0_; const auto &neighbor_builder = *inner_neighbor_builder_; cell_linked_list_->forEachInnerNeighbor(index_i, [&](const DeviceVecd &pos_i, size_t index_j, const DeviceVecd &pos_j) @@ -118,7 +118,7 @@ class DensitySummationKernel> : public DensitySummationKernel> : public DensitySummationKernelforEachNeighbor(index_i, particles_position_, - [&](const DeviceVecd &pos_i, size_t index_j, const DeviceVecd &pos_j) + [&](const DeviceVecd &pos_i, DeviceInt index_j, const DeviceVecd &pos_j) { if(neighbor_builder.isWithinCutoff(pos_i, pos_j)) sigma += neighbor_builder.W_ij(pos_i, pos_j) * @@ -194,7 +194,7 @@ class DensitySummationKernel> : public DensitySummationKernel diff --git a/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.h b/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.h index b1b65148bb..1b1848b83a 100644 --- a/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.h +++ b/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.h @@ -84,7 +84,7 @@ class BaseIntegrationKernel { force_prior_(particles->getDeviceVariableByName("ForcePrior")) {} protected: - MaterialTypeKernel fluid_; + const MaterialTypeKernel fluid_; DeviceReal *rho_, *p_, *drho_dt_, *mass_, *Vol_; DeviceVecd *pos_, *vel_, *force_, *force_prior_; }; @@ -113,23 +113,24 @@ template class Integration1stHalfKernel, RiemannSolverType, MaterialType> : public BaseIntegrationKernel { public: - Integration1stHalfKernel(BaseInnerRelation& inner_relation, BaseParticles* particles) - : BaseIntegrationKernel(particles), - correction_(particles), riemann_solver_(this->fluid_, this->fluid_), + Integration1stHalfKernel(BaseInnerRelation& inner_relation, BaseParticles* particles, + const RiemannSolverType &riemann_solver) + : BaseIntegrationKernel(particles), correction_(particles), + riemann_solver_(*riemann_solver.device_kernel.get_ptr()), cell_linked_list_(inner_relation.getInnerCellLinkedListDevice()), inner_neighbor_builder_(inner_relation.getInnerNeighborBuilderDevice()) {} - void initialization(size_t index_i, Real dt = 0.0) { - this->rho_[index_i] += this->drho_dt_[index_i] * dt * 0.5; + void initialization(DeviceInt index_i, DeviceReal dt = 0.0) { + this->rho_[index_i] += this->drho_dt_[index_i] * dt * DeviceReal(0.5); this->p_[index_i] = this->fluid_.getPressure(this->rho_[index_i]); - this->pos_[index_i] += this->vel_[index_i] * dt * 0.5; + this->pos_[index_i] += this->vel_[index_i] * dt * DeviceReal(0.5); } - void update(size_t index_i, Real dt = 0.0) { + void update(DeviceInt index_i, DeviceReal dt = 0.0) { this->vel_[index_i] += (this->force_prior_[index_i] + this->force_[index_i]) / this->mass_[index_i] * dt; } - void interaction(size_t index_i, Real dt = 0.0) { + void interaction(DeviceInt index_i, DeviceReal dt = 0.0) { auto force = VecdZero(); DeviceReal rho_dissipation(0); const auto &pressure_i = this->p_[index_i]; @@ -152,10 +153,12 @@ class Integration1stHalfKernel, RiemannSolverType, MaterialType> : publi } protected: - RiemannSolverType riemann_solver_; - NoKernelCorrection correction_; - CellLinkedListKernel* cell_linked_list_; - NeighborBuilderInnerKernel* inner_neighbor_builder_; + using RiemannSolverTypeKernel = typename decltype(RiemannSolverType::device_kernel)::KernelType; + + const RiemannSolverTypeKernel 
riemann_solver_; + const NoKernelCorrection correction_; + const CellLinkedListKernel* cell_linked_list_; + const NeighborBuilderInnerKernel* inner_neighbor_builder_; }; @@ -208,16 +211,16 @@ class Integration1stHalfKernel, RiemannSolverType, MaterialType> { public: template - Integration1stHalfKernel(const BaseContactRelation& contact_relation, - BaseParticles* particles, BaseArgs&& ...args) + Integration1stHalfKernel(const BaseContactRelation& contact_relation, BaseParticles* particles, + const RiemannSolverType& riemann_solver, BaseArgs&& ...args) : BaseIntegrationWithWallKernel(particles, std::forward(args)...), - correction_(particles), riemann_solver_(this->fluid_, this->fluid_), + correction_(particles), riemann_solver_(*riemann_solver.device_kernel.get_ptr()), contact_bodies_size_(contact_relation.contact_bodies_.size()), contact_cell_linked_lists_(contact_relation.getContactCellLinkedListsDevice()), contact_neighbor_builders_(contact_relation.getContactNeighborBuilderDevice()), particles_position_(contact_relation.base_particles_.getDeviceVariableByName("Position")) {} - void interaction(size_t index_i, Real dt = 0.0) { + void interaction(DeviceInt index_i, DeviceReal dt = 0.0) { DeviceVecd force = VecdZero(); DeviceReal rho_dissipation{0}; const DeviceVecd &force_prior_i = this->force_prior_[index_i]; @@ -226,14 +229,14 @@ class Integration1stHalfKernel, RiemannSolverType, MaterialType> &pressure_i{this->p_[index_i]}, &rho_i{this->rho_[index_i]}; const auto correction_i = correction_(index_i); - for (size_t k = 0; k < contact_bodies_size_; ++k) + for (auto k = 0; k < contact_bodies_size_; ++k) { const DeviceVecd* force_ave_k = this->wall_force_ave_[k]; const DeviceReal* wall_mass_k = this->wall_mass_[k]; const DeviceReal* wall_Vol_k = this->wall_Vol_[k]; const auto& neighbor_builder = *contact_neighbor_builders_[k]; contact_cell_linked_lists_[k]->forEachNeighbor(index_i, particles_position_, - [&](const DeviceVecd &pos_i, size_t index_j, const DeviceVecd &pos_j) + [&](const DeviceVecd &pos_i, DeviceInt index_j, const DeviceVecd &pos_j) { if (neighbor_builder.isWithinCutoff(pos_i, pos_j)) { @@ -253,12 +256,14 @@ class Integration1stHalfKernel, RiemannSolverType, MaterialType> } protected: - NoKernelCorrection correction_; - RiemannSolverType riemann_solver_; - size_t contact_bodies_size_; + using RiemannSolverTypeKernel = typename decltype(RiemannSolverType::device_kernel)::KernelType; + + const NoKernelCorrection correction_; + const RiemannSolverTypeKernel riemann_solver_; + const DeviceInt contact_bodies_size_; CellLinkedListKernel **contact_cell_linked_lists_; NeighborBuilderContactKernel **contact_neighbor_builders_; - DeviceVecd *particles_position_; + const DeviceVecd *particles_position_; }; template @@ -316,21 +321,22 @@ template class Integration2ndHalfKernel, RiemannSolverType, MaterialType> : public BaseIntegrationKernel { public: - Integration2ndHalfKernel(BaseInnerRelation& inner_relation, BaseParticles* particles) + Integration2ndHalfKernel(BaseInnerRelation& inner_relation, BaseParticles* particles, + const RiemannSolverType &riemann_solver) : BaseIntegrationKernel(particles), - riemann_solver_(this->fluid_, this->fluid_), + riemann_solver_(*riemann_solver.device_kernel.get_ptr()), cell_linked_list_(inner_relation.getInnerCellLinkedListDevice()), inner_neighbor_builder_(inner_relation.getInnerNeighborBuilderDevice()) {} - void initialization(size_t index_i, Real dt = 0.0) { - this->pos_[index_i] += this->vel_[index_i] * dt * 0.5; + void initialization(DeviceInt 
index_i, DeviceReal dt = 0.0) { + this->pos_[index_i] += this->vel_[index_i] * dt * DeviceReal(0.5); } - void update(size_t index_i, Real dt = 0.0) { - this->rho_[index_i] += this->drho_dt_[index_i] * dt * 0.5; + void update(DeviceInt index_i, DeviceReal dt = 0.0) { + this->rho_[index_i] += this->drho_dt_[index_i] * dt * DeviceReal(0.5); } - void interaction(size_t index_i, Real dt = 0.0) + void interaction(DeviceInt index_i, DeviceReal dt = 0.0) { DeviceReal density_change_rate(0); auto p_dissipation = VecdZero(); @@ -353,7 +359,9 @@ class Integration2ndHalfKernel, RiemannSolverType, MaterialType> : publi } protected: - RiemannSolverType riemann_solver_; + using RiemannSolverTypeKernel = typename decltype(RiemannSolverType::device_kernel)::KernelType; + + const RiemannSolverTypeKernel riemann_solver_; CellLinkedListKernel* cell_linked_list_; NeighborBuilderInnerKernel* inner_neighbor_builder_; }; @@ -388,28 +396,28 @@ class Integration2ndHalfKernel, RiemannSolverType, MaterialType> { public: template - Integration2ndHalfKernel(const BaseContactRelation& contact_relation, - BaseParticles* particles, BaseArgs&& ...args) + Integration2ndHalfKernel(const BaseContactRelation& contact_relation, BaseParticles* particles, + const RiemannSolverType &riemann_solver, BaseArgs&& ...args) : BaseIntegrationWithWallKernel(particles, std::forward(args)...), - riemann_solver_(this->fluid_, this->fluid_), + riemann_solver_(*riemann_solver.device_kernel.get_ptr()), contact_bodies_size_(contact_relation.contact_bodies_.size()), contact_cell_linked_lists_(contact_relation.getContactCellLinkedListsDevice()), contact_neighbor_builders_(contact_relation.getContactNeighborBuilderDevice()), particles_position_(contact_relation.base_particles_.getDeviceVariableByName("Position")) {} - void interaction(size_t index_i, Real dt = 0.0) { + void interaction(DeviceInt index_i, DeviceReal dt = 0.0) { DeviceReal density_change_rate{0}; auto p_dissipation = VecdZero(); const DeviceVecd vel_i = this->vel_[index_i]; const DeviceReal mass_i = this->mass_[index_i]; - for (size_t k = 0; k < contact_bodies_size_; ++k) + for (auto k = 0; k < contact_bodies_size_; ++k) { const DeviceReal *wall_Vol_k = this->wall_Vol_[k]; const DeviceVecd *vel_ave_k = this->wall_vel_ave_[k]; const DeviceVecd *n_k = this->wall_n_[k]; const auto& neighbor_builder = *contact_neighbor_builders_[k]; contact_cell_linked_lists_[k]->forEachNeighbor(index_i, particles_position_, - [&](const DeviceVecd &pos_i, size_t index_j, const DeviceVecd &pos_j) + [&](const DeviceVecd &pos_i, DeviceInt index_j, const DeviceVecd &pos_j) { if (neighbor_builder.isWithinCutoff(pos_i, pos_j)) { @@ -428,8 +436,10 @@ class Integration2ndHalfKernel, RiemannSolverType, MaterialType> } protected: - RiemannSolverType riemann_solver_; - size_t contact_bodies_size_; + using RiemannSolverTypeKernel = typename decltype(RiemannSolverType::device_kernel)::KernelType; + + const RiemannSolverTypeKernel riemann_solver_; + const DeviceInt contact_bodies_size_; CellLinkedListKernel **contact_cell_linked_lists_; NeighborBuilderContactKernel **contact_neighbor_builders_; DeviceVecd *particles_position_; diff --git a/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.hpp b/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.hpp index 2c568a7102..2bd28e5364 100644 --- a/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.hpp +++ b/src/shared/particle_dynamics/fluid_dynamics/fluid_integration.hpp @@ -23,7 +23,7 @@ Integration1stHalf, RiemannSolverType, 
KernelCorrectionType>:: Integration1stHalf(BaseInnerRelation &inner_relation) : BaseIntegration(inner_relation), correction_(particles_), riemann_solver_(fluid_, fluid_), - device_kernel(inner_relation, particles_) + device_kernel(inner_relation, particles_, riemann_solver_) { static_assert(std::is_base_of::value, "KernelCorrection is not the base of KernelCorrectionType!"); @@ -84,9 +84,10 @@ Integration1stHalf, RiemannSolverType, KernelCorrectionType>:: Integration1stHalf(BaseContactRelation &wall_contact_relation) : BaseIntegrationWithWall(wall_contact_relation), correction_(particles_), riemann_solver_(fluid_, fluid_), - device_kernel(wall_contact_relation, particles_, wall_mass_device_.data(), - wall_Vol_device_.data(), wall_vel_ave_device_.data(), - wall_force_ave_device_.data(), wall_n_device_.data()) {} + device_kernel(wall_contact_relation, particles_, riemann_solver_, + wall_mass_device_.data(), wall_Vol_device_.data(), + wall_vel_ave_device_.data(), wall_force_ave_device_.data(), + wall_n_device_.data()) {} //=================================================================================================// template void Integration1stHalf, RiemannSolverType, KernelCorrectionType>::interaction(size_t index_i, Real dt) @@ -165,7 +166,7 @@ Integration2ndHalf, RiemannSolverType>:: Integration2ndHalf(BaseInnerRelation &inner_relation) : BaseIntegration(inner_relation), riemann_solver_(this->fluid_, this->fluid_), mass_(particles_->mass_), Vol_(particles_->Vol_), - device_kernel(inner_relation, particles_) {} + device_kernel(inner_relation, particles_, riemann_solver_) {} //=================================================================================================// template void Integration2ndHalf, RiemannSolverType>::initialization(size_t index_i, Real dt) @@ -204,9 +205,10 @@ Integration2ndHalf, RiemannSolverType>:: Integration2ndHalf(BaseContactRelation &wall_contact_relation) : BaseIntegrationWithWall(wall_contact_relation), riemann_solver_(this->fluid_, this->fluid_), - device_kernel(wall_contact_relation, particles_, wall_mass_device_.data(), - wall_Vol_device_.data(), wall_vel_ave_device_.data(), - wall_force_ave_device_.data(), wall_n_device_.data()) {} + device_kernel(wall_contact_relation, particles_, riemann_solver_, + wall_mass_device_.data(), wall_Vol_device_.data(), + wall_vel_ave_device_.data(), wall_force_ave_device_.data(), + wall_n_device_.data()) {} //=================================================================================================// template void Integration2ndHalf, RiemannSolverType>::interaction(size_t index_i, Real dt) diff --git a/src/shared/particle_dynamics/fluid_dynamics/fluid_time_step.h b/src/shared/particle_dynamics/fluid_dynamics/fluid_time_step.h index dcef86335d..61f8f356f5 100644 --- a/src/shared/particle_dynamics/fluid_dynamics/fluid_time_step.h +++ b/src/shared/particle_dynamics/fluid_dynamics/fluid_time_step.h @@ -55,7 +55,7 @@ class AcousticTimeStepSizeKernel { DeviceReal reduce(size_t index_i, Real dt = 0.0) const { DeviceReal acceleration_scale = 4.0 * smoothing_length_min_ * - (force_[index_i] + force_prior_[index_i]).norm() / mass_[index_i]; + VecdNorm(DeviceVecd(force_[index_i] + force_prior_[index_i])) / mass_[index_i]; return sycl::fmax(fluid_.getSoundSpeed(p_[index_i], rho_[index_i]) + VecdNorm(vel_[index_i]), acceleration_scale); } diff --git a/src/shared/particle_dynamics/particle_functors.h b/src/shared/particle_dynamics/particle_functors.h index aa239991aa..20b48e0461 100644 --- 
a/src/shared/particle_dynamics/particle_functors.h +++ b/src/shared/particle_dynamics/particle_functors.h @@ -206,7 +206,7 @@ class NoKernelCorrection : public KernelCorrection { public: NoKernelCorrection(BaseParticles *particles) : KernelCorrection(){}; - Real operator()(size_t index_i) + Real operator()(size_t index_i) const { return 1.0; }; diff --git a/src/shared/particles/base_particles.cpp b/src/shared/particles/base_particles.cpp index 375bf6140d..459ab7e888 100644 --- a/src/shared/particles/base_particles.cpp +++ b/src/shared/particles/base_particles.cpp @@ -344,9 +344,9 @@ void BaseParticles::readFromXmlForReloadParticle(std::string &filefullpath) } //=================================================================================================// void BaseParticles::registerDeviceMemory() { - unsorted_id_device_ = allocateDeviceData(total_real_particles_); - sorted_id_device_ = allocateDeviceData(total_real_particles_); - sequence_device_ = allocateDeviceData(total_real_particles_); + unsorted_id_device_ = allocateDeviceData(total_real_particles_); + sorted_id_device_ = allocateDeviceData(total_real_particles_); + sequence_device_ = allocateDeviceData(total_real_particles_); registerDeviceVariable("Position", total_real_particles_, pos_.data()); registerDeviceVariable("Velocity", total_real_particles_, vel_.data()); registerDeviceVariable("Force", total_real_particles_, force_.data()); diff --git a/src/shared/particles/base_particles.h b/src/shared/particles/base_particles.h index 3e91d5d889..1604efbc07 100644 --- a/src/shared/particles/base_particles.h +++ b/src/shared/particles/base_particles.h @@ -189,9 +189,9 @@ class BaseParticles ParticleData sortable_data_; ParticleVariables sortable_variables_; ParticleSorting particle_sorting_; - size_t* unsorted_id_device_ = nullptr; /**< copy of unsorted ids stored inside device. */ - size_t* sorted_id_device_ = nullptr; /**< copy of sorted ids stored inside device. */ - size_t* sequence_device_ = nullptr; /**< the sequence referred for sorting within device execution. */ + DeviceInt* unsorted_id_device_ = nullptr; /**< copy of unsorted ids stored inside device. */ + DeviceInt* sorted_id_device_ = nullptr; /**< copy of sorted ids stored inside device. */ + DeviceInt* sequence_device_ = nullptr; /**< the sequence referred for sorting within device execution. 
*/ DeviceVariables sortable_device_variables_; template diff --git a/src/shared/particles/base_particles.hpp b/src/shared/particles/base_particles.hpp index a3396a4257..4ffb585777 100644 --- a/src/shared/particles/base_particles.hpp +++ b/src/shared/particles/base_particles.hpp @@ -215,7 +215,7 @@ void BaseParticles::registerSortableVariable(const std::string &variable_name) template void BaseParticles::sortParticles(SequenceMethod &sequence_method, ExecutionPolicy execution_policy) { - size_t* sequence = sequence_method.computingSequence(*this); + auto* sequence = sequence_method.computingSequence(*this); particle_sorting_.sortingParticleData(sequence, total_real_particles_, execution_policy); } //=================================================================================================// From 858736bff5b27f7233c09d3085cde76c6bd95d59 Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Fri, 24 May 2024 12:16:56 +0200 Subject: [PATCH 2/8] Use DeviceInt for device sorting methods --- src/shared/particles/particle_sorting.cpp | 88 +++++++++++------------ src/shared/particles/particle_sorting.h | 44 ++++++------ 2 files changed, 66 insertions(+), 66 deletions(-) diff --git a/src/shared/particles/particle_sorting.cpp b/src/shared/particles/particle_sorting.cpp index 01b8bab46f..67d57967db 100644 --- a/src/shared/particles/particle_sorting.cpp +++ b/src/shared/particles/particle_sorting.cpp @@ -27,7 +27,7 @@ void SwapSortableParticleData::operator()(size_t *a, size_t *b) MoveSortableParticleDeviceData::MoveSortableParticleDeviceData(BaseParticles &base_particles) : base_particles_(base_particles), initialized_swap_variables_(false) {} //=================================================================================================// -void MoveSortableParticleDeviceData::operator()(size_t *index_permutation, size_t size) +void MoveSortableParticleDeviceData::operator()(DeviceInt *index_permutation, DeviceInt size) { if (!initialized_swap_variables_) { @@ -73,25 +73,24 @@ void ParticleSorting::updateSortedId() //=================================================================================================// void ParticleSorting::updateSortedDeviceId() const { - size_t *unsorted_id_device = base_particles_.unsorted_id_device_; - size_t *sorted_id_device = base_particles_.sorted_id_device_; - size_t total_real_particles = base_particles_.total_real_particles_; + DeviceInt *unsorted_id_device = base_particles_.unsorted_id_device_; + DeviceInt *sorted_id_device = base_particles_.sorted_id_device_; + DeviceInt total_real_particles = base_particles_.total_real_particles_; execution::executionQueue.getQueue() .parallel_for(execution::executionQueue.getUniformNdRange(total_real_particles), [=](sycl::nd_item<1> item) { - size_t i = item.get_global_id(); + DeviceInt i = item.get_global_id(); if (i < total_real_particles) sorted_id_device[unsorted_id_device[i]] = i; }) .wait(); } //=================================================================================================// -template <> -void ParticleSorting::sortingParticleData(size_t *begin, size_t size, execution::ParallelSYCLDevicePolicy execution_policy) +void ParticleSorting::sortingParticleData(DeviceInt *begin, DeviceInt size, execution::ParallelSYCLDevicePolicy execution_policy) { if (!index_sorting_device_variables_) - index_sorting_device_variables_ = allocateDeviceData(size); + index_sorting_device_variables_ = allocateDeviceData(size); device_radix_sorting.sort_by_key(begin, index_sorting_device_variables_, size, 
execution::executionQueue.getQueue(), 512, 4).wait(); @@ -101,17 +100,17 @@ void ParticleSorting::sortingParticleData(size_t *begin, size_t size, execution: } //=================================================================================================// template -SYCL_EXTERNAL size_t DeviceRadixSort::split_count(bool bit, sycl::nd_item<1> &item) +SYCL_EXTERNAL DeviceInt DeviceRadixSort::split_count(bool bit, sycl::nd_item<1> &item) { const auto group_range = item.get_local_range().size(); - const size_t id = item.get_local_id(); + const DeviceInt id = item.get_local_id(); // Count 1s held by lower-numbered work-items and current work-item - size_t true_before = sycl::inclusive_scan_over_group(item.get_group(), static_cast(bit), - sycl::plus{}); // prefix sum over group + DeviceInt true_before = sycl::inclusive_scan_over_group(item.get_group(), static_cast(bit), + sycl::plus<>{}); // prefix sum over group // Total number of 0s - size_t false_totals = sycl::group_broadcast(item.get_group(), group_range - true_before, + DeviceInt false_totals = sycl::group_broadcast(item.get_group(), group_range - true_before, group_range - 1); // Return work-item's rank @@ -119,24 +118,24 @@ SYCL_EXTERNAL size_t DeviceRadixSort::split_count(bool bit, sycl::nd_ } //=================================================================================================// template -size_t DeviceRadixSort::get_digit(size_t key, size_t d, size_t radix_bits) +DeviceInt DeviceRadixSort::get_digit(DeviceInt key, DeviceInt d, DeviceInt radix_bits) { - return (key >> d * radix_bits) & ((1ul << radix_bits) - 1); + return (key >> d * radix_bits) & ((static_cast(1) << radix_bits) - 1); } //=================================================================================================// template -size_t DeviceRadixSort::get_bit(size_t key, size_t b) +DeviceInt DeviceRadixSort::get_bit(DeviceInt key, DeviceInt b) { return (key >> b) & 1; } //=================================================================================================// template -size_t DeviceRadixSort::find_max_element(const size_t *data, size_t size, size_t identity) +DeviceInt DeviceRadixSort::find_max_element(const DeviceInt *data, DeviceInt size, DeviceInt identity) { - size_t result = identity; + DeviceInt result = identity; auto &sycl_queue = execution::executionQueue.getQueue(); { - sycl::buffer buffer_result(&result, 1); + sycl::buffer buffer_result(&result, 1); sycl_queue.submit([&](sycl::handler &cgh) { auto reduction_operator = sycl::reduction(buffer_result, cgh, sycl::maximum<>()); @@ -151,7 +150,7 @@ size_t DeviceRadixSort::find_max_element(const size_t *data, size_t s } //=================================================================================================// template -void DeviceRadixSort::resize(size_t data_size, size_t radix_bits, size_t workgroup_size) +void DeviceRadixSort::resize(DeviceInt data_size, DeviceInt radix_bits, DeviceInt workgroup_size) { data_size_ = data_size; radix_bits_ = radix_bits; @@ -161,31 +160,32 @@ void DeviceRadixSort::resize(size_t data_size, size_t radix_bits, siz kernel_range_ = {uniform_global_size_, workgroup_size}; workgroups_ = kernel_range_.get_group_range().size(); - radix_ = 1ul << radix_bits; // radix = 2^b + radix_ = static_cast(1) << radix_bits; // radix = 2^b - sycl::range<2> buckets_column_major_range = {radix_, workgroups_}, buckets_row_major_range = {workgroups_, radix_}; + sycl::range<2> buckets_column_major_range = {static_cast(radix_), static_cast(workgroups_)}, + 
buckets_row_major_range = {static_cast(workgroups_), static_cast(radix_)}; // Each entry contains global number of digits with the same value // Column-major, so buckets offsets can be computed by just applying a scan over it - global_buckets_ = std::make_unique>(buckets_column_major_range); + global_buckets_ = std::make_unique>(buckets_column_major_range); // Each entry contains global number of digits with the same and lower values - global_buckets_offsets_ = std::make_unique>(buckets_column_major_range); - local_buckets_offsets_buffer_ = std::make_unique>(buckets_row_major_range); // save state of local accessor + global_buckets_offsets_ = std::make_unique>(buckets_column_major_range); + local_buckets_offsets_buffer_ = std::make_unique>(buckets_row_major_range); // save state of local accessor data_swap_buffer_ = std::make_unique>(uniform_global_size_); // temporary memory for swapping // Keep extra values to be swapped when kernel range has been made uniform uniform_extra_swap_buffer_ = std::make_unique>(uniform_global_size_ - data_size); } //=================================================================================================// template -sycl::event DeviceRadixSort::sort_by_key(size_t *keys, ValueType *data, size_t data_size, sycl::queue &queue, size_t workgroup_size, size_t radix_bits) +sycl::event DeviceRadixSort::sort_by_key(DeviceInt *keys, ValueType *data, DeviceInt data_size, sycl::queue &queue, DeviceInt workgroup_size, DeviceInt radix_bits) { if(data_size_ != data_size || radix_bits_ != radix_bits || workgroup_size_ != workgroup_size) resize(data_size, radix_bits, workgroup_size); // Largest key, increased by 1 if the workgroup is not homogeneous with the data vector, // the new maximum will be used for those work-items out of data range, that will then be excluded once sorted - const size_t max_key = find_max_element(keys, data_size, 0ul) + (uniform_case_masking_ ? 1 : 0); - const size_t bits_max_key = std::floor(std::log2(max_key)) + 1.0; // bits needed to represent max_key - const size_t length = max_key ? bits_max_key / radix_bits + (bits_max_key % radix_bits ? 1 : 0) : 1; // max number of radix digits + const DeviceInt max_key = find_max_element(keys, data_size, 0ul) + (uniform_case_masking_ ? 1 : 0); + const DeviceInt bits_max_key = std::floor(std::log2(max_key)) + 1.0; // bits needed to represent max_key + const DeviceInt length = max_key ? bits_max_key / radix_bits + (bits_max_key % radix_bits ? 
1 : 0) : 1; // max number of radix digits sycl::event sort_event{}; for (int digit = 0; digit < length; ++digit) @@ -195,14 +195,14 @@ sycl::event DeviceRadixSort::sort_by_key(size_t *keys, ValueType *dat { cgh.depends_on(sort_event); auto data_swap_acc = data_swap_buffer_->get_access(cgh, sycl::write_only, sycl::no_init); - auto local_buckets = sycl::local_accessor(radix_, cgh); + auto local_buckets = sycl::local_accessor(radix_, cgh); auto local_output = sycl::local_accessor(kernel_range_.get_local_range(), cgh); auto global_buckets_accessor = global_buckets_->get_access(cgh, sycl::read_write, sycl::no_init); auto local_buckets_offsets_accessor = local_buckets_offsets_buffer_->get_access(cgh, sycl::write_only, sycl::no_init); cgh.parallel_for(kernel_range_, [=, radix=radix_](sycl::nd_item<1> item) { - const size_t workgroup = item.get_group_linear_id(), + const DeviceInt workgroup = item.get_group_linear_id(), global_id = item.get_global_id(); SortablePair number; @@ -221,7 +221,7 @@ sycl::event DeviceRadixSort::sort_by_key(size_t *keys, ValueType *dat auto radix_digit = get_digit(number.first, digit, radix_bits); auto rank = split_count(get_bit(radix_digit, 0), item); // sorting first bit local_output[rank] = number; - for (size_t b = 1; b < radix_bits; ++b) { // sorting remaining bits + for (DeviceInt b = 1; b < radix_bits; ++b) { // sorting remaining bits item.barrier(sycl::access::fence_space::local_space); number = local_output[item.get_local_id()]; radix_digit = get_digit(number.first, digit, radix_bits); @@ -231,28 +231,28 @@ sycl::event DeviceRadixSort::sort_by_key(size_t *keys, ValueType *dat } // Initialize local buckets to zero, since they are uninitialized by default - for (size_t r = 0; r < radix; ++r) + for (DeviceInt r = 0; r < radix; ++r) local_buckets[r] = 0; item.barrier(sycl::access::fence_space::local_space); { - sycl::atomic_ref bucket_r{local_buckets[radix_digit]}; ++bucket_r; item.barrier(sycl::access::fence_space::local_space); } // Save local buckets to global memory, with one row per work-group (in column-major order) - for (size_t r = 0; r < radix; ++r) + for (DeviceInt r = 0; r < radix; ++r) global_buckets_accessor[r][workgroup] = local_buckets[r]; if(global_id < data_size) data_swap_acc[workgroup_size * workgroup + rank] = number; // save local sorting back to data // Compute local buckets offsets - size_t *begin = local_buckets.get_pointer(), *end = begin + radix, + DeviceInt *begin = local_buckets.get_pointer(), *end = begin + radix, *outBegin = local_buckets_offsets_accessor.get_pointer().get() + workgroup * radix; - sycl::joint_exclusive_scan(item.get_group(), begin, end, outBegin, sycl::plus{}); + sycl::joint_exclusive_scan(item.get_group(), begin, end, outBegin, sycl::plus<>{}); }); }); // Global synchronization to make sure that all locally computed buckets have been copied to global memory @@ -264,9 +264,9 @@ sycl::event DeviceRadixSort::sort_by_key(size_t *keys, ValueType *dat cgh.parallel_for(kernel_range_, [=](sycl::nd_item<1> item) { // Compute global buckets offsets if(item.get_group_linear_id() == 0) { - size_t *begin = global_buckets_accessor.get_pointer(), *end = begin + global_buckets_accessor.size(); + DeviceInt *begin = global_buckets_accessor.get_pointer(), *end = begin + global_buckets_accessor.size(); sycl::joint_exclusive_scan(item.get_group(), begin, end, - global_buckets_offsets_accessor.get_pointer(), sycl::plus{}); + global_buckets_offsets_accessor.get_pointer(), sycl::plus<>{}); } }); }); @@ -280,20 +280,20 @@ sycl::event 
DeviceRadixSort::sort_by_key(size_t *keys, ValueType *dat auto local_buckets_offsets_accessor = local_buckets_offsets_buffer_->get_access(cgh, sycl::read_only); cgh.parallel_for(kernel_range_, [=](sycl::nd_item<1> item) { // Compute global buckets offsets - size_t *begin = global_buckets_accessor.get_pointer(), *end = begin + global_buckets_accessor.size(); + DeviceInt *begin = global_buckets_accessor.get_pointer(), *end = begin + global_buckets_accessor.size(); sycl::joint_exclusive_scan(item.get_group(), begin, end, - global_buckets_offsets_accessor.get_pointer(), sycl::plus{}); + global_buckets_offsets_accessor.get_pointer(), sycl::plus<>{}); // Mask only relevant indexes. All max_keys added to homogenize the computations // should be owned by work-items with global_id >= data_size if(item.get_global_id() < data_size) { // Retrieve position and sorted data from swap memory - const size_t rank = item.get_local_id(), workgroup = item.get_group_linear_id(); + const DeviceInt rank = item.get_local_id(), workgroup = item.get_group_linear_id(); const SortablePair number = data_swap_acc[workgroup_size * workgroup + rank]; - const size_t radix_digit = get_digit(number.first, digit, radix_bits); + const DeviceInt radix_digit = get_digit(number.first, digit, radix_bits); // Compute sorted position based on global and local buckets - const size_t data_offset = global_buckets_offsets_accessor[radix_digit][workgroup] + rank - + const DeviceInt data_offset = global_buckets_offsets_accessor[radix_digit][workgroup] + rank - local_buckets_offsets_accessor[workgroup][radix_digit]; // Copy to original data pointers diff --git a/src/shared/particles/particle_sorting.h b/src/shared/particles/particle_sorting.h index c822d3864b..4e9c1e79e3 100644 --- a/src/shared/particles/particle_sorting.h +++ b/src/shared/particles/particle_sorting.h @@ -202,34 +202,34 @@ namespace SPH template class DeviceRadixSort { - using SortablePair = std::pair; + using SortablePair = std::pair; public: /** Get the number of bits corresponding to the d-th digit of key, * with each digit composed of a number of bits equal to radix_bits */ - static inline size_t get_digit(size_t key, size_t d, size_t radix_bits); + static inline DeviceInt get_digit(DeviceInt key, DeviceInt d, DeviceInt radix_bits); /** Get the b-th bit of key */ - static inline size_t get_bit(size_t key, size_t b); + static inline DeviceInt get_bit(DeviceInt key, DeviceInt b); /** Group operation to compute rank, i.e. sorted position, of each work-item based on one bit. * All work-items with bit = 0 will be on the first half of the ranking, while work-items with * bit = 1 will be placed on the second half. 
*/ - static SYCL_EXTERNAL size_t split_count(bool bit, sycl::nd_item<1> &item); + static SYCL_EXTERNAL DeviceInt split_count(bool bit, sycl::nd_item<1> &item); - size_t find_max_element(const size_t *data, size_t size, size_t identity); + DeviceInt find_max_element(const DeviceInt *data, DeviceInt size, DeviceInt identity); - void resize(size_t data_size, size_t radix_bits, size_t workgroup_size); + void resize(DeviceInt data_size, DeviceInt radix_bits, DeviceInt workgroup_size); sycl::event sort_by_key( - size_t *keys, ValueType *data, size_t data_size, sycl::queue &queue, - size_t workgroup_size = 256, size_t radix_bits = 4); + DeviceInt *keys, ValueType *data, DeviceInt data_size, sycl::queue &queue, + DeviceInt workgroup_size = 256, DeviceInt radix_bits = 4); private: bool uniform_case_masking_; - size_t data_size_ = 0, radix_bits_, workgroup_size_, + DeviceInt data_size_ = 0, radix_bits_, workgroup_size_, uniform_global_size_, workgroups_, radix_; sycl::nd_range<1> kernel_range_{0,0}; - std::unique_ptr> global_buckets_, global_buckets_offsets_, local_buckets_offsets_buffer_; + std::unique_ptr> global_buckets_, global_buckets_offsets_, local_buckets_offsets_buffer_; std::unique_ptr> data_swap_buffer_, uniform_extra_swap_buffer_; }; @@ -261,14 +261,14 @@ struct swapParticleDeviceDataValue : 0; StdVec *> variables_ptr = std::get::value>(device_variables); - for (size_t i = 0; i != variables_ptr.size(); ++i) + for (DeviceInt i = 0; i != variables_ptr.size(); ++i) variables_.push_back(variables_ptr[i]->VariableAddress()); tmp_variable_ = allocateDeviceData(size_single_variable_); } /** Initialize swapping for extra variable not registered in DeviceVariables */ - void init(VariableType *single_variable, size_t size_variable) + void init(VariableType *single_variable, DeviceInt size_variable) { size_variables_ = 1; size_single_variable_ = size_variable; @@ -279,20 +279,20 @@ struct swapParticleDeviceDataValue ~swapParticleDeviceDataValue() { freeDeviceData(tmp_variable_); } /** Reorder initialized variables based on a permutation indicating where each element should be placed */ - sycl::event operator()(const size_t *index_permutation, size_t size) const + sycl::event operator()(const DeviceInt *index_permutation, DeviceInt size) const { if (size_variables_) assert(size == size_single_variable_ && "Provided index permutation has different size than device variables"); sycl::event sort_event{}; - for (size_t var = 0; var < size_variables_; ++var) + for (DeviceInt var = 0; var < size_variables_; ++var) { auto *sortable_device_variable = variables_[var]; auto tmp_copy_event = execution::executionQueue.getQueue().copy(sortable_device_variable, tmp_variable_, size, sort_event); sort_event = execution::executionQueue.getQueue().parallel_for(execution::executionQueue.getUniformNdRange(size), tmp_copy_event, [=, tmp_variable = tmp_variable_](sycl::nd_item<1> item) { - size_t i = item.get_global_id(); + DeviceInt i = item.get_global_id(); if (i < size) sortable_device_variable[i] = tmp_variable[index_permutation[i]]; }); @@ -301,7 +301,7 @@ struct swapParticleDeviceDataValue } private: - size_t size_variables_, // number of device variables registered, + DeviceInt size_variables_, // number of device variables registered, size_single_variable_; // size of each variable std::vector variables_; // variables to be sorted VariableType *tmp_variable_ = nullptr; // temporary memory to execute sorting in parallel @@ -347,12 +347,12 @@ class MoveSortableParticleDeviceData BaseParticles &base_particles_; bool 
initialized_swap_variables_; DeviceDataAssembleOperation swap_particle_device_data_value_; - swapParticleDeviceDataValue swap_unsorted_id_; + swapParticleDeviceDataValue swap_unsorted_id_; public: explicit MoveSortableParticleDeviceData(BaseParticles &base_particles); /** Reorder sortable device variables based on sorting permutation*/ - void operator()(size_t *index_permutation, size_t size); + void operator()(DeviceInt *index_permutation, DeviceInt size); }; /** @@ -363,12 +363,12 @@ class ParticleSorting { protected: BaseParticles &base_particles_; - size_t *index_sorting_device_variables_; + DeviceInt *index_sorting_device_variables_; /** using pointer because it is constructed after particles. */ SwapSortableParticleData swap_sortable_particle_data_; MoveSortableParticleDeviceData move_sortable_particle_device_data_; - DeviceRadixSort device_radix_sorting; + DeviceRadixSort device_radix_sorting; CompareParticleSequence compare_; tbb::interface9::QuickSortParticleRange< size_t *, CompareParticleSequence, SwapSortableParticleData> @@ -389,8 +389,8 @@ class ParticleSorting { this->sortingParticleData(begin, size); } - template <> - void sortingParticleData(size_t *begin, size_t size, execution::ParallelSYCLDevicePolicy execution_policy); + + void sortingParticleData(DeviceInt *begin, DeviceInt size, execution::ParallelSYCLDevicePolicy execution_policy); /** update the reference of sorted data from unsorted data */ virtual void updateSortedId(); From 497dfb150d237f30fab10b1938b5349e5a255b3f Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Fri, 24 May 2024 12:27:41 +0200 Subject: [PATCH 3/8] Set generic spir64 target for SYCL execution when specific target is not specified --- CMakeLists.txt | 3 ++- .../particle_dynamics/execution_unit/execution_queue.hpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e0aaa51624..f625fc83d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,7 +101,8 @@ endif() if(SPHINXSYS_USE_SYCL) if(NOT SPHINXSYS_SYCL_TARGETS) - set(SPHINXSYS_SYCL_TARGETS nvptx64-nvidia-cuda) + message("-- Using default SYCL device target, select specific target with option SPHINXSYS_SYCL_TARGETS") + set(SPHINXSYS_SYCL_TARGETS spir64) endif() message("-- SPHinXsys use SYCL with targets ${SPHINXSYS_SYCL_TARGETS}") target_compile_options(sphinxsys_core INTERFACE -fsycl -fsycl-targets=${SPHINXSYS_SYCL_TARGETS} -Wno-unknown-cuda-version) diff --git a/src/shared/particle_dynamics/execution_unit/execution_queue.hpp b/src/shared/particle_dynamics/execution_unit/execution_queue.hpp index 32eb449a00..513d055b38 100644 --- a/src/shared/particle_dynamics/execution_unit/execution_queue.hpp +++ b/src/shared/particle_dynamics/execution_unit/execution_queue.hpp @@ -16,7 +16,7 @@ namespace SPH::execution { sycl::queue &getQueue() { if(!sycl_queue) - sycl_queue = std::make_unique(sycl::gpu_selector_v); + sycl_queue = std::make_unique(sycl::default_selector_v); return *sycl_queue; } From 96a70c09b5982d4bb672709362f6366b532303b6 Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Fri, 24 May 2024 12:28:12 +0200 Subject: [PATCH 4/8] Increased default workgroup size --- src/shared/particle_dynamics/execution_unit/execution_queue.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shared/particle_dynamics/execution_unit/execution_queue.hpp b/src/shared/particle_dynamics/execution_unit/execution_queue.hpp index 513d055b38..3fe7994a1e 100644 --- a/src/shared/particle_dynamics/execution_unit/execution_queue.hpp +++ 
b/src/shared/particle_dynamics/execution_unit/execution_queue.hpp @@ -38,7 +38,7 @@ namespace SPH::execution { } private: - ExecutionQueue() : work_group_size(32), sycl_queue() {} + ExecutionQueue() : work_group_size(128), sycl_queue() {} std::size_t work_group_size; std::unique_ptr sycl_queue; From 6fbb22942f8cee53bf3cc66ee328994f100a5488 Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Sat, 13 Jul 2024 19:58:19 +0200 Subject: [PATCH 5/8] get_pointer substituted with get_multi_pointer for better compliance with SYCL 2020 --- src/shared/particles/particle_sorting.cpp | 35 +++++++++++++++++------ src/shared/particles/particle_sorting.h | 3 +- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/shared/particles/particle_sorting.cpp b/src/shared/particles/particle_sorting.cpp index 67d57967db..b5032878ce 100644 --- a/src/shared/particles/particle_sorting.cpp +++ b/src/shared/particles/particle_sorting.cpp @@ -5,6 +5,9 @@ #include "base_particles.h" #include "cell_linked_list.h" +#include +#include + namespace SPH { //=================================================================================================// @@ -94,6 +97,16 @@ void ParticleSorting::sortingParticleData(DeviceInt *begin, DeviceInt size, exec device_radix_sorting.sort_by_key(begin, index_sorting_device_variables_, size, execution::executionQueue.getQueue(), 512, 4).wait(); + /*execution::executionQueue.getQueue().parallel_for(execution::executionQueue.getUniformNdRange(size), + [=, index_sorting = index_sorting_device_variables_](sycl::nd_item<1> it) { + DeviceInt i = it.get_global_id(0); + if(i < size) + index_sorting[i] = i; + }).wait(); + + oneapi::dpl::sort_by_key(oneapi::dpl::execution::make_device_policy(execution::executionQueue.getQueue()), + begin, begin + size, index_sorting_device_variables_);*/ + move_sortable_particle_device_data_(index_sorting_device_variables_, size); updateSortedDeviceId(); @@ -169,7 +182,7 @@ void DeviceRadixSort::resize(DeviceInt data_size, DeviceInt radix_bit global_buckets_ = std::make_unique>(buckets_column_major_range); // Each entry contains global number of digits with the same and lower values global_buckets_offsets_ = std::make_unique>(buckets_column_major_range); - local_buckets_offsets_buffer_ = std::make_unique>(buckets_row_major_range); // save state of local accessor + local_buckets_offsets_buffer_ = std::make_unique>(buckets_row_major_range.size()); // save state of local accessor data_swap_buffer_ = std::make_unique>(uniform_global_size_); // temporary memory for swapping // Keep extra values to be swapped when kernel range has been made uniform uniform_extra_swap_buffer_ = std::make_unique>(uniform_global_size_ - data_size); @@ -250,8 +263,8 @@ sycl::event DeviceRadixSort::sort_by_key(DeviceInt *keys, ValueType * data_swap_acc[workgroup_size * workgroup + rank] = number; // save local sorting back to data // Compute local buckets offsets - DeviceInt *begin = local_buckets.get_pointer(), *end = begin + radix, - *outBegin = local_buckets_offsets_accessor.get_pointer().get() + workgroup * radix; + DeviceInt *begin = local_buckets.get_multi_ptr().get(), *end = begin + radix, + *outBegin = local_buckets_offsets_accessor.get_multi_ptr().get() + workgroup * radix; sycl::joint_exclusive_scan(item.get_group(), begin, end, outBegin, sycl::plus<>{}); }); }); @@ -264,9 +277,11 @@ sycl::event DeviceRadixSort::sort_by_key(DeviceInt *keys, ValueType * cgh.parallel_for(kernel_range_, [=](sycl::nd_item<1> item) { // Compute global buckets offsets 
if(item.get_group_linear_id() == 0) { - DeviceInt *begin = global_buckets_accessor.get_pointer(), *end = begin + global_buckets_accessor.size(); + const DeviceInt *begin = global_buckets_accessor.get_multi_ptr().get(), + *end = begin + global_buckets_accessor.size(); sycl::joint_exclusive_scan(item.get_group(), begin, end, - global_buckets_offsets_accessor.get_pointer(), sycl::plus<>{}); + global_buckets_offsets_accessor.get_multi_ptr().get(), + sycl::plus<>{}); } }); }); @@ -278,11 +293,13 @@ sycl::event DeviceRadixSort::sort_by_key(DeviceInt *keys, ValueType * auto global_buckets_accessor = global_buckets_->get_access(cgh, sycl::read_only); auto global_buckets_offsets_accessor = global_buckets_offsets_->get_access(cgh, sycl::read_write); auto local_buckets_offsets_accessor = local_buckets_offsets_buffer_->get_access(cgh, sycl::read_only); - cgh.parallel_for(kernel_range_, [=](sycl::nd_item<1> item) { + cgh.parallel_for(kernel_range_, [=, radix = radix_](sycl::nd_item<1> item) { // Compute global buckets offsets - DeviceInt *begin = global_buckets_accessor.get_pointer(), *end = begin + global_buckets_accessor.size(); + const DeviceInt *begin = global_buckets_accessor.get_multi_ptr().get(), + *end = begin + global_buckets_accessor.size(); sycl::joint_exclusive_scan(item.get_group(), begin, end, - global_buckets_offsets_accessor.get_pointer(), sycl::plus<>{}); + global_buckets_offsets_accessor.get_multi_ptr().get(), + sycl::plus<>{}); // Mask only relevant indexes. All max_keys added to homogenize the computations // should be owned by work-items with global_id >= data_size @@ -294,7 +311,7 @@ sycl::event DeviceRadixSort::sort_by_key(DeviceInt *keys, ValueType * // Compute sorted position based on global and local buckets const DeviceInt data_offset = global_buckets_offsets_accessor[radix_digit][workgroup] + rank - - local_buckets_offsets_accessor[workgroup][radix_digit]; + local_buckets_offsets_accessor[workgroup * radix + radix_digit]; // Copy to original data pointers keys[data_offset] = number.first; diff --git a/src/shared/particles/particle_sorting.h b/src/shared/particles/particle_sorting.h index 4e9c1e79e3..18ffce506b 100644 --- a/src/shared/particles/particle_sorting.h +++ b/src/shared/particles/particle_sorting.h @@ -229,7 +229,8 @@ class DeviceRadixSort DeviceInt data_size_ = 0, radix_bits_, workgroup_size_, uniform_global_size_, workgroups_, radix_; sycl::nd_range<1> kernel_range_{0,0}; - std::unique_ptr> global_buckets_, global_buckets_offsets_, local_buckets_offsets_buffer_; + std::unique_ptr> global_buckets_, global_buckets_offsets_; + std::unique_ptr> local_buckets_offsets_buffer_; std::unique_ptr> data_swap_buffer_, uniform_extra_swap_buffer_; }; From 73a16cbc781a52574a8105505f3480e652d8a1ac Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Sat, 13 Jul 2024 20:01:12 +0200 Subject: [PATCH 6/8] Improved CellLinkedListKernel data structure, with better memory access for contiguous cells --- src/shared/meshes/cell_linked_list.cpp | 87 +++++++++++++++++--------- src/shared/meshes/cell_linked_list.h | 17 +++-- 2 files changed, 65 insertions(+), 39 deletions(-) diff --git a/src/shared/meshes/cell_linked_list.cpp b/src/shared/meshes/cell_linked_list.cpp index 0fb29d4d60..32b704f1d2 100644 --- a/src/shared/meshes/cell_linked_list.cpp +++ b/src/shared/meshes/cell_linked_list.cpp @@ -39,8 +39,8 @@ CellLinkedListKernel::CellLinkedListKernel(const DeviceVecd &meshLowerBound, Dev : total_real_particles_(0), list_data_pos_(nullptr), // will be initialized at first UpdateCellLists 
execution mesh_lower_bound_(allocateDeviceData(1)), grid_spacing_(allocateDeviceData(1)), all_grid_points_(allocateDeviceData(1)), all_cells_(allocateDeviceData(1)), - index_list_(nullptr), index_head_list_(allocateDeviceData(VecdFoldProduct(allCells))), - index_head_list_size_(VecdFoldProduct(allCells)) + linear_cell_size_(VecdFoldProduct(allCells)), offset_cell_size_(allocateDeviceData(linear_cell_size_+1)), + curr_cell_size_(allocateDeviceData(linear_cell_size_)) { copyDataToDevice(&meshLowerBound, mesh_lower_bound_, 1) .add(copyDataToDevice(&gridSpacing, grid_spacing_, 1)) @@ -58,50 +58,77 @@ CellLinkedListKernel::~CellLinkedListKernel() execution::ExecutionEvent CellLinkedListKernel::clearCellLists() { - // Only clear head list, since index list does not depend on its previous values - return std::move(copyDataToDevice(static_cast(0), index_head_list_, index_head_list_size_)); + return std::move(copyDataToDevice(static_cast(0), offset_cell_size_, linear_cell_size_+1) + .add(copyDataToDevice(static_cast(0), curr_cell_size_, linear_cell_size_))); } execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseParticles &base_particles) { - if(!index_list_) // initialize list data with base_particles at first execution + if(!list_data_pos_) // initialize list data with base_particles at first execution { total_real_particles_ = base_particles.total_real_particles_; list_data_pos_ = base_particles.getDeviceVariableByName("Position"); - index_list_ = allocateDeviceData(total_real_particles_); + particle_id_list_ = allocateDeviceData(total_real_particles_); } auto clear_event = clearCellLists(); - auto *pos_n = base_particles.getDeviceVariableByName("Position"); - size_t total_real_particles = base_particles.total_real_particles_; + + DeviceInt total_real_particles = base_particles.total_real_particles_; + auto determine_cells_size_event = executionQueue.getQueue().submit( + [&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_, + all_cells = all_cells_, offset_cell_size = offset_cell_size_, pos_n = list_data_pos_](sycl::handler &cgh) + { + cgh.depends_on(clear_event.getEventList()); + cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), + [=](sycl::nd_item<1> item) { + if(item.get_global_id() < total_real_particles) + { + const DeviceInt index_i = item.get_global_id(); + const auto cell_index = CellIndexFromPosition(pos_n[index_i], *mesh_lower_bound, + *grid_spacing, *all_grid_points); + const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells); + sycl::atomic_ref + atomic_cell_size(offset_cell_size[linear_cell_index]); + ++atomic_cell_size; + } + }); }); + + auto exclusive_scan_cells_size_event = executionQueue.getQueue().submit( + [&, offset_cell_size = offset_cell_size_, num_cells = linear_cell_size_ + 1](sycl::handler &cgh) { + cgh.depends_on(determine_cells_size_event); + cgh.parallel_for(executionQueue.getUniformNdRange(executionQueue.getWorkGroupSize()), + [=](sycl::nd_item<1> item) { + if(item.get_group_linear_id() == 0) { + sycl::joint_exclusive_scan(item.get_group(), offset_cell_size, + offset_cell_size + num_cells, + offset_cell_size, sycl::plus<>{}); + } + }); + }); + return executionQueue.getQueue().submit( - [&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_, - all_cells = all_cells_, index_list = index_list_, index_head_list = index_head_list_](sycl::handler &cgh) - { - cgh.depends_on(clear_event.getEventList()); 
- cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), [=](sycl::nd_item<1> item) - { - const size_t index_i = item.get_global_id(); - if(index_i < total_real_particles) + [&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_, + all_cells = all_cells_, particle_id_list = particle_id_list_, offset_cell_size = offset_cell_size_, + curr_cell_size = curr_cell_size_, pos_n = list_data_pos_] (sycl::handler &cgh) + { + cgh.depends_on(exclusive_scan_cells_size_event); + cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), + [=](sycl::nd_item<1> item) + { + if(item.get_global_id() < total_real_particles) { + const DeviceInt index_i = item.get_global_id(); const auto cell_index = CellIndexFromPosition(pos_n[index_i], *mesh_lower_bound, *grid_spacing, *all_grid_points); const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells); sycl::atomic_ref - atomic_head_list(index_head_list[linear_cell_index]); - /* - * Insert index at the head of the list, the index previously at the top is then - * used as the next one pointed by the new index. - * Indices values are increased by 1 to let 0 be the value that indicates list termination. - * If the cell list is empty (i.e. head == 0) then head will point to the new index and the - * new index will point to 0 (i.e. the new index will be the first and last element of the cell list). - * index_head_list[linear_cell_index] = index_i+1 ---> index_list[index_i] = 0 - * Since the cell list order is not relevant, memory_order_relaxed will only ensure that each cell of - * index_head_list gets a new index one at a time. - */ - index_list[index_i] = atomic_head_list.exchange(index_i + 1); - } }); - }); + atomic_current_cell_size(curr_cell_size[linear_cell_index]); + + particle_id_list[offset_cell_size[linear_cell_index] + atomic_current_cell_size++] = index_i; + } + }); + }); } //=================================================================================================// DeviceInt *CellLinkedListKernel::computingSequence(BaseParticles &baseParticles) diff --git a/src/shared/meshes/cell_linked_list.h b/src/shared/meshes/cell_linked_list.h index 244b62174a..359a32b4ae 100644 --- a/src/shared/meshes/cell_linked_list.h +++ b/src/shared/meshes/cell_linked_list.h @@ -105,12 +105,10 @@ class CellLinkedListKernel { VecdMin(*all_cells_, DeviceArrayi{target_cell_index + search_depth + DeviceInt{1}}), [&](DeviceArrayi&& cell_index) { const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells_); - DeviceInt index_j = index_head_list_[linear_cell_index]; - // Cell list ends when index_j == 0, if index_j is already zero then cell is empty. - while(index_j--) { // abbreviates while(index_j != 0) { index_j -= 1; ... } - function(pos_i, index_j, list_data_pos_[index_j]); - index_j = index_list_[index_j]; - } + // Since offset_cell_size_ has linear_cell_size_+1 elements, no boundary checks are needed. 
+ // offset_cell_size_[0] == 0 && offset_cell_size_[linear_cell_size_] == total_real_particles_ + for(DeviceInt j = offset_cell_size_[linear_cell_index]; j < offset_cell_size_[linear_cell_index+1]; ++j) + function(pos_i, particle_id_list_[j], list_data_pos_[particle_id_list_[j]]); }); } @@ -163,9 +161,10 @@ class CellLinkedListKernel { DeviceReal *grid_spacing_; DeviceArrayi *all_grid_points_, *all_cells_; - DeviceInt* index_list_; - DeviceInt* index_head_list_; - DeviceInt index_head_list_size_; + DeviceInt linear_cell_size_; // Total number of cells + DeviceInt *offset_cell_size_, // Identify begin and end offsets of each cell in particle_id_list_ + *curr_cell_size_; // Auxiliary memory to compute particle_id_list_ starting from offset_cell_size_ + DeviceInt *particle_id_list_; // List of particles ordered based on their cells }; From bcb7192447033dd7a1026f6d48e43842e2826f26 Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Sat, 13 Jul 2024 20:23:17 +0200 Subject: [PATCH 7/8] Use default workgroup size in test_2d_dambreak_sycl --- tests/2d_examples/test_2d_dambreak_sycl/dambreak_sycl.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/2d_examples/test_2d_dambreak_sycl/dambreak_sycl.cpp b/tests/2d_examples/test_2d_dambreak_sycl/dambreak_sycl.cpp index bfbad58768..a396d80c70 100644 --- a/tests/2d_examples/test_2d_dambreak_sycl/dambreak_sycl.cpp +++ b/tests/2d_examples/test_2d_dambreak_sycl/dambreak_sycl.cpp @@ -103,8 +103,6 @@ int main(int ac, char *av[]) ReduceDynamics fluid_acoustic_time_step(water_block); water_block.getBaseParticles().copyToDeviceMemory().wait(); - executionQueue.setWorkGroupSize(16); - //---------------------------------------------------------------------- // Define the methods for I/O operations, observations // and regression tests of the simulation. 
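[Editor's note] The cell-linked-list rework in patch 6/8 above is effectively a counting sort over cells: first count how many particles fall in each cell (atomic increments into offset_cell_size_), then turn the counts into begin offsets with an exclusive scan, and finally scatter each particle id into its cell's contiguous slice of particle_id_list_ using a per-cell cursor (curr_cell_size_). The following host-side sketch is illustrative only: the member names mirror the patch, but the function itself is hypothetical and the plain loops stand in for the three SYCL kernels (atomic count, joint_exclusive_scan, atomic scatter).

    // Host-side sketch of the layout built on the device by patch 6/8.
    #include <cstdint>
    #include <vector>

    using DeviceInt = std::uint32_t;

    void build_cell_lists(const std::vector<DeviceInt> &cell_of_particle, // linear cell index per particle
                          DeviceInt linear_cell_size,                     // total number of cells
                          std::vector<DeviceInt> &offset_cell_size,       // size: linear_cell_size + 1
                          std::vector<DeviceInt> &particle_id_list)       // size: number of particles
    {
        const DeviceInt n = static_cast<DeviceInt>(cell_of_particle.size());
        offset_cell_size.assign(linear_cell_size + 1, 0);
        particle_id_list.assign(n, 0);

        // 1) Count particles per cell (atomic_ref increment on the device).
        for (DeviceInt i = 0; i < n; ++i)
            ++offset_cell_size[cell_of_particle[i]];

        // 2) Exclusive scan: offset_cell_size[c] becomes the first slot of cell c,
        //    and offset_cell_size[linear_cell_size] ends up equal to n.
        DeviceInt running = 0;
        for (DeviceInt c = 0; c <= linear_cell_size; ++c)
        {
            const DeviceInt count = offset_cell_size[c];
            offset_cell_size[c] = running;
            running += count;
        }

        // 3) Scatter: place each particle id in its cell's slice, advancing a per-cell
        //    cursor (curr_cell_size in the patch, incremented atomically on the device).
        std::vector<DeviceInt> curr_cell_size(linear_cell_size, 0);
        for (DeviceInt i = 0; i < n; ++i)
        {
            const DeviceInt c = cell_of_particle[i];
            particle_id_list[offset_cell_size[c] + curr_cell_size[c]++] = i;
        }
        // Neighbour iteration then reads the contiguous range
        // [offset_cell_size[c], offset_cell_size[c + 1]) of particle_id_list,
        // which is why no boundary checks are needed in forEachNeighbor.
    }
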
From 8981ddd7c303e07c81205ae54407e11633d5c4d5 Mon Sep 17 00:00:00 2001 From: Alberto Guarnieri Date: Sat, 13 Jul 2024 20:39:00 +0200 Subject: [PATCH 8/8] Option to use oneDPL sorting with macro SPHINXSYS_USE_ONEDPL_SORTING --- src/shared/particles/particle_sorting.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/shared/particles/particle_sorting.cpp b/src/shared/particles/particle_sorting.cpp index b5032878ce..3464fa3fd9 100644 --- a/src/shared/particles/particle_sorting.cpp +++ b/src/shared/particles/particle_sorting.cpp @@ -5,8 +5,10 @@ #include "base_particles.h" #include "cell_linked_list.h" +#ifdef SPHINXSYS_USE_ONEDPL_SORTING #include #include +#endif namespace SPH { @@ -95,9 +97,10 @@ void ParticleSorting::sortingParticleData(DeviceInt *begin, DeviceInt size, exec if (!index_sorting_device_variables_) index_sorting_device_variables_ = allocateDeviceData(size); +#ifndef SPHINXSYS_USE_ONEDPL_SORTING device_radix_sorting.sort_by_key(begin, index_sorting_device_variables_, size, execution::executionQueue.getQueue(), 512, 4).wait(); - - /*execution::executionQueue.getQueue().parallel_for(execution::executionQueue.getUniformNdRange(size), +#else + execution::executionQueue.getQueue().parallel_for(execution::executionQueue.getUniformNdRange(size), [=, index_sorting = index_sorting_device_variables_](sycl::nd_item<1> it) { DeviceInt i = it.get_global_id(0); if(i < size) @@ -105,7 +108,8 @@ void ParticleSorting::sortingParticleData(DeviceInt *begin, DeviceInt size, exec }).wait(); oneapi::dpl::sort_by_key(oneapi::dpl::execution::make_device_policy(execution::executionQueue.getQueue()), - begin, begin + size, index_sorting_device_variables_);*/ + begin, begin + size, index_sorting_device_variables_); +#endif move_sortable_particle_device_data_(index_sorting_device_variables_, size);
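[Editor's note] For the macro-guarded oneDPL path enabled by SPHINXSYS_USE_ONEDPL_SORTING, the following standalone sketch shows the intended usage pattern: the permutation array is initialized to the identity, then sort_by_key reorders keys and permutation together so the permutation can be applied to all other sortable particle data. The function name is illustrative and the code assumes keys and permutation are USM allocations accessible from the queue.

    // Minimal sketch of the oneDPL sort-by-key path (assumptions noted above).
    #include <oneapi/dpl/execution>
    #include <oneapi/dpl/algorithm>
    #include <sycl/sycl.hpp>
    #include <cstdint>

    using DeviceInt = std::uint32_t;

    void sort_keys_with_permutation(sycl::queue &queue, DeviceInt *keys,
                                    DeviceInt *permutation, DeviceInt size)
    {
        // Fill the permutation with 0 .. size-1, as the patch does in its parallel_for.
        queue.parallel_for(sycl::range<1>(size), [=](sycl::id<1> i) {
            permutation[i] = static_cast<DeviceInt>(i[0]);
        }).wait();

        // Sort the keys in place and apply the same reordering to the permutation.
        oneapi::dpl::sort_by_key(oneapi::dpl::execution::make_device_policy(queue),
                                 keys, keys + size, permutation);
    }
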