Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] restructured cell linked list and other optimizations #623

Merged
merged 9 commits into from
Jul 13, 2024
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ endif()

if(SPHINXSYS_USE_SYCL)
if(NOT SPHINXSYS_SYCL_TARGETS)
set(SPHINXSYS_SYCL_TARGETS nvptx64-nvidia-cuda)
message("-- Using default SYCL device target, select specific target with option SPHINXSYS_SYCL_TARGETS")
set(SPHINXSYS_SYCL_TARGETS spir64)
endif()
message("-- SPHinXsys use SYCL with targets ${SPHINXSYS_SYCL_TARGETS}")
target_compile_options(sphinxsys_core INTERFACE -fsycl -fsycl-targets=${SPHINXSYS_SYCL_TARGETS} -Wno-unknown-cuda-version)
Expand Down
1 change: 1 addition & 0 deletions src/shared/common/base_data_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ using Rotation2d = Eigen::Rotation2D<Real>;
using Rotation3d = Eigen::AngleAxis<Real>;
/** Device data types. */
using DeviceReal = Real;
using DeviceInt = u_int32_t;
using DeviceVec2d = Vec2d;
using DeviceVec3d = Vec3d;
using DeviceArray2i = Array2i;
Expand Down
8 changes: 8 additions & 0 deletions src/shared/common/vector_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,10 @@ inline execution::ExecutionEvent copyDataToDevice(const Real* host, DeviceReal*
return transformAndCopyDataToDevice(host, device, size, [](Real val) { return static_cast<DeviceReal>(val); });
}

inline execution::ExecutionEvent copyDataToDevice(const size_t* host, DeviceInt* device, std::size_t size) {
return transformAndCopyDataToDevice(host, device, size, [](size_t val) { return static_cast<DeviceInt>(val); });
}

template<class T>
inline execution::ExecutionEvent copyDataToDevice(const T& value, T* device, std::size_t size) {
return execution::executionQueue.getQueue().fill(device, value, size);
Expand Down Expand Up @@ -234,5 +238,9 @@ inline execution::ExecutionEvent copyDataFromDevice(Vec3d* host, const DeviceVec
inline execution::ExecutionEvent copyDataFromDevice(Real* host, const DeviceReal* device, std::size_t size) {
return transformAndCopyDataFromDevice(host, device, size, [](DeviceReal val) { return static_cast<Real>(val); });
}

inline execution::ExecutionEvent copyDataFromDevice(size_t* host, const DeviceInt* device, std::size_t size) {
return transformAndCopyDataFromDevice(host, device, size, [](DeviceInt val) { return static_cast<size_t>(val); });
}
} // namespace SPH
#endif // VECTOR_FUNCTIONS_H
53 changes: 51 additions & 2 deletions src/shared/materials/riemann_solver.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,23 @@ struct FluidStateOut
FluidStateOut(Real rho, Vecd vel, Real p) : vel_(vel), rho_(rho), p_(p){};
};

class NoRiemannSolverKernel
{
public:
NoRiemannSolverKernel(DeviceReal rho0c0_i, DeviceReal rho0c0_j, DeviceReal inv_rho0c0_sum)
: rho0c0_i_(rho0c0_i), rho0c0_j_(rho0c0_j), inv_rho0c0_sum_(inv_rho0c0_sum) {}

DeviceReal DissipativePJump(const DeviceReal &) const { return DeviceReal(0.0); }
DeviceReal DissipativeUJump(const DeviceReal &) const { return DeviceReal(0.0); }

DeviceReal AverageP(const DeviceReal &p_i, const DeviceReal &p_j)
{
return (p_i * rho0c0_j_ + p_j * rho0c0_i_) * inv_rho0c0_sum_;
};
protected:
DeviceReal rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_;
};

/**
* @struct NoRiemannSolver
* @brief Central difference scheme without Riemann flux.
Expand All @@ -60,7 +77,8 @@ class NoRiemannSolver
: rho0_i_(fluid_i.ReferenceDensity()), rho0_j_(fluid_j.ReferenceDensity()),
c0_i_(fluid_i.ReferenceSoundSpeed()), c0_j_(fluid_j.ReferenceSoundSpeed()),
rho0c0_i_(rho0_i_ * c0_i_), rho0c0_j_(rho0_j_ * c0_j_),
inv_rho0c0_sum_(1.0 / (rho0c0_i_ + rho0c0_j_)){};
inv_rho0c0_sum_(1.0 / (rho0c0_i_ + rho0c0_j_)),
device_kernel(rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_) {};
Real DissipativePJump(const Real &u_jump) const { return 0.0; }
Real DissipativeUJump(const Real &p_jump) const { return 0.0; }

Expand All @@ -77,6 +95,32 @@ class NoRiemannSolver
Real rho0_i_, rho0_j_;
Real c0_i_, c0_j_;
Real rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_;

public:
execution::DeviceImplementation<NoRiemannSolverKernel> device_kernel;
};

class AcousticRiemannSolverKernel : public NoRiemannSolverKernel
{
public:
template<class... Args>
AcousticRiemannSolverKernel(DeviceReal inv_rho0c0_ave, DeviceReal rho0c0_geo_ave,
DeviceReal inv_c_ave, DeviceReal limiter_coeff, Args&& ...baseArgs)
: NoRiemannSolverKernel(std::forward<Args>(baseArgs)...), inv_rho0c0_ave_(inv_rho0c0_ave),
rho0c0_geo_ave_(rho0c0_geo_ave), inv_c_ave_(inv_c_ave), limiter_coeff_(limiter_coeff) {}

DeviceReal DissipativePJump(const DeviceReal &u_jump) const {
return rho0c0_geo_ave_ * u_jump * sycl::min(limiter_coeff_ * sycl::max(u_jump * inv_c_ave_, DeviceReal(0)), DeviceReal(1));
}

DeviceReal DissipativeUJump(const DeviceReal &p_jump) const {
return p_jump * inv_rho0c0_ave_;
}

protected:
DeviceReal inv_rho0c0_ave_, rho0c0_geo_ave_;
DeviceReal inv_c_ave_;
DeviceReal limiter_coeff_;
};

class AcousticRiemannSolver : public NoRiemannSolver
Expand All @@ -88,7 +132,9 @@ class AcousticRiemannSolver : public NoRiemannSolver
inv_rho0c0_ave_(2.0 * inv_rho0c0_sum_),
rho0c0_geo_ave_(2.0 * rho0c0_i_ * rho0c0_j_ * inv_rho0c0_sum_),
inv_c_ave_(0.5 * (rho0_i_ + rho0_j_) * inv_rho0c0_ave_),
limiter_coeff_(limiter_coeff){};
limiter_coeff_(limiter_coeff),
device_kernel(inv_rho0c0_ave_, rho0c0_geo_ave_, inv_c_ave_, limiter_coeff_,
rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_) {};
Real DissipativePJump(const Real &u_jump) const {
return rho0c0_geo_ave_ * u_jump * SMIN(limiter_coeff_ * SMAX(u_jump * inv_c_ave_, Real(0)), Real(1));
}
Expand All @@ -101,6 +147,9 @@ class AcousticRiemannSolver : public NoRiemannSolver
Real inv_rho0c0_ave_, rho0c0_geo_ave_;
Real inv_c_ave_;
Real limiter_coeff_;

public:
execution::DeviceImplementation<AcousticRiemannSolverKernel> device_kernel;
};

class DissipativeRiemannSolver : public AcousticRiemannSolver
Expand Down
4 changes: 2 additions & 2 deletions src/shared/materials/weakly_compressible_fluid.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ class WeaklyCompressibleFluidKernel
inline DeviceReal getPressure(DeviceReal rho) const { return p0_ * (rho / rho0_ - static_cast<DeviceReal>(1.0)); }
inline DeviceReal getSoundSpeed(DeviceReal p = 0.0, DeviceReal rho = 1.0) const { return c0_; }
inline DeviceReal ReferenceDensity() const { return rho0_; }
inline DeviceReal ReferenceSoundSpeed() { return c0_; };
inline DeviceReal ReferenceSoundSpeed() const { return c0_; };

private:
DeviceReal p0_, c0_, rho0_;
const DeviceReal p0_, c0_, rho0_;
};

/**
Expand Down
95 changes: 61 additions & 34 deletions src/shared/meshes/cell_linked_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ CellLinkedListKernel::CellLinkedListKernel(const DeviceVecd &meshLowerBound, Dev
: total_real_particles_(0), list_data_pos_(nullptr), // will be initialized at first UpdateCellLists execution
mesh_lower_bound_(allocateDeviceData<DeviceVecd>(1)), grid_spacing_(allocateDeviceData<DeviceReal>(1)),
all_grid_points_(allocateDeviceData<DeviceArrayi>(1)), all_cells_(allocateDeviceData<DeviceArrayi>(1)),
index_list_(nullptr), index_head_list_(allocateDeviceData<size_t>(VecdFoldProduct(allCells))),
index_head_list_size_(VecdFoldProduct(allCells))
linear_cell_size_(VecdFoldProduct(allCells)), offset_cell_size_(allocateDeviceData<DeviceInt>(linear_cell_size_+1)),
curr_cell_size_(allocateDeviceData<DeviceInt>(linear_cell_size_))
{
copyDataToDevice(&meshLowerBound, mesh_lower_bound_, 1)
.add(copyDataToDevice(&gridSpacing, grid_spacing_, 1))
Expand All @@ -58,64 +58,91 @@ CellLinkedListKernel::~CellLinkedListKernel()

execution::ExecutionEvent CellLinkedListKernel::clearCellLists()
{
// Only clear head list, since index list does not depend on its previous values
return std::move(copyDataToDevice(static_cast<size_t>(0), index_head_list_, index_head_list_size_));
return std::move(copyDataToDevice(static_cast<DeviceInt>(0), offset_cell_size_, linear_cell_size_+1)
.add(copyDataToDevice(static_cast<DeviceInt>(0), curr_cell_size_, linear_cell_size_)));
}

execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseParticles &base_particles)
{
if(!index_list_) // initialize list data with base_particles at first execution
if(!list_data_pos_) // initialize list data with base_particles at first execution
{
total_real_particles_ = base_particles.total_real_particles_;
list_data_pos_ = base_particles.getDeviceVariableByName<DeviceVecd>("Position");
index_list_ = allocateDeviceData<size_t>(total_real_particles_);
particle_id_list_ = allocateDeviceData<DeviceInt>(total_real_particles_);
}
auto clear_event = clearCellLists();
auto *pos_n = base_particles.getDeviceVariableByName<DeviceVecd>("Position");
size_t total_real_particles = base_particles.total_real_particles_;

DeviceInt total_real_particles = base_particles.total_real_particles_;
auto determine_cells_size_event = executionQueue.getQueue().submit(
[&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_,
all_cells = all_cells_, offset_cell_size = offset_cell_size_, pos_n = list_data_pos_](sycl::handler &cgh)
{
cgh.depends_on(clear_event.getEventList());
cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles),
[=](sycl::nd_item<1> item) {
if(item.get_global_id() < total_real_particles)
{
const DeviceInt index_i = item.get_global_id();
const auto cell_index = CellIndexFromPosition(pos_n[index_i], *mesh_lower_bound,
*grid_spacing, *all_grid_points);
const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells);
sycl::atomic_ref<DeviceInt, sycl::memory_order_relaxed, sycl::memory_scope_device,
sycl::access::address_space::global_space>
atomic_cell_size(offset_cell_size[linear_cell_index]);
++atomic_cell_size;
}
}); });

auto exclusive_scan_cells_size_event = executionQueue.getQueue().submit(
[&, offset_cell_size = offset_cell_size_, num_cells = linear_cell_size_ + 1](sycl::handler &cgh) {
cgh.depends_on(determine_cells_size_event);
cgh.parallel_for(executionQueue.getUniformNdRange(executionQueue.getWorkGroupSize()),
[=](sycl::nd_item<1> item) {
if(item.get_group_linear_id() == 0) {
sycl::joint_exclusive_scan(item.get_group(), offset_cell_size,
offset_cell_size + num_cells,
offset_cell_size, sycl::plus<>{});
}
});
});

return executionQueue.getQueue().submit(
[&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_,
all_cells = all_cells_, index_list = index_list_, index_head_list = index_head_list_](sycl::handler &cgh)
{
cgh.depends_on(clear_event.getEventList());
cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), [=](sycl::nd_item<1> item)
{
const size_t index_i = item.get_global_id();
if(index_i < total_real_particles)
[&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_,
all_cells = all_cells_, particle_id_list = particle_id_list_, offset_cell_size = offset_cell_size_,
curr_cell_size = curr_cell_size_, pos_n = list_data_pos_] (sycl::handler &cgh)
{
cgh.depends_on(exclusive_scan_cells_size_event);
cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles),
[=](sycl::nd_item<1> item)
{
if(item.get_global_id() < total_real_particles)
{
const DeviceInt index_i = item.get_global_id();
const auto cell_index = CellIndexFromPosition(pos_n[index_i], *mesh_lower_bound,
*grid_spacing, *all_grid_points);
const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells);
sycl::atomic_ref<size_t, sycl::memory_order_relaxed, sycl::memory_scope_device,
sycl::atomic_ref<DeviceInt, sycl::memory_order_relaxed, sycl::memory_scope_device,
sycl::access::address_space::global_space>
atomic_head_list(index_head_list[linear_cell_index]);
/*
* Insert index at the head of the list, the index previously at the top is then
* used as the next one pointed by the new index.
* Indices values are increased by 1 to let 0 be the value that indicates list termination.
* If the cell list is empty (i.e. head == 0) then head will point to the new index and the
* new index will point to 0 (i.e. the new index will be the first and last element of the cell list).
* index_head_list[linear_cell_index] = index_i+1 ---> index_list[index_i] = 0
* Since the cell list order is not relevant, memory_order_relaxed will only ensure that each cell of
* index_head_list gets a new index one at a time.
*/
index_list[index_i] = atomic_head_list.exchange(index_i + 1);
} });
});
atomic_current_cell_size(curr_cell_size[linear_cell_index]);

particle_id_list[offset_cell_size[linear_cell_index] + atomic_current_cell_size++] = index_i;
}
});
});
}
//=================================================================================================//
size_t *CellLinkedListKernel::computingSequence(BaseParticles &baseParticles)
DeviceInt *CellLinkedListKernel::computingSequence(BaseParticles &baseParticles)
{
auto *pos = baseParticles.getDeviceVariableByName<DeviceVecd>("Position");
auto *sequence = baseParticles.sequence_device_;
size_t total_real_particles = baseParticles.total_real_particles_;
DeviceInt total_real_particles = baseParticles.total_real_particles_;
executionQueue.getQueue().submit(
[&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_,
all_grid_points = all_grid_points_](sycl::handler &cgh)
{
cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), [=](sycl::nd_item<1> item)
{
size_t i = item.get_global_id();
DeviceInt i = item.get_global_id();
if(i < total_real_particles)
sequence[i] = BaseMesh::transferMeshIndexToMortonOrder(
CellIndexFromPosition(pos[i], *mesh_lower_bound,
Expand Down
Loading