Skip to content

Commit

Permalink
Merge pull request #623 from nR3D/sycl
Browse files Browse the repository at this point in the history
[SYCL] restructured cell linked list and other optimizations
  • Loading branch information
Xiangyu-Hu authored Jul 13, 2024
2 parents 80a5d08 + 8981ddd commit e55c877
Show file tree
Hide file tree
Showing 20 changed files with 308 additions and 191 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ endif()

if(SPHINXSYS_USE_SYCL)
if(NOT SPHINXSYS_SYCL_TARGETS)
set(SPHINXSYS_SYCL_TARGETS nvptx64-nvidia-cuda)
message("-- Using default SYCL device target, select specific target with option SPHINXSYS_SYCL_TARGETS")
set(SPHINXSYS_SYCL_TARGETS spir64)
endif()
message("-- SPHinXsys use SYCL with targets ${SPHINXSYS_SYCL_TARGETS}")
target_compile_options(sphinxsys_core INTERFACE -fsycl -fsycl-targets=${SPHINXSYS_SYCL_TARGETS} -Wno-unknown-cuda-version)
Expand Down
1 change: 1 addition & 0 deletions src/shared/common/base_data_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ using Rotation2d = Eigen::Rotation2D<Real>;
using Rotation3d = Eigen::AngleAxis<Real>;
/** Device data types. */
using DeviceReal = Real;
using DeviceInt = u_int32_t;
using DeviceVec2d = Vec2d;
using DeviceVec3d = Vec3d;
using DeviceArray2i = Array2i;
Expand Down
8 changes: 8 additions & 0 deletions src/shared/common/vector_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,10 @@ inline execution::ExecutionEvent copyDataToDevice(const Real* host, DeviceReal*
return transformAndCopyDataToDevice(host, device, size, [](Real val) { return static_cast<DeviceReal>(val); });
}

inline execution::ExecutionEvent copyDataToDevice(const size_t* host, DeviceInt* device, std::size_t size) {
return transformAndCopyDataToDevice(host, device, size, [](size_t val) { return static_cast<DeviceInt>(val); });
}

template<class T>
inline execution::ExecutionEvent copyDataToDevice(const T& value, T* device, std::size_t size) {
return execution::executionQueue.getQueue().fill(device, value, size);
Expand Down Expand Up @@ -234,5 +238,9 @@ inline execution::ExecutionEvent copyDataFromDevice(Vec3d* host, const DeviceVec
inline execution::ExecutionEvent copyDataFromDevice(Real* host, const DeviceReal* device, std::size_t size) {
return transformAndCopyDataFromDevice(host, device, size, [](DeviceReal val) { return static_cast<Real>(val); });
}

inline execution::ExecutionEvent copyDataFromDevice(size_t* host, const DeviceInt* device, std::size_t size) {
return transformAndCopyDataFromDevice(host, device, size, [](DeviceInt val) { return static_cast<size_t>(val); });
}
} // namespace SPH
#endif // VECTOR_FUNCTIONS_H
53 changes: 51 additions & 2 deletions src/shared/materials/riemann_solver.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,23 @@ struct FluidStateOut
FluidStateOut(Real rho, Vecd vel, Real p) : vel_(vel), rho_(rho), p_(p){};
};

class NoRiemannSolverKernel
{
public:
NoRiemannSolverKernel(DeviceReal rho0c0_i, DeviceReal rho0c0_j, DeviceReal inv_rho0c0_sum)
: rho0c0_i_(rho0c0_i), rho0c0_j_(rho0c0_j), inv_rho0c0_sum_(inv_rho0c0_sum) {}

DeviceReal DissipativePJump(const DeviceReal &) const { return DeviceReal(0.0); }
DeviceReal DissipativeUJump(const DeviceReal &) const { return DeviceReal(0.0); }

DeviceReal AverageP(const DeviceReal &p_i, const DeviceReal &p_j)
{
return (p_i * rho0c0_j_ + p_j * rho0c0_i_) * inv_rho0c0_sum_;
};
protected:
DeviceReal rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_;
};

/**
* @struct NoRiemannSolver
* @brief Central difference scheme without Riemann flux.
Expand All @@ -60,7 +77,8 @@ class NoRiemannSolver
: rho0_i_(fluid_i.ReferenceDensity()), rho0_j_(fluid_j.ReferenceDensity()),
c0_i_(fluid_i.ReferenceSoundSpeed()), c0_j_(fluid_j.ReferenceSoundSpeed()),
rho0c0_i_(rho0_i_ * c0_i_), rho0c0_j_(rho0_j_ * c0_j_),
inv_rho0c0_sum_(1.0 / (rho0c0_i_ + rho0c0_j_)){};
inv_rho0c0_sum_(1.0 / (rho0c0_i_ + rho0c0_j_)),
device_kernel(rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_) {};
Real DissipativePJump(const Real &u_jump) const { return 0.0; }
Real DissipativeUJump(const Real &p_jump) const { return 0.0; }

Expand All @@ -77,6 +95,32 @@ class NoRiemannSolver
Real rho0_i_, rho0_j_;
Real c0_i_, c0_j_;
Real rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_;

public:
execution::DeviceImplementation<NoRiemannSolverKernel> device_kernel;
};

class AcousticRiemannSolverKernel : public NoRiemannSolverKernel
{
public:
template<class... Args>
AcousticRiemannSolverKernel(DeviceReal inv_rho0c0_ave, DeviceReal rho0c0_geo_ave,
DeviceReal inv_c_ave, DeviceReal limiter_coeff, Args&& ...baseArgs)
: NoRiemannSolverKernel(std::forward<Args>(baseArgs)...), inv_rho0c0_ave_(inv_rho0c0_ave),
rho0c0_geo_ave_(rho0c0_geo_ave), inv_c_ave_(inv_c_ave), limiter_coeff_(limiter_coeff) {}

DeviceReal DissipativePJump(const DeviceReal &u_jump) const {
return rho0c0_geo_ave_ * u_jump * sycl::min(limiter_coeff_ * sycl::max(u_jump * inv_c_ave_, DeviceReal(0)), DeviceReal(1));
}

DeviceReal DissipativeUJump(const DeviceReal &p_jump) const {
return p_jump * inv_rho0c0_ave_;
}

protected:
DeviceReal inv_rho0c0_ave_, rho0c0_geo_ave_;
DeviceReal inv_c_ave_;
DeviceReal limiter_coeff_;
};

class AcousticRiemannSolver : public NoRiemannSolver
Expand All @@ -88,7 +132,9 @@ class AcousticRiemannSolver : public NoRiemannSolver
inv_rho0c0_ave_(2.0 * inv_rho0c0_sum_),
rho0c0_geo_ave_(2.0 * rho0c0_i_ * rho0c0_j_ * inv_rho0c0_sum_),
inv_c_ave_(0.5 * (rho0_i_ + rho0_j_) * inv_rho0c0_ave_),
limiter_coeff_(limiter_coeff){};
limiter_coeff_(limiter_coeff),
device_kernel(inv_rho0c0_ave_, rho0c0_geo_ave_, inv_c_ave_, limiter_coeff_,
rho0c0_i_, rho0c0_j_, inv_rho0c0_sum_) {};
Real DissipativePJump(const Real &u_jump) const {
return rho0c0_geo_ave_ * u_jump * SMIN(limiter_coeff_ * SMAX(u_jump * inv_c_ave_, Real(0)), Real(1));
}
Expand All @@ -101,6 +147,9 @@ class AcousticRiemannSolver : public NoRiemannSolver
Real inv_rho0c0_ave_, rho0c0_geo_ave_;
Real inv_c_ave_;
Real limiter_coeff_;

public:
execution::DeviceImplementation<AcousticRiemannSolverKernel> device_kernel;
};

class DissipativeRiemannSolver : public AcousticRiemannSolver
Expand Down
4 changes: 2 additions & 2 deletions src/shared/materials/weakly_compressible_fluid.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ class WeaklyCompressibleFluidKernel
inline DeviceReal getPressure(DeviceReal rho) const { return p0_ * (rho / rho0_ - static_cast<DeviceReal>(1.0)); }
inline DeviceReal getSoundSpeed(DeviceReal p = 0.0, DeviceReal rho = 1.0) const { return c0_; }
inline DeviceReal ReferenceDensity() const { return rho0_; }
inline DeviceReal ReferenceSoundSpeed() { return c0_; };
inline DeviceReal ReferenceSoundSpeed() const { return c0_; };

private:
DeviceReal p0_, c0_, rho0_;
const DeviceReal p0_, c0_, rho0_;
};

/**
Expand Down
95 changes: 61 additions & 34 deletions src/shared/meshes/cell_linked_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ CellLinkedListKernel::CellLinkedListKernel(const DeviceVecd &meshLowerBound, Dev
: total_real_particles_(0), list_data_pos_(nullptr), // will be initialized at first UpdateCellLists execution
mesh_lower_bound_(allocateDeviceData<DeviceVecd>(1)), grid_spacing_(allocateDeviceData<DeviceReal>(1)),
all_grid_points_(allocateDeviceData<DeviceArrayi>(1)), all_cells_(allocateDeviceData<DeviceArrayi>(1)),
index_list_(nullptr), index_head_list_(allocateDeviceData<size_t>(VecdFoldProduct(allCells))),
index_head_list_size_(VecdFoldProduct(allCells))
linear_cell_size_(VecdFoldProduct(allCells)), offset_cell_size_(allocateDeviceData<DeviceInt>(linear_cell_size_+1)),
curr_cell_size_(allocateDeviceData<DeviceInt>(linear_cell_size_))
{
copyDataToDevice(&meshLowerBound, mesh_lower_bound_, 1)
.add(copyDataToDevice(&gridSpacing, grid_spacing_, 1))
Expand All @@ -58,64 +58,91 @@ CellLinkedListKernel::~CellLinkedListKernel()

execution::ExecutionEvent CellLinkedListKernel::clearCellLists()
{
// Only clear head list, since index list does not depend on its previous values
return std::move(copyDataToDevice(static_cast<size_t>(0), index_head_list_, index_head_list_size_));
return std::move(copyDataToDevice(static_cast<DeviceInt>(0), offset_cell_size_, linear_cell_size_+1)
.add(copyDataToDevice(static_cast<DeviceInt>(0), curr_cell_size_, linear_cell_size_)));
}

execution::ExecutionEvent CellLinkedListKernel::UpdateCellLists(const SPH::BaseParticles &base_particles)
{
if(!index_list_) // initialize list data with base_particles at first execution
if(!list_data_pos_) // initialize list data with base_particles at first execution
{
total_real_particles_ = base_particles.total_real_particles_;
list_data_pos_ = base_particles.getDeviceVariableByName<DeviceVecd>("Position");
index_list_ = allocateDeviceData<size_t>(total_real_particles_);
particle_id_list_ = allocateDeviceData<DeviceInt>(total_real_particles_);
}
auto clear_event = clearCellLists();
auto *pos_n = base_particles.getDeviceVariableByName<DeviceVecd>("Position");
size_t total_real_particles = base_particles.total_real_particles_;

DeviceInt total_real_particles = base_particles.total_real_particles_;
auto determine_cells_size_event = executionQueue.getQueue().submit(
[&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_,
all_cells = all_cells_, offset_cell_size = offset_cell_size_, pos_n = list_data_pos_](sycl::handler &cgh)
{
cgh.depends_on(clear_event.getEventList());
cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles),
[=](sycl::nd_item<1> item) {
if(item.get_global_id() < total_real_particles)
{
const DeviceInt index_i = item.get_global_id();
const auto cell_index = CellIndexFromPosition(pos_n[index_i], *mesh_lower_bound,
*grid_spacing, *all_grid_points);
const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells);
sycl::atomic_ref<DeviceInt, sycl::memory_order_relaxed, sycl::memory_scope_device,
sycl::access::address_space::global_space>
atomic_cell_size(offset_cell_size[linear_cell_index]);
++atomic_cell_size;
}
}); });

auto exclusive_scan_cells_size_event = executionQueue.getQueue().submit(
[&, offset_cell_size = offset_cell_size_, num_cells = linear_cell_size_ + 1](sycl::handler &cgh) {
cgh.depends_on(determine_cells_size_event);
cgh.parallel_for(executionQueue.getUniformNdRange(executionQueue.getWorkGroupSize()),
[=](sycl::nd_item<1> item) {
if(item.get_group_linear_id() == 0) {
sycl::joint_exclusive_scan(item.get_group(), offset_cell_size,
offset_cell_size + num_cells,
offset_cell_size, sycl::plus<>{});
}
});
});

return executionQueue.getQueue().submit(
[&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_,
all_cells = all_cells_, index_list = index_list_, index_head_list = index_head_list_](sycl::handler &cgh)
{
cgh.depends_on(clear_event.getEventList());
cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), [=](sycl::nd_item<1> item)
{
const size_t index_i = item.get_global_id();
if(index_i < total_real_particles)
[&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_, all_grid_points = all_grid_points_,
all_cells = all_cells_, particle_id_list = particle_id_list_, offset_cell_size = offset_cell_size_,
curr_cell_size = curr_cell_size_, pos_n = list_data_pos_] (sycl::handler &cgh)
{
cgh.depends_on(exclusive_scan_cells_size_event);
cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles),
[=](sycl::nd_item<1> item)
{
if(item.get_global_id() < total_real_particles)
{
const DeviceInt index_i = item.get_global_id();
const auto cell_index = CellIndexFromPosition(pos_n[index_i], *mesh_lower_bound,
*grid_spacing, *all_grid_points);
const auto linear_cell_index = transferCellIndexTo1D(cell_index, *all_cells);
sycl::atomic_ref<size_t, sycl::memory_order_relaxed, sycl::memory_scope_device,
sycl::atomic_ref<DeviceInt, sycl::memory_order_relaxed, sycl::memory_scope_device,
sycl::access::address_space::global_space>
atomic_head_list(index_head_list[linear_cell_index]);
/*
* Insert index at the head of the list, the index previously at the top is then
* used as the next one pointed by the new index.
* Indices values are increased by 1 to let 0 be the value that indicates list termination.
* If the cell list is empty (i.e. head == 0) then head will point to the new index and the
* new index will point to 0 (i.e. the new index will be the first and last element of the cell list).
* index_head_list[linear_cell_index] = index_i+1 ---> index_list[index_i] = 0
* Since the cell list order is not relevant, memory_order_relaxed will only ensure that each cell of
* index_head_list gets a new index one at a time.
*/
index_list[index_i] = atomic_head_list.exchange(index_i + 1);
} });
});
atomic_current_cell_size(curr_cell_size[linear_cell_index]);

particle_id_list[offset_cell_size[linear_cell_index] + atomic_current_cell_size++] = index_i;
}
});
});
}
//=================================================================================================//
size_t *CellLinkedListKernel::computingSequence(BaseParticles &baseParticles)
DeviceInt *CellLinkedListKernel::computingSequence(BaseParticles &baseParticles)
{
auto *pos = baseParticles.getDeviceVariableByName<DeviceVecd>("Position");
auto *sequence = baseParticles.sequence_device_;
size_t total_real_particles = baseParticles.total_real_particles_;
DeviceInt total_real_particles = baseParticles.total_real_particles_;
executionQueue.getQueue().submit(
[&, mesh_lower_bound = mesh_lower_bound_, grid_spacing = grid_spacing_,
all_grid_points = all_grid_points_](sycl::handler &cgh)
{
cgh.parallel_for(executionQueue.getUniformNdRange(total_real_particles), [=](sycl::nd_item<1> item)
{
size_t i = item.get_global_id();
DeviceInt i = item.get_global_id();
if(i < total_real_particles)
sequence[i] = BaseMesh::transferMeshIndexToMortonOrder(
CellIndexFromPosition(pos[i], *mesh_lower_bound,
Expand Down
Loading

0 comments on commit e55c877

Please sign in to comment.