From 9588ddabf6328848bbc15aadfd86e07f2a0fd8a9 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 5 Dec 2023 13:30:26 -0600 Subject: [PATCH 01/30] Added the GPU versions of lj/cut/coul/cut/soft and lj/cut/coul/long/soft --- lib/gpu/geryon/ocl_mat.h | 2 +- lib/gpu/lal_base_dpd.cpp | 6 +- lib/gpu/lal_base_dpd.h | 5 +- lib/gpu/lal_device.cpp | 38 ++- lib/gpu/lal_device.h | 1 + lib/gpu/lal_lj_coul_long.h | 2 +- lib/gpu/lal_lj_coul_long_soft.cpp | 174 ++++++++++++ lib/gpu/lal_lj_coul_long_soft.cu | 288 ++++++++++++++++++++ lib/gpu/lal_lj_coul_long_soft.h | 88 ++++++ lib/gpu/lal_lj_coul_long_soft_ext.cpp | 151 +++++++++++ lib/gpu/lal_lj_coul_soft.cpp | 157 +++++++++++ lib/gpu/lal_lj_coul_soft.cu | 275 +++++++++++++++++++ lib/gpu/lal_lj_coul_soft.h | 85 ++++++ lib/gpu/lal_lj_coul_soft_ext.cpp | 128 +++++++++ src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp | 249 +++++++++++++++++ src/GPU/pair_lj_cut_coul_cut_soft_gpu.h | 45 ++++ src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp | 297 +++++++++++++++++++++ src/GPU/pair_lj_cut_coul_long_soft_gpu.h | 46 ++++ 18 files changed, 2023 insertions(+), 14 deletions(-) create mode 100644 lib/gpu/lal_lj_coul_long_soft.cpp create mode 100644 lib/gpu/lal_lj_coul_long_soft.cu create mode 100644 lib/gpu/lal_lj_coul_long_soft.h create mode 100644 lib/gpu/lal_lj_coul_long_soft_ext.cpp create mode 100644 lib/gpu/lal_lj_coul_soft.cpp create mode 100644 lib/gpu/lal_lj_coul_soft.cu create mode 100644 lib/gpu/lal_lj_coul_soft.h create mode 100644 lib/gpu/lal_lj_coul_soft_ext.cpp create mode 100644 src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp create mode 100644 src/GPU/pair_lj_cut_coul_cut_soft_gpu.h create mode 100644 src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp create mode 100644 src/GPU/pair_lj_cut_coul_long_soft_gpu.h diff --git a/lib/gpu/geryon/ocl_mat.h b/lib/gpu/geryon/ocl_mat.h index 3135594dc3d..66ca6ab5275 100644 --- a/lib/gpu/geryon/ocl_mat.h +++ b/lib/gpu/geryon/ocl_mat.h @@ -54,6 +54,6 @@ namespace ucl_opencl { #include "ucl_print.h" #undef UCL_PRINT_ALLOW -} // namespace ucl_cudart +} // namespace ucl_opencl #endif diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index e103699d40b..0ddd24d21ed 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -56,7 +56,8 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name, const int onetype) { + const char *k_name, const int onetype, + const int extra_fields) { screen=_screen; int gpu_nbor=0; @@ -75,7 +76,8 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, bool charge = false; bool rot = false; bool vel = true; - int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel); + _extra_fields = extra_fields; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h index 9eb56993afa..6e9451d72a0 100644 --- a/lib/gpu/lal_base_dpd.h +++ b/lib/gpu/lal_base_dpd.h @@ -53,7 +53,7 @@ class BaseDPD { const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, const char *k_name, - const int onetype=0); + const int onetype=0, const int extra_fields=0); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(); @@ -167,7 +167,6 @@ class BaseDPD { /// Atom Data Atom *atom; - // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; @@ -197,6 +196,8 @@ class BaseDPD { numtyp _dtinvsqrt; int _seed, _timestep; + int _extra_fields; + protected: bool _compiled; int _block_size, _threads_per_atom, _onetype; diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 70ba373a653..e9ef2294b2d 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -364,6 +364,12 @@ int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu, } else _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,_simd_size); + #ifndef LAL_USE_OLD_NEIGHBOR + _use_old_nbor_build = 0; + #else + _use_old_nbor_build = 1; + #endif + return flag; } @@ -510,9 +516,13 @@ int DeviceT::init(Answer &ans, const bool charge, gpu_nbor=1; else if (_gpu_mode==Device::GPU_HYB_NEIGH) gpu_nbor=2; + + // NOTE: enforce the hybrid mode (binning on the CPU) + // when not using sorting on the device #if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT) if (gpu_nbor==1) gpu_nbor=2; #endif + // or when the device supports subgroups #ifndef LAL_USE_OLD_NEIGHBOR if (gpu_nbor==1) gpu_nbor=2; #endif @@ -886,19 +896,31 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, } if (times[5] > 0.0) fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size); - fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom); - fprintf(screen,"Vector width: %d.\n", simd_size()); - fprintf(screen,"Prefetch mode: "); - if (_nbor_prefetch==2) fprintf(screen,"Intrinsics.\n"); - else if (_nbor_prefetch==1) fprintf(screen,"API.\n"); - else fprintf(screen,"None.\n"); - fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); if (nbor.gpu_nbor()==2) fprintf(screen,"CPU Neighbor: %.4f s.\n",times[8]/_replica_size); fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size); + fprintf(screen,"Average split: %.4f.\n",avg_split); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + fprintf(screen,"Prefetch mode: "); + if (_nbor_prefetch==2) fprintf(screen,"Intrinsics.\n"); + else if (_nbor_prefetch==1) fprintf(screen,"API.\n"); + else fprintf(screen,"None.\n"); + fprintf(screen,"Vector width: %d.\n", simd_size()); + fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom); + fprintf(screen,"Pair block: %d.\n",_block_pair); + fprintf(screen,"Neigh block: %d.\n",_block_nbor_build); + if (nbor.gpu_nbor()==2) { + fprintf(screen,"Neigh mode: Hybrid (binning on host)"); + if (_use_old_nbor_build == 1) fprintf(screen," - legacy\n"); + else fprintf(screen," with subgroup support\n"); + } else if (nbor.gpu_nbor()==1) { + fprintf(screen,"Neigh mode: Device"); + if (_use_old_nbor_build == 1) fprintf(screen," - legacy\n"); + else fprintf(screen," - with subgroup support\n"); + } else if (nbor.gpu_nbor()==0) + fprintf(screen,"Neigh mode: Host\n"); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index ba693e551a5..d6b52484f1a 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -347,6 +347,7 @@ class Device { int _pppm_block, _block_nbor_build, _block_cell_2d, _block_cell_id; int _max_shared_types, _max_bio_shared_types, _pppm_max_spline; int _nbor_prefetch; + int _use_old_nbor_build; UCL_Program *dev_program; UCL_Kernel k_zero, k_info; diff --git a/lib/gpu/lal_lj_coul_long.h b/lib/gpu/lal_lj_coul_long.h index bc4fce40a54..ace5a263393 100644 --- a/lib/gpu/lal_lj_coul_long.h +++ b/lib/gpu/lal_lj_coul_long.h @@ -78,7 +78,7 @@ class LJCoulLong : public BaseCharge { numtyp _cut_coulsq, _qqrd2e, _g_ewald; - private: +protected: bool _allocated; int loop(const int eflag, const int vflag); }; diff --git a/lib/gpu/lal_lj_coul_long_soft.cpp b/lib/gpu/lal_lj_coul_long_soft.cpp new file mode 100644 index 00000000000..2b90333bae0 --- /dev/null +++ b/lib/gpu/lal_lj_coul_long_soft.cpp @@ -0,0 +1,174 @@ +/*************************************************************************** + lj_coul_long_soft.cpp + ------------------- + Trung Nguyen (U Chicago) + + Class for acceleration of the lj/cut/coul/long/soft pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : ndtrung@uchicago.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "lj_coul_long_soft_cl.h" +#elif defined(USE_CUDART) +const char *lj_coul_long_soft=0; +#else +#include "lj_coul_long_soft_cubin.h" +#endif + +#include "lal_lj_coul_long_soft.h" +#include +namespace LAMMPS_AL { +#define LJCoulLongSoftT LJCoulLongSoft + +extern Device device; + +template +LJCoulLongSoftT::LJCoulLongSoft() : BaseCharge(), + _allocated(false) { +} + +template +LJCoulLongSoftT::~LJCoulLongSoft() { + clear(); +} + +template +int LJCoulLongSoftT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int LJCoulLongSoftT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double **host_epsilon, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_coul_long_soft,"k_lj_coul_long_soft"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, host_cut_ljsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset, host_epsilon); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void LJCoulLongSoftT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double **host_epsilon, double **host_cut_ljsq) { + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; i<_lj_types*_lj_types; i++) + host_write[i]=0.0; + + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, host_cut_ljsq); + this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset, host_epsilon); +} + +template +void LJCoulLongSoftT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJCoulLongSoftT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJCoulLongSoft); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int LJCoulLongSoftT::loop(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->q, &_cut_coulsq, + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); + } + this->time_pair.stop(); + return GX; +} + +template class LJCoulLongSoft; +} diff --git a/lib/gpu/lal_lj_coul_long_soft.cu b/lib/gpu/lal_lj_coul_long_soft.cu new file mode 100644 index 00000000000..242b52fb791 --- /dev/null +++ b/lib/gpu/lal_lj_coul_long_soft.cu @@ -0,0 +1,288 @@ +// ************************************************************************** +// lj_coul_long_soft.cu +// ------------------- +// Trung Nguyen (U Chicago) +// +// Device code for acceleration of the lj/cut/coul/long/soft pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : ndtrung@uchicago.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_lj_coul_long_soft(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii +class LJCoulLongSoft : public BaseCharge { + public: + LJCoulLongSoft(); + ~LJCoulLongSoft(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double **host_epsilon, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double **host_epsilon, double **host_cut_ljsq); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset, lj3.w = epsilon + UCL_D_Vec lj3; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e, _g_ewald; + +protected: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_coul_long_soft_ext.cpp b/lib/gpu/lal_lj_coul_long_soft_ext.cpp new file mode 100644 index 00000000000..efccbd6ab86 --- /dev/null +++ b/lib/gpu/lal_lj_coul_long_soft_ext.cpp @@ -0,0 +1,151 @@ +/*************************************************************************** + lj_coul_long_soft_ext.cpp + ------------------- + Trung Nguyen (U Chicago) + + Functions for LAMMPS access to lj/cut/coul/long/soft acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : ndtrung@uchicago.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_lj_coul_long_soft.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJCoulLongSoft LJCLSMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljcls_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **epsilon, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + LJCLSMF.clear(); + gpu_mode=LJCLSMF.device->gpu_mode(); + double gpu_split=LJCLSMF.device->particle_split(); + int first_gpu=LJCLSMF.device->first_device(); + int last_gpu=LJCLSMF.device->last_device(); + int world_me=LJCLSMF.device->world_me(); + int gpu_rank=LJCLSMF.device->gpu_rank(); + int procs_per_gpu=LJCLSMF.device->procs_per_gpu(); + + LJCLSMF.device->init_message(screen,"lj/cut/coul/long/soft",first_gpu,last_gpu); + + bool message=false; + if (LJCLSMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJCLSMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, epsilon, special_lj, inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); + + LJCLSMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJCLSMF.estimate_gpu_overhead(); + return init_ok; +} + +// --------------------------------------------------------------------------- +// Copy updated coeffs from host to device +// --------------------------------------------------------------------------- +void ljcls_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **epsilon, double **host_cut_ljsq) { + int world_me=LJCLSMF.device->world_me(); + int gpu_rank=LJCLSMF.device->gpu_rank(); + int procs_per_gpu=LJCLSMF.device->procs_per_gpu(); + + if (world_me==0) + LJCLSMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, epsilon, host_cut_ljsq); + LJCLSMF.device->world_barrier(); + + for (int i=0; igpu_barrier(); + } +} + +void ljcls_gpu_clear() { + LJCLSMF.clear(); +} + +int** ljcls_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCLSMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void ljcls_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCLSMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double ljcls_gpu_bytes() { + return LJCLSMF.host_memory_usage(); +} + diff --git a/lib/gpu/lal_lj_coul_soft.cpp b/lib/gpu/lal_lj_coul_soft.cpp new file mode 100644 index 00000000000..96bcf41aa36 --- /dev/null +++ b/lib/gpu/lal_lj_coul_soft.cpp @@ -0,0 +1,157 @@ +/*************************************************************************** + lj_coul_soft.cpp + ------------------- + Trung Nguyen (U Chicago) + + Class for acceleration of the lj/cut/coul/cut/soft pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : ndtrung@uchicago.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "lj_coul_soft_cl.h" +#elif defined(USE_CUDART) +const char *lj_coul_soft=0; +#else +#include "lj_coul_soft_cubin.h" +#endif + +#include "lal_lj_coul_soft.h" +#include +namespace LAMMPS_AL { +#define LJCoulSoftT LJCoulSoft + +extern Device device; + +template +LJCoulSoftT::LJCoulSoft() : BaseCharge(), + _allocated(false) { +} + +template +LJCoulSoftT::~LJCoulSoft() { + clear(); +} + +template +int LJCoulSoftT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int LJCoulSoftT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double **host_epsilon, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_coul_soft,"lj_coul_soft"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cut_ljsq, host_cut_coulsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset, host_epsilon); + + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _qqrd2e=qqrd2e; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes(); + return 0; +} + +template +void LJCoulSoftT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + cutsq.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJCoulSoftT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJCoulSoft); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int LJCoulSoftT::loop(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &cutsq, &_qqrd2e, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, + &cutsq, &_qqrd2e, &this->_threads_per_atom); + } + this->time_pair.stop(); + return GX; +} + +template class LJCoulSoft; +} diff --git a/lib/gpu/lal_lj_coul_soft.cu b/lib/gpu/lal_lj_coul_soft.cu new file mode 100644 index 00000000000..beea2ded436 --- /dev/null +++ b/lib/gpu/lal_lj_coul_soft.cu @@ -0,0 +1,275 @@ +// ************************************************************************** +// lj_coul_soft.cu +// ------------------- +// Trung Nguyen (U Chicago) +// +// Device code for acceleration of the lj/coul/cut/soft pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : ndtrung@uchicago.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_lj_coul_soft(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, + const numtyp qqrd2e, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii +class LJCoulSoft : public BaseCharge { + public: + LJCoulSoft(); + ~LJCoulSoft(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double **host_epsilon, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset, lj3.w = epsilon + UCL_D_Vec lj3; + /// cutsq + UCL_D_Vec cutsq; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _qqrd2e; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_coul_soft_ext.cpp b/lib/gpu/lal_lj_coul_soft_ext.cpp new file mode 100644 index 00000000000..8c10db14f69 --- /dev/null +++ b/lib/gpu/lal_lj_coul_soft_ext.cpp @@ -0,0 +1,128 @@ +/*************************************************************************** + lj_coul_soft_ext.cpp + ------------------- + Trung Nguyen (U Chicago) + + Functions for LAMMPS access to lj/cut/coul/cut/soft acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : ndtrung@uchicago.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_lj_coul_soft.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJCoulSoft LJCSMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljcs_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **epsilon, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + LJCSMF.clear(); + gpu_mode=LJCSMF.device->gpu_mode(); + double gpu_split=LJCSMF.device->particle_split(); + int first_gpu=LJCSMF.device->first_device(); + int last_gpu=LJCSMF.device->last_device(); + int world_me=LJCSMF.device->world_me(); + int gpu_rank=LJCSMF.device->gpu_rank(); + int procs_per_gpu=LJCSMF.device->procs_per_gpu(); + + LJCSMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu); + + bool message=false; + if (LJCSMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJCSMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, epsilon, special_lj, inum, nall, max_nbors, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); + + LJCSMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJCSMF.estimate_gpu_overhead(); + return init_ok; +} + +void ljcs_gpu_clear() { + LJCSMF.clear(); +} + +int** ljcs_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCSMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void ljcs_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCSMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double ljcs_gpu_bytes() { + return LJCSMF.host_memory_usage(); +} + + diff --git a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp new file mode 100644 index 00000000000..7b0d115d96f --- /dev/null +++ b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp @@ -0,0 +1,249 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_lj_cut_coul_cut_soft_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int ljcs_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, double **epsilon, double *special_lj, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); +void ljcs_gpu_clear(); +int **ljcs_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, double *boxlo, + double *prd); +void ljcs_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, double *boxlo, double *prd); +double ljcs_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairLJCutCoulCutSoftGPU::PairLJCutCoulCutSoftGPU(LAMMPS *lmp) : + PairLJCutCoulCutSoft(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairLJCutCoulCutSoftGPU::~PairLJCutCoulCutSoftGPU() +{ + ljcs_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulCutSoftGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = ljcs_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, + atom->tag, atom->nspecial, atom->special, eflag, vflag, + eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljcs_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + if (host_start < inum) { + cpu_time = platform::walltime(); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); + cpu_time = platform::walltime() - cpu_time; + } +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairLJCutCoulCutSoftGPU::init_style() +{ + if (!atom->q_flag) error->all(FLERR, "Pair style lj/cut/coul/cut/soft/gpu requires atom attribute q"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i, j); + cut *= cut; + if (cut > maxcut) maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + ljcs_gpu_init(atom->ntypes + 1, cutsq, lj1, lj2, lj3, lj4, offset, epsilon, force->special_lj, + atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, + screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); + GPU_EXTRA::check_flag(success, error, world); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairLJCutCoulCutSoftGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + ljcs_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulCutSoftGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist, + int *numneigh, int **firstneigh) +{ + int i, j, ii, jj, jnum, itype, jtype, itable; + double qtmp, xtmp, ytmp, ztmp, delx, dely, delz, evdwl, ecoul, fpair; + double r, r2inv, r6inv, forcecoul, forcelj, factor_coul, factor_lj; + double denc, denlj, r4sig6; + int *jlist; + double rsq; + + evdwl = ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + double qqrd2e = force->qqrd2e; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + factor_coul = special_coul[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx * delx + dely * dely + delz * delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + + if (rsq < cut_coulsq[itype][jtype]) { + denc = sqrt(lj4[itype][jtype] + rsq); + forcecoul = qqrd2e * lj1[itype][jtype] * qtmp*q[j] / (denc*denc*denc); + } else forcecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + r4sig6 = rsq*rsq / lj2[itype][jtype]; + denlj = lj3[itype][jtype] + rsq*r4sig6; + forcelj = lj1[itype][jtype] * epsilon[itype][jtype] * + (48.0*r4sig6/(denlj*denlj*denlj) - 24.0*r4sig6/(denlj*denlj)); + } else forcelj = 0.0; + + fpair = factor_coul*forcecoul + factor_lj*forcelj; + + f[i][0] += delx * fpair; + f[i][1] += dely * fpair; + f[i][2] += delz * fpair; + + + if (eflag) { + if (rsq < cut_coulsq[itype][jtype]) + ecoul = factor_coul * qqrd2e * lj1[itype][jtype] * qtmp*q[j] / denc; + else + ecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + evdwl = lj1[itype][jtype] * 4.0 * epsilon[itype][jtype] * + (1.0/(denlj*denlj) - 1.0/denlj) - offset[itype][jtype]; + evdwl *= factor_lj; + } else + evdwl = 0.0; + } + + if (evflag) ev_tally_full(i, evdwl, ecoul, fpair, delx, dely, delz); + } + } + } +} diff --git a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h new file mode 100644 index 00000000000..fcd5439f7a3 --- /dev/null +++ b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h @@ -0,0 +1,45 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(lj/cut/coul/soft/soft/gpu,PairLJCutCoulCutSoftGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_LJ_CUT_COUL_CUT_SOFT_GPU_H +#define LMP_PAIR_LJ_CUT_COUL_CUT_SOFT_GPU_H + +#include "pair_lj_cut_coul_cut_soft.h" + +namespace LAMMPS_NS { + +class PairLJCutCoulCutSoftGPU : public PairLJCutCoulCutSoft { + public: + PairLJCutCoulCutSoftGPU(LAMMPS *lmp); + ~PairLJCutCoulCutSoftGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp b/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp new file mode 100644 index 00000000000..45d26f5155c --- /dev/null +++ b/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp @@ -0,0 +1,297 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_lj_cut_coul_long_soft_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "kspace.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" + +#include + +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int ljcls_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, double **epsilon, double *special_lj, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, + double host_cut_coulsq, double *host_special_coul, const double qqrd2e, + const double g_ewald); +void ljcls_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, double **epsilon, + double **host_lj_cutsq); +void ljcls_gpu_clear(); +int **ljcls_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, double *boxlo, + double *prd); +void ljcls_gpu_compute(const int ago, const int inum, const int nall, double **host_x, + int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd); +double ljcls_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairLJCutCoulLongSoftGPU::PairLJCutCoulLongSoftGPU(LAMMPS *lmp) : + PairLJCutCoulLongSoft(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairLJCutCoulLongSoftGPU::~PairLJCutCoulLongSoftGPU() +{ + ljcls_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulLongSoftGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = ljcls_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, + atom->tag, atom->nspecial, atom->special, eflag, vflag, + eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljcls_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + if (host_start < inum) { + cpu_time = platform::walltime(); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); + cpu_time = platform::walltime() - cpu_time; + } +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairLJCutCoulLongSoftGPU::init_style() +{ + cut_respa = nullptr; + + if (!atom->q_flag) error->all(FLERR, "Pair style lj/cut/coul/long/soft/gpu requires atom attribute q"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i, j); + cut *= cut; + if (cut > maxcut) maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + cut_coulsq = cut_coul * cut_coul; + + // insure use of KSpace long-range solver, set g_ewald + + if (force->kspace == nullptr) error->all(FLERR, "Pair style requires a KSpace style"); + g_ewald = force->kspace->g_ewald; + + // setup force tables + + if (ncoultablebits) init_tables(cut_coul, cut_respa); + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + ljcls_gpu_init(atom->ntypes + 1, cutsq, lj1, lj2, lj3, lj4, offset, epsilon, force->special_lj, + atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, + screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); + GPU_EXTRA::check_flag(success, error, world); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulLongSoftGPU::reinit() +{ + Pair::reinit(); + + ljcls_gpu_reinit(atom->ntypes + 1, cutsq, lj1, lj2, lj3, lj4, offset, epsilon, cut_ljsq); +} + +/* ---------------------------------------------------------------------- */ + +double PairLJCutCoulLongSoftGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + ljcls_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulLongSoftGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist, + int *numneigh, int **firstneigh) +{ + int i, j, ii, jj, jnum, itype, jtype, itable; + double qtmp, xtmp, ytmp, ztmp, delx, dely, delz, evdwl, ecoul, fpair; + double r, r2inv, r6inv, forcecoul, forcelj, factor_coul, factor_lj; + double denc, denlj, r4sig6; + double grij, expm2, prefactor, t, erfc; + int *jlist; + double rsq; + + evdwl = ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + double qqrd2e = force->qqrd2e; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + factor_coul = special_coul[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx * delx + dely * dely + delz * delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + r2inv = 1.0 / rsq; + + if (rsq < cut_coulsq) { + r = sqrt(rsq); + grij = g_ewald * r; + expm2 = exp(-grij * grij); + t = 1.0 / (1.0 + EWALD_P * grij); + erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; + + denc = sqrt(lj4[itype][jtype] + rsq); + prefactor = qqrd2e * lj1[itype][jtype] * qtmp*q[j] / (denc*denc*denc); + + forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + if (factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor; + } else + forcecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + r4sig6 = rsq*rsq / lj2[itype][jtype]; + denlj = lj3[itype][jtype] + rsq*r4sig6; + forcelj = lj1[itype][jtype] * epsilon[itype][jtype] * + (48.0*r4sig6/(denlj*denlj*denlj) - 24.0*r4sig6/(denlj*denlj)); + } else + forcelj = 0.0; + + fpair = (forcecoul + factor_lj * forcelj) * r2inv; + + f[i][0] += delx * fpair; + f[i][1] += dely * fpair; + f[i][2] += delz * fpair; + + if (eflag) { + if (rsq < cut_coulsq) { + prefactor = qqrd2e * lj1[itype][jtype] * qtmp*q[j] / denc; + ecoul = prefactor*erfc; + } else + ecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + evdwl = lj1[itype][jtype] * 4.0 * epsilon[itype][jtype] * + (1.0/(denlj*denlj) - 1.0/denlj) - offset[itype][jtype]; + evdwl *= factor_lj; + } else + evdwl = 0.0; + } + + if (evflag) ev_tally_full(i, evdwl, ecoul, fpair, delx, dely, delz); + } + } + } +} diff --git a/src/GPU/pair_lj_cut_coul_long_soft_gpu.h b/src/GPU/pair_lj_cut_coul_long_soft_gpu.h new file mode 100644 index 00000000000..d4ff0e375fd --- /dev/null +++ b/src/GPU/pair_lj_cut_coul_long_soft_gpu.h @@ -0,0 +1,46 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(lj/cut/coul/long/soft/gpu,PairLJCutCoulLongSoftGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_SOFT_GPU_H +#define LMP_PAIR_LJ_CUT_COUL_LONG_SOFT_GPU_H + +#include "pair_lj_cut_coul_long_soft.h" + +namespace LAMMPS_NS { + +class PairLJCutCoulLongSoftGPU : public PairLJCutCoulLongSoft { + public: + PairLJCutCoulLongSoftGPU(LAMMPS *lmp); + ~PairLJCutCoulLongSoftGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + void reinit() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif From 086cf49a8c64130aad9879a834a683c9d6e5f220 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 5 Dec 2023 13:32:33 -0600 Subject: [PATCH 02/30] Added the GPU version of coul/slater/long --- lib/gpu/lal_coul_slater_long.cpp | 150 +++++++++++++++ lib/gpu/lal_coul_slater_long.cu | 237 ++++++++++++++++++++++++ lib/gpu/lal_coul_slater_long.h | 82 +++++++++ lib/gpu/lal_coul_slater_long_ext.cpp | 145 +++++++++++++++ src/GPU/pair_coul_slater_long_gpu.cpp | 255 ++++++++++++++++++++++++++ src/GPU/pair_coul_slater_long_gpu.h | 46 +++++ 6 files changed, 915 insertions(+) create mode 100644 lib/gpu/lal_coul_slater_long.cpp create mode 100644 lib/gpu/lal_coul_slater_long.cu create mode 100644 lib/gpu/lal_coul_slater_long.h create mode 100644 lib/gpu/lal_coul_slater_long_ext.cpp create mode 100644 src/GPU/pair_coul_slater_long_gpu.cpp create mode 100644 src/GPU/pair_coul_slater_long_gpu.h diff --git a/lib/gpu/lal_coul_slater_long.cpp b/lib/gpu/lal_coul_slater_long.cpp new file mode 100644 index 00000000000..98b73ab7535 --- /dev/null +++ b/lib/gpu/lal_coul_slater_long.cpp @@ -0,0 +1,150 @@ +/*************************************************************************** + coul_slater_long_ext.cpp + ------------------- + Trung Nguyen (U Chicago) + + Class for acceleration of the coul/slater/long pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "coul_slater_long_cl.h" +#elif defined(USE_CUDART) +const char *coul_slater_long=0; +#else +#include "coul_slater_long_cubin.h" +#endif + +#include "lal_coul_slater_long.h" +#include +namespace LAMMPS_AL { +#define CoulSlaterLongT CoulSlaterLong + +extern Device pair_gpu_device; + +template +CoulSlaterLongT::CoulSlaterLong() : BaseCharge(), _allocated(false) { +} + +template +CoulSlaterLongT::~CoulSlaterLong() { + clear(); +} + +template +int CoulSlaterLongT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int CoulSlaterLongT::init(const int ntypes, double **host_scale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald, double lamda) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,coul_slater_long,"k_coul_slater_long"); + if (success!=0) + return success; + + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale); + + sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_coul[i]; + } + ucl_copy(sp_cl,host_write,4,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + _lamda=lamda; + + _allocated=true; + this->_max_bytes=scale.row_bytes()+sp_cl.row_bytes(); + return 0; +} + +template +void CoulSlaterLongT::reinit(const int ntypes, double **host_scale) { + UCL_H_Vec hscale(_lj_types*_lj_types,*(this->ucl_device), + UCL_WRITE_ONLY); + this->atom->type_pack1(ntypes,_lj_types,scale,hscale,host_scale); +} + +template +void CoulSlaterLongT::clear() { + if (!_allocated) + return; + _allocated=false; + + scale.clear(); + sp_cl.clear(); + this->clear_atomic(); +} + +template +double CoulSlaterLongT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CoulSlaterLong); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int CoulSlaterLongT::loop(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, + &_lamda, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, + &_qqrd2e, &_g_ewald, &_lamda, &this->_threads_per_atom); + } + this->time_pair.stop(); + return GX; +} + +template class CoulSlaterLong; +} diff --git a/lib/gpu/lal_coul_slater_long.cu b/lib/gpu/lal_coul_slater_long.cu new file mode 100644 index 00000000000..f8902c59d74 --- /dev/null +++ b/lib/gpu/lal_coul_slater_long.cu @@ -0,0 +1,237 @@ +// ************************************************************************** +// coul_slater_long.cu +// ------------------- +// Trung Nguyen (U Chicago) +// +// Device code for acceleration of the coul/slater/long pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : September 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_coul_slater_long(const __global numtyp4 *restrict x_, + const __global numtyp *restrict scale, + const int lj_types, + const __global numtyp *restrict sp_cl_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const numtyp lamda, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + + sp_cl[0]=sp_cl_in[0]; + sp_cl[1]=sp_cl_in[1]; + sp_cl[2]=sp_cl_in[2]; + sp_cl[3]=sp_cl_in[3]; + + acctyp3 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp e_coul, virial[6]; + if (EVFLAG) { + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii +class CoulSlaterLong : public BaseCharge { + public: + CoulSlaterLong(); + ~CoulSlaterLong(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **scale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald, const double lamda); + + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **scale); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// scale + UCL_D_Vec scale; + /// Special Coul values [0-3] + UCL_D_Vec sp_cl; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e, _g_ewald, _lamda; + + protected: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_coul_slater_long_ext.cpp b/lib/gpu/lal_coul_slater_long_ext.cpp new file mode 100644 index 00000000000..fb5d9725c91 --- /dev/null +++ b/lib/gpu/lal_coul_slater_long_ext.cpp @@ -0,0 +1,145 @@ +/*************************************************************************** + coul_slater_long_ext.cpp + ------------------- + Trung Nguyen (U Chicago) + + Functions for LAMMPS access to coul/slater/long acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_coul_slater_long.h" + +using namespace std; +using namespace LAMMPS_AL; + +static CoulSlaterLong CSLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int csl_gpu_init(const int ntypes, double **host_scale, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald, const double lamda) { + CSLMF.clear(); + gpu_mode=CSLMF.device->gpu_mode(); + double gpu_split=CSLMF.device->particle_split(); + int first_gpu=CSLMF.device->first_device(); + int last_gpu=CSLMF.device->last_device(); + int world_me=CSLMF.device->world_me(); + int gpu_rank=CSLMF.device->gpu_rank(); + int procs_per_gpu=CSLMF.device->procs_per_gpu(); + + CSLMF.device->init_message(screen,"coul/slater/long",first_gpu,last_gpu); + + bool message=false; + if (CSLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=CSLMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen, host_cut_coulsq, + host_special_coul, qqrd2e, g_ewald, lamda); + + CSLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + CSLMF.estimate_gpu_overhead(); + return init_ok; +} + +// --------------------------------------------------------------------------- +// Copy updated coeffs from host to device +// --------------------------------------------------------------------------- +void csl_gpu_reinit(const int ntypes, double **host_scale) { + int world_me=CSLMF.device->world_me(); + int gpu_rank=CSLMF.device->gpu_rank(); + int procs_per_gpu=CSLMF.device->procs_per_gpu(); + + if (world_me==0) + CSLMF.reinit(ntypes, host_scale); + + CSLMF.device->world_barrier(); + + for (int i=0; iserialize_init(); + } +} + +void csl_gpu_clear() { + CSLMF.clear(); +} + +int** csl_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CSLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void csl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + CSLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double csl_gpu_bytes() { + return CSLMF.host_memory_usage(); +} + + diff --git a/src/GPU/pair_coul_slater_long_gpu.cpp b/src/GPU/pair_coul_slater_long_gpu.cpp new file mode 100644 index 00000000000..d812ebe2004 --- /dev/null +++ b/src/GPU/pair_coul_slater_long_gpu.cpp @@ -0,0 +1,255 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_coul_slater_long_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "kspace.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" + +#include + +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int csl_gpu_init(const int ntypes, double **scale, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald, const double lamda); +void csl_gpu_reinit(const int ntypes, double **scale); +void csl_gpu_clear(); +int **csl_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, double *boxlo, + double *prd); +void csl_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, double *boxlo, double *prd); +double csl_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairCoulSlaterLongGPU::PairCoulSlaterLongGPU(LAMMPS *lmp) : PairCoulSlaterLong(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairCoulSlaterLongGPU::~PairCoulSlaterLongGPU() +{ + csl_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairCoulSlaterLongGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = csl_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, + atom->tag, atom->nspecial, atom->special, eflag, vflag, + eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + csl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); + if (host_start < inum) { + cpu_time = platform::walltime(); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); + cpu_time = platform::walltime() - cpu_time; + } +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairCoulSlaterLongGPU::init_style() +{ + if (!atom->q_flag) error->all(FLERR, "Pair style coul/slater/long/gpu requires atom attribute q"); + + // Call init_one calculation make sure scale is correct + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { init_one(i, j); } + } + } + double cell_size = cut_coul + neighbor->skin; + + cut_coulsq = cut_coul * cut_coul; + + // ensure use of KSpace long-range solver, set g_ewald + + if (force->kspace == nullptr) error->all(FLERR, "Pair style requires a KSpace style"); + g_ewald = force->kspace->g_ewald; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = csl_gpu_init(atom->ntypes + 1, scale, atom->nlocal, atom->nlocal + atom->nghost, mnf, + maxspecial, cell_size, gpu_mode, screen, cut_coulsq, + force->special_coul, force->qqrd2e, g_ewald, lamda); + + GPU_EXTRA::check_flag(success, error, world); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +void PairCoulSlaterLongGPU::reinit() +{ + Pair::reinit(); + + csl_gpu_reinit(atom->ntypes + 1, scale); +} + +/* ---------------------------------------------------------------------- */ + +double PairCoulSlaterLongGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + csl_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairCoulSlaterLongGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist, + int *numneigh, int **firstneigh) +{ + int i, j, ii, jj, jnum, itable; + double qtmp, xtmp, ytmp, ztmp, delx, dely, delz, ecoul, fpair; + double fraction, table; + double r, r2inv, forcecoul, factor_coul; + double grij, expm2, prefactor, t, erfc; + int *jlist; + double rsq; + + ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + double *special_coul = force->special_coul; + double qqrd2e = force->qqrd2e; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_coul = special_coul[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx * delx + dely * dely + delz * delz; + + r2inv = 1.0 / rsq; + + if (rsq < cut_coulsq) { + r2inv = 1.0/rsq; + r = sqrt(rsq); + grij = g_ewald * r; + expm2 = exp(-grij*grij); + t = 1.0 / (1.0 + EWALD_P*grij); + erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + double slater_term = exp(-2*r/lamda)*(1 + (2*r/lamda*(1+r/lamda))); + prefactor = qqrd2e * qtmp*q[j]/r; + forcecoul = prefactor * (erfc + EWALD_F*grij*expm2 - slater_term); + if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor*(1-slater_term); + + fpair = forcecoul * r2inv; + + f[i][0] += delx * fpair; + f[i][1] += dely * fpair; + f[i][2] += delz * fpair; + + if (eflag) { + if (rsq < cut_coulsq) { + ecoul = prefactor*(erfc - (1 + r/lamda)*exp(-2*r/lamda)); + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor*(1.0-(1 + r/lamda)*exp(-2*r/lamda)); + } else + ecoul = 0.0; + } + + if (evflag) ev_tally_full(i, 0.0, ecoul, fpair, delx, dely, delz); + } + } + } +} diff --git a/src/GPU/pair_coul_slater_long_gpu.h b/src/GPU/pair_coul_slater_long_gpu.h new file mode 100644 index 00000000000..4a30a71d25a --- /dev/null +++ b/src/GPU/pair_coul_slater_long_gpu.h @@ -0,0 +1,46 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(coul/slater/long/gpu,PairCoulSlaterLongGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_COUL_SLATER_LONG_GPU_H +#define LMP_PAIR_COUL_SLATER_LONG_GPU_H + +#include "pair_coul_slater_long.h" + +namespace LAMMPS_NS { + +class PairCoulSlaterLongGPU : public PairCoulSlaterLong { + public: + PairCoulSlaterLongGPU(LAMMPS *lmp); + ~PairCoulSlaterLongGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + void reinit() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif From fe96d9f83618a2763147641ccd7b902f317bd30f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 5 Dec 2023 13:34:37 -0600 Subject: [PATCH 03/30] Added the GPU version of pair edpd and mdpd --- lib/gpu/lal_edpd.cpp | 295 ++++++++++++++++++ lib/gpu/lal_edpd.cu | 619 ++++++++++++++++++++++++++++++++++++++ lib/gpu/lal_edpd.h | 106 +++++++ lib/gpu/lal_edpd_ext.cpp | 142 +++++++++ lib/gpu/lal_mdpd.cpp | 218 ++++++++++++++ lib/gpu/lal_mdpd.cu | 475 +++++++++++++++++++++++++++++ lib/gpu/lal_mdpd.h | 88 ++++++ lib/gpu/lal_mdpd_ext.cpp | 133 ++++++++ src/GPU/pair_edpd_gpu.cpp | 195 ++++++++++++ src/GPU/pair_edpd_gpu.h | 48 +++ src/GPU/pair_mdpd_gpu.cpp | 171 +++++++++++ src/GPU/pair_mdpd_gpu.h | 45 +++ 12 files changed, 2535 insertions(+) create mode 100644 lib/gpu/lal_edpd.cpp create mode 100644 lib/gpu/lal_edpd.cu create mode 100644 lib/gpu/lal_edpd.h create mode 100644 lib/gpu/lal_edpd_ext.cpp create mode 100644 lib/gpu/lal_mdpd.cpp create mode 100644 lib/gpu/lal_mdpd.cu create mode 100644 lib/gpu/lal_mdpd.h create mode 100644 lib/gpu/lal_mdpd_ext.cpp create mode 100644 src/GPU/pair_edpd_gpu.cpp create mode 100644 src/GPU/pair_edpd_gpu.h create mode 100644 src/GPU/pair_mdpd_gpu.cpp create mode 100644 src/GPU/pair_mdpd_gpu.h diff --git a/lib/gpu/lal_edpd.cpp b/lib/gpu/lal_edpd.cpp new file mode 100644 index 00000000000..2a32d01abb7 --- /dev/null +++ b/lib/gpu/lal_edpd.cpp @@ -0,0 +1,295 @@ +/*************************************************************************** + edpd.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the edpd pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "edpd_cl.h" +#elif defined(USE_CUDART) +const char *edpd=0; +#else +#include "edpd_cubin.h" +#endif + +#include "lal_edpd.h" +#include +namespace LAMMPS_AL { +#define EDPDT EDPD + +extern Device device; + +template +EDPDT::EDPD() : BaseDPD(), _allocated(false) { + _max_q_size = 0; +} + +template +EDPDT::~EDPD() { + clear(); +} + +template +int EDPDT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int EDPDT::init(const int ntypes, + double **host_cutsq, double **host_a0, + double **host_gamma, double **host_cut, + double **host_power, double **host_kappa, + double **host_powerT, double **host_cutT, + double ***host_sc, double ***host_kc, double *host_mass, + double *host_special_lj, + const int power_flag, const int kappa_flag, + const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + int extra_fields = 4; // round up to accomodate quadruples of numtyp values + // T and cv + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,edpd,"k_edpd",onetype,extra_fields); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a0,host_gamma, + host_cut); + + coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_power,host_kappa, + host_powerT,host_cutT); + + UCL_H_Vec dview_mass(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < ntypes; i++) + dview_mass[i] = host_mass[i]; + mass.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(mass,dview_mass,false); + + if (host_sc) { + UCL_H_Vec dview(lj_types*lj_types,*(this->ucl_device),UCL_WRITE_ONLY);; + sc.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + int n = 0; + for (int i = 1; i < ntypes; i++) + for (int j = 1; j < ntypes; j++) { + dview[n].x = host_sc[i][j][0]; + dview[n].y = host_sc[i][j][1]; + dview[n].z = host_sc[i][j][2]; + dview[n].w = host_sc[i][j][3]; + n++; + } + ucl_copy(sc,dview,false); + } + + if (host_kc) { + UCL_H_Vec dview(lj_types*lj_types,*(this->ucl_device),UCL_WRITE_ONLY);; + kc.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + int n = 0; + for (int i = 1; i < ntypes; i++) + for (int j = 1; j < ntypes; j++) { + dview[n].x = host_kc[i][j][0]; + dview[n].y = host_kc[i][j][1]; + dview[n].z = host_kc[i][j][2]; + dview[n].w = host_kc[i][j][3]; + n++; + } + ucl_copy(kc,dview,false); + } + + UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), + UCL_WRITE_ONLY); + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq); + + double special_sqrt[4]; + special_sqrt[0] = sqrt(host_special_lj[0]); + special_sqrt[1] = sqrt(host_special_lj[1]); + special_sqrt[2] = sqrt(host_special_lj[2]); + special_sqrt[3] = sqrt(host_special_lj[3]); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + sp_sqrt.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(special_sqrt,4,*(this->ucl_device)); + ucl_copy(sp_sqrt,dview,false); + + _power_flag = power_flag; + _kappa_flag = kappa_flag; + + // allocate per-atom array Q + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_q_size=static_cast(static_cast(ef_nall)*1.10); + Q.alloc(_max_q_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+coeff2.row_bytes()+Q.row_bytes()+ + sc.row_bytes()+kc.row_bytes()+mass.row_bytes()+cutsq.row_bytes()+sp_lj.row_bytes()+sp_sqrt.row_bytes(); + return 0; +} + +template +void EDPDT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + coeff2.clear(); + sc.clear(); + kc.clear(); + Q.clear(); + mass.clear(); + cutsq.clear(); + sp_lj.clear(); + sp_sqrt.clear(); + this->clear_atomic(); +} + +template +double EDPDT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(EDPD); +} + +template +void EDPDT::update_flux(void **flux_ptr) { + *flux_ptr=Q.host.begin(); + Q.update_host(_max_q_size,false); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int EDPDT::loop(const int eflag, const int vflag) { + + int nall = this->atom->nall(); + + // Resize Q array if necessary + if (nall > _max_q_size) { + _max_q_size=static_cast(static_cast(nall)*1.10); + Q.resize(_max_q_size); + } + + // signal that we need to transfer extra data from the host + + this->atom->extra_data_unavail(); + + numtyp4 *pextra=reinterpret_cast(&(this->atom->extra[0])); + + int n = 0; + int nstride = 1; + for (int i = 0; i < nall; i++) { + int idx = n+i*nstride; + numtyp4 v; + v.x = edpd_temp[i]; + v.y = edpd_cv[i]; + v.z = 0; + v.w = 0; + pextra[idx] = v; + } + this->atom->add_extra_data(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, &mass, + &sc, &kc, &sp_lj, &sp_sqrt, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &Q, &eflag, &vflag, + &_power_flag, &_kappa_flag, &ainum, &nbor_pitch, + &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, + &this->_timestep, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, &mass, + &sc, &kc, &_lj_types, &sp_lj, &sp_sqrt, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &Q, &eflag, &vflag, + &_power_flag, &_kappa_flag, &ainum, &nbor_pitch, + &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, + &this->_timestep, &this->_threads_per_atom); + } + + this->time_pair.stop(); + return GX; +} + +template +void EDPDT::update_coeff(int ntypes, double **host_a0, double **host_gamma, + double **host_sigma, double **host_cut) +{ + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_a0,host_gamma, + host_sigma,host_cut); +} + +// --------------------------------------------------------------------------- +// Get the extra data pointers from host +// --------------------------------------------------------------------------- + +template +void EDPDT::get_extra_data(double *host_T, double *host_cv) { + edpd_temp = host_T; + edpd_cv = host_cv; +} + +template class EDPD; +} diff --git a/lib/gpu/lal_edpd.cu b/lib/gpu/lal_edpd.cu new file mode 100644 index 00000000000..9662d15aea3 --- /dev/null +++ b/lib/gpu/lal_edpd.cu @@ -0,0 +1,619 @@ +// ************************************************************************** +// edpd.cu +// ------------------- +// Trung Dac Nguyen (U Chicago) +// +// Device code for acceleration of the edpd pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : September 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( vel_tex,float4); +#else +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); +#endif +#else +#define pos_tex x_ +#define vel_tex v_ +#endif + +#define EPSILON (numtyp)1.0e-10 + +//#define _USE_UNIFORM_SARU_LCG +//#define _USE_UNIFORM_SARU_TEA8 +//#define _USE_GAUSSIAN_SARU_LCG + +#if !defined(_USE_UNIFORM_SARU_LCG) && !defined(_USE_UNIFORM_SARU_TEA8) && !defined(_USE_GAUSSIAN_SARU_LCG) +#define _USE_UNIFORM_SARU_LCG +#endif + +// References: +// 1. Y. Afshar, F. Schmid, A. Pishevar, S. Worley, Comput. Phys. Comm. 184 (2013), 1119–1128. +// 2. C. L. Phillips, J. A. Anderson, S. C. Glotzer, Comput. Phys. Comm. 230 (2011), 7191-7201. +// PRNG period = 3666320093*2^32 ~ 2^64 ~ 10^19 + +#define LCGA 0x4beb5d59 /* Full period 32 bit LCG */ +#define LCGC 0x2600e1f7 +#define oWeylPeriod 0xda879add /* Prime period 3666320093 */ +#define oWeylOffset 0x8009d14b +#define TWO_N32 0.232830643653869628906250e-9f /* 2^-32 */ + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] +// using the inherent LCG, then multiply u with sqrt(3) to "match" +// with a normal random distribution. +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +// Curly brackets to make variables local to the scope. +#ifdef _USE_UNIFORM_SARU_LCG +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 +// then multiply u with sqrt(3) to "match" with a normal random distribution +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +#ifdef _USE_UNIFORM_SARU_TEA8 +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define k0 0xA341316C +#define k1 0xC8013EA4 +#define k2 0xAD90777D +#define k3 0x7E95761E +#define delta 0x9e3779b9 +#define rounds 8 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int sum = 0; \ + for (int i=0; i < rounds; i++) { \ + sum += delta; \ + state += ((wstate<<4) + k0)^(wstate + sum)^((wstate>>5) + k1); \ + wstate += ((state<<4) + k2)^(state + sum)^((state>>5) + k3); \ + } \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], +// and uses the polar method (Marsaglia's) to transform to a normal random value +// This is used to compared with CPU DPD using RandMars::gaussian() +#ifdef _USE_GAUSSIAN_SARU_LCG +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state=0x12345678; \ + unsigned int wstate=12345678; \ + state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int v, s; \ + numtyp r1, r2, rsq; \ + while (1) { \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r1 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r2 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + rsq = r1 * r1 + r2 * r2; \ + if (rsq < (numtyp)1.0) break; \ + } \ + numtyp fac = ucl_sqrt((numtyp)-2.0*log(rsq)/rsq); \ + randnum = r2*fac; \ +} +#endif + +#if (SHUFFLE_AVAIL == 0) + +#define store_heatflux(Qi, ii, inum, tid, t_per_atom, offset, Q) \ + if (t_per_atom>1) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, Qi); \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add1(t_per_atom,Qi); \ + } \ + if (offset==0 && ii tag2) { + tag1 = jtag; tag2 = itag; + } + + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + numtyp T_ij=(numtyp)0.5*(Ti+Tj); + numtyp4 T_pow; + T_pow.x = T_ij - (numtyp)1.0; + T_pow.y = T_pow.x*T_pow.x; + T_pow.z = T_pow.x*T_pow.y; + T_pow.w = T_pow.x*T_pow.z; + + numtyp coeff2x = coeff2[mtype].x; //power[itype][jtype] + numtyp coeff2y = coeff2[mtype].y; //kappa[itype][jtype] + numtyp coeff2z = coeff2[mtype].z; //powerT[itype][jtype] + numtyp coeff2w = coeff2[mtype].w; //cutT[itype][jtype] + numtyp power_d = coeff2x; + if (power_flag) { + numtyp factor = (numtyp)1.0; + factor += sc[mtype].x*T_pow.x + sc[mtype].y*T_pow.y + + sc[mtype].z*T_pow.z + sc[mtype].w*T_pow.w; + power_d *= factor; + } + + power_d = MAX((numtyp)0.01,power_d); + numtyp wc = (numtyp)1.0 - r/coeffz; // cut[itype][jtype] + wc = MAX((numtyp)0.0,MIN((numtyp)1.0,wc)); + numtyp wr = ucl_pow(wc, (numtyp)0.5*power_d); + + numtyp kboltz = (numtyp)1.0; + numtyp GammaIJ = coeffy; // gamma[itype][jtype] + numtyp SigmaIJ = (numtyp)4.0*GammaIJ*kboltz*Ti*Tj/(Ti+Tj); + SigmaIJ = ucl_sqrt(SigmaIJ); + + numtyp force = coeffx*T_ij*wc; // a0[itype][jtype] + force -= GammaIJ *wr*wr *dot*rinv; + force += SigmaIJ * wr *randnum * dtinvsqrt; + force *= factor_dpd*rinv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + // heat transfer + + if (r < coeff2w) { + numtyp wrT = (numtyp)1.0 - r/coeff2w; + wrT = MAX((numtyp)0.0,MIN((numtyp)1.0,wrT)); + wrT = ucl_pow(wrT, (numtyp)0.5*coeff2z); // powerT[itype][jtype] + numtyp randnumT = (numtyp)0; + saru(tag1, tag2, seed+tag1+tag2, timestep, randnumT); // randomT->gaussian(); + randnumT = MAX((numtyp)-5.0,MIN(randnum,(numtyp)5.0)); + + numtyp kappaT = coeff2y; // kappa[itype][jtype] + if (kappa_flag) { + numtyp factor = (numtyp)1.0; + factor += kc[mtype].x*T_pow.x + kc[mtype].y*T_pow.y + + kc[mtype].z*T_pow.z + kc[mtype].w*T_pow.w; + kappaT *= factor; + } + + numtyp kij = cvi*cvj*kappaT * T_ij*T_ij; + numtyp alphaij = ucl_sqrt((numtyp)2.0*kboltz*kij); + + numtyp dQc = kij * wrT*wrT * (Tj - Ti)/(Ti*Tj); + numtyp dQd = wr*wr*( GammaIJ * vijeij*vijeij - SigmaIJ*SigmaIJ/mass_itype ) - SigmaIJ * wr *vijeij *randnum; + dQd /= (cvi+cvj); + numtyp dQr = alphaij * wrT * dtinvsqrt * randnumT; + Qi += (dQc + dQd + dQr ); + } + + if (EVFLAG && eflag) { + numtyp e = (numtyp)0.5*coeffx*T_ij*coeffz * wc*wc; + energy+=factor_dpd*e; + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + } // for nbor + } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + store_heatflux(Qi,ii,inum,tid,t_per_atom,offset,Q); +} + +__kernel void k_edpd_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict mass, + const __global numtyp4 *restrict sc_in, + const __global numtyp4 *restrict kc_in, + const __global numtyp *restrict sp_lj_in, + const __global numtyp *restrict sp_sqrt_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + __global acctyp *restrict Q, + const int eflag, const int vflag, + const int power_flag, const int kappa_flag, + const int inum, const int nbor_pitch, + const __global numtyp4 *restrict v_, + const __global numtyp *restrict cutsq, + const numtyp dtinvsqrt, const int seed, + const int timestep, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + #ifndef ONETYPE + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 sc[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 kc[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + __local numtyp sp_sqrt[4]; + if (tid<4) { + sp_lj[tid]=sp_lj_in[tid]; + sp_sqrt[tid]=sp_sqrt_in[tid]; + } + if (tid tag2) { + tag1 = jtag; tag2 = itag; + } + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + numtyp T_ij=(numtyp)0.5*(Ti+Tj); + numtyp4 T_pow; + T_pow.x = T_ij - (numtyp)1.0; + T_pow.y = T_pow.x*T_pow.x; + T_pow.z = T_pow.x*T_pow.y; + T_pow.w = T_pow.x*T_pow.z; + + numtyp power_d = coeff2x; // power[itype][jtype] + if (power_flag) { + numtyp factor = (numtyp)1.0; + factor += scx*T_pow.x + scy*T_pow.y + scz*T_pow.z + scw*T_pow.w; + power_d *= factor; + } + + power_d = MAX((numtyp)0.01,power_d); + numtyp wc = (numtyp)1.0 - r/coeffz; // cut[itype][jtype] + wc = MAX((numtyp)0.0,MIN((numtyp)1.0,wc)); + numtyp wr = ucl_pow((numtyp)wc, (numtyp)0.5*power_d); + + numtyp kboltz = (numtyp)1.0; + numtyp GammaIJ = coeffy; // gamma[itype][jtype] + numtyp SigmaIJ = (numtyp)4.0*GammaIJ*kboltz*Ti*Tj/(Ti+Tj); + SigmaIJ = ucl_sqrt(SigmaIJ); + + numtyp force = coeffx*T_ij*wc; // a0[itype][jtype] + force -= GammaIJ *wr*wr *dot*rinv; + force += SigmaIJ* wr *randnum * dtinvsqrt; + #ifndef ONETYPE + force *= factor_dpd*rinv; + #else + force *= rinv; + #endif + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + // heat transfer + + if (r < coeff2w) { + numtyp wrT = (numtyp)1.0 - r/coeff2w; + wrT = MAX((numtyp)0.0,MIN((numtyp)1.0,wrT)); + wrT = ucl_pow(wrT, (numtyp)0.5*coeff2z); // powerT[itype][jtype] + numtyp randnumT = (numtyp)0; + saru(tag1, tag2, seed+tag1+tag2, timestep, randnumT); // randomT->gaussian(); + randnumT = MAX((numtyp)-5.0,MIN(randnum,(numtyp)5.0)); + + numtyp kappaT = coeff2y; // kappa[itype][jtype] + if (kappa_flag) { + numtyp factor = (numtyp)1.0; + factor += kcx*T_pow.x + kcy*T_pow.y + kcz*T_pow.z + kcw*T_pow.w; + kappaT *= factor; + } + + numtyp kij = cvi*cvj*kappaT * T_ij*T_ij; + numtyp alphaij = ucl_sqrt((numtyp)2.0*kboltz*kij); + + numtyp dQc = kij * wrT*wrT * (Tj - Ti )/(Ti*Tj); + numtyp dQd = wr*wr*( GammaIJ * vijeij*vijeij - SigmaIJ*SigmaIJ/mass_itype ) - SigmaIJ * wr *vijeij *randnum; + dQd /= (cvi+cvj); + numtyp dQr = alphaij * wrT * dtinvsqrt * randnumT; + Qi += (dQc + dQd + dQr ); + } + + if (EVFLAG && eflag) { + numtyp e = (numtyp)0.5*coeffx*T_ij*coeffz * wc*wc; + #ifndef ONETYPE + energy+=factor_dpd*e; + #else + energy+=e; + #endif + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + + } + } // for nbor + } // if ii + + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); + store_heatflux(Qi,ii,inum,tid,t_per_atom,offset,Q); +} + diff --git a/lib/gpu/lal_edpd.h b/lib/gpu/lal_edpd.h new file mode 100644 index 00000000000..b7935bf2f4e --- /dev/null +++ b/lib/gpu/lal_edpd.h @@ -0,0 +1,106 @@ +/*************************************************************************** + edpd.h + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the dpd pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#ifndef LAL_DPD_H +#define LAL_DPD_H + +#include "lal_base_dpd.h" + +namespace LAMMPS_AL { + +template +class EDPD : public BaseDPD { + public: + EDPD(); + ~EDPD(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_a0, + double **host_gamma, double **host_cut, double **host_power, + double **host_kappa, double **host_powerT, double **host_cutT, + double ***host_sc, double ***host_kc, double *host_mass, + double *host_special_lj, const int power_flag, const int kappa_flag, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, + FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + /// Update coeff if needed (tstat only) + void update_coeff(int ntypes, double **host_a0, double **host_gamma, + double **host_sigma, double **host_cut); + + void get_extra_data(double *host_T, double *host_cv); + + /// copy Q (flux) from device to host + void update_flux(void **flux_ptr); + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = a0, coeff.y = gamma, coeff.z = cut + UCL_D_Vec coeff; + /// coeff2.x = power, coeff2.y = kappa, coeff2.z = powerT, coeff2.w = cutT + UCL_D_Vec coeff2; + + UCL_D_Vec kc, sc; + UCL_D_Vec cutsq; + + /// per-type array + UCL_D_Vec mass; + + /// Special LJ values + UCL_D_Vec sp_lj, sp_sqrt; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// Per-atom arrays + UCL_Vector Q; + int _max_q_size; + + int _power_flag, _kappa_flag; + + /// pointer to host data + double *edpd_temp, *edpd_cv; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_edpd_ext.cpp b/lib/gpu/lal_edpd_ext.cpp new file mode 100644 index 00000000000..a9f60c39411 --- /dev/null +++ b/lib/gpu/lal_edpd_ext.cpp @@ -0,0 +1,142 @@ +/*************************************************************************** + edpd_ext.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Functions for LAMMPS access to edpd acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_edpd.h" + +using namespace std; +using namespace LAMMPS_AL; + +static EDPD EDPDMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int edpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, + double **host_gamma, double **host_cut, double **host_power, + double **host_kappa, double **host_powerT, double **host_cutT, + double ***host_sc, double ***host_kc, double *host_mass, + double *special_lj, const int power_flag, const int kappa_flag, + const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + EDPDMF.clear(); + gpu_mode=EDPDMF.device->gpu_mode(); + double gpu_split=EDPDMF.device->particle_split(); + int first_gpu=EDPDMF.device->first_device(); + int last_gpu=EDPDMF.device->last_device(); + int world_me=EDPDMF.device->world_me(); + int gpu_rank=EDPDMF.device->gpu_rank(); + int procs_per_gpu=EDPDMF.device->procs_per_gpu(); + + EDPDMF.device->init_message(screen,"edpd",first_gpu,last_gpu); + + bool message=false; + if (EDPDMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=EDPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_cut, + host_power, host_kappa, host_powerT, + host_cutT, host_sc, host_kc, host_mass, + special_lj, power_flag, kappa_flag, + inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen); + + EDPDMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + EDPDMF.estimate_gpu_overhead(); + return init_ok; +} + +void edpd_gpu_clear() { + EDPDMF.clear(); +} + +int ** edpd_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + double *boxlo, double *prd) { + return EDPDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_v, dtinvsqrt, seed, timestep, boxlo, prd); +} + +void edpd_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + const int nlocal, double *boxlo, double *prd) { + EDPDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); +} + +void edpd_gpu_get_extra_data(double *host_T, double *host_cv) { + EDPDMF.get_extra_data(host_T, host_cv); +} + +void edpd_gpu_update_flux(void **flux_ptr) { + EDPDMF.update_flux(flux_ptr); +} + +double edpd_gpu_bytes() { + return EDPDMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_mdpd.cpp b/lib/gpu/lal_mdpd.cpp new file mode 100644 index 00000000000..16cf926df86 --- /dev/null +++ b/lib/gpu/lal_mdpd.cpp @@ -0,0 +1,218 @@ +/*************************************************************************** + mdpd.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the mdpd pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "mdpd_cl.h" +#elif defined(USE_CUDART) +const char *mdpd=0; +#else +#include "mdpd_cubin.h" +#endif + +#include "lal_mdpd.h" +#include +namespace LAMMPS_AL { +#define MDPDT MDPD + +extern Device device; + +template +MDPDT::MDPD() : BaseDPD(), _allocated(false) { +} + +template +MDPDT::~MDPD() { + clear(); +} + +template +int MDPDT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int MDPDT::init(const int ntypes, + double **host_cutsq, double **host_A_att, double **host_B_rep, + double **host_gamma, double **host_sigma, + double **host_cut, double **host_cut_r, + double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + int extra_fields = 4; // round up to accomodate quadruples of numtyp values + // rho + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,mdpd,"k_mdpd",onetype,extra_fields); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_A_att,host_B_rep, + host_gamma,host_sigma); + + coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_cut,host_cut_r, + host_cutsq); + + UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), + UCL_WRITE_ONLY); + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq); + + double special_sqrt[4]; + special_sqrt[0] = sqrt(host_special_lj[0]); + special_sqrt[1] = sqrt(host_special_lj[1]); + special_sqrt[2] = sqrt(host_special_lj[2]); + special_sqrt[3] = sqrt(host_special_lj[3]); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + sp_sqrt.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(special_sqrt,4,*(this->ucl_device)); + ucl_copy(sp_sqrt,dview,false); + + // allocate per-atom array Q + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+coeff2.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes()+sp_sqrt.row_bytes(); + return 0; +} + +template +void MDPDT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + coeff2.clear(); + cutsq.clear(); + sp_lj.clear(); + sp_sqrt.clear(); + this->clear_atomic(); +} + +template +double MDPDT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(MDPD); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int MDPDT::loop(const int eflag, const int vflag) { + + int nall = this->atom->nall(); + + // signal that we need to transfer extra data from the host + + this->atom->extra_data_unavail(); + + numtyp4 *pextra=reinterpret_cast(&(this->atom->extra[0])); + + int n = 0; + int nstride = 1; + for (int i = 0; i < nall; i++) { + int idx = n+i*nstride; + numtyp4 v; + v.x = mdpd_rho[i]; + v.y = 0; + v.z = 0; + v.w = 0; + pextra[idx] = v; + } + this->atom->add_extra_data(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, + &sp_lj, &sp_sqrt, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, + &this->_timestep, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, + &_lj_types, &sp_lj, &sp_sqrt, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, + &this->_timestep, &this->_threads_per_atom); + } + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Get the extra data pointers from host +// --------------------------------------------------------------------------- + +template +void MDPDT::get_extra_data(double *host_rho) { + mdpd_rho = host_rho; +} + +template class MDPD; +} diff --git a/lib/gpu/lal_mdpd.cu b/lib/gpu/lal_mdpd.cu new file mode 100644 index 00000000000..6230cb24961 --- /dev/null +++ b/lib/gpu/lal_mdpd.cu @@ -0,0 +1,475 @@ +// ************************************************************************** +// mdpd.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the mdpd pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : December 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( vel_tex,float4); +#else +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); +#endif +#else +#define pos_tex x_ +#define vel_tex v_ +#endif + +#define EPSILON (numtyp)1.0e-10 + +//#define _USE_UNIFORM_SARU_LCG +//#define _USE_UNIFORM_SARU_TEA8 +//#define _USE_GAUSSIAN_SARU_LCG + +#if !defined(_USE_UNIFORM_SARU_LCG) && !defined(_USE_UNIFORM_SARU_TEA8) && !defined(_USE_GAUSSIAN_SARU_LCG) +#define _USE_UNIFORM_SARU_LCG +#endif + +// References: +// 1. Y. Afshar, F. Schmid, A. Pishevar, S. Worley, Comput. Phys. Comm. 184 (2013), 1119–1128. +// 2. C. L. Phillips, J. A. Anderson, S. C. Glotzer, Comput. Phys. Comm. 230 (2011), 7191-7201. +// PRNG period = 3666320093*2^32 ~ 2^64 ~ 10^19 + +#define LCGA 0x4beb5d59 /* Full period 32 bit LCG */ +#define LCGC 0x2600e1f7 +#define oWeylPeriod 0xda879add /* Prime period 3666320093 */ +#define oWeylOffset 0x8009d14b +#define TWO_N32 0.232830643653869628906250e-9f /* 2^-32 */ + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] +// using the inherent LCG, then multiply u with sqrt(3) to "match" +// with a normal random distribution. +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +// Curly brackets to make variables local to the scope. +#ifdef _USE_UNIFORM_SARU_LCG +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 +// then multiply u with sqrt(3) to "match" with a normal random distribution +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +#ifdef _USE_UNIFORM_SARU_TEA8 +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define k0 0xA341316C +#define k1 0xC8013EA4 +#define k2 0xAD90777D +#define k3 0x7E95761E +#define delta 0x9e3779b9 +#define rounds 8 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int sum = 0; \ + for (int i=0; i < rounds; i++) { \ + sum += delta; \ + state += ((wstate<<4) + k0)^(wstate + sum)^((wstate>>5) + k1); \ + wstate += ((state<<4) + k2)^(state + sum)^((state>>5) + k3); \ + } \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], +// and uses the polar method (Marsaglia's) to transform to a normal random value +// This is used to compared with CPU DPD using RandMars::gaussian() +#ifdef _USE_GAUSSIAN_SARU_LCG +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state=0x12345678; \ + unsigned int wstate=12345678; \ + state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int v, s; \ + numtyp r1, r2, rsq; \ + while (1) { \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r1 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r2 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + rsq = r1 * r1 + r2 * r2; \ + if (rsq < (numtyp)1.0) break; \ + } \ + numtyp fac = ucl_sqrt((numtyp)-2.0*log(rsq)/rsq); \ + randnum = r2*fac; \ +} +#endif + +#define MIN(A,B) ((A) < (B) ? (A) : (B)) +#define MAX(A,B) ((A) < (B) ? (B) : (A)) + +// coeff.x = A_att, coeff.y = B_rep, coeff.z = gamma, coeff.w = sigma +// coeff2.x = cut, coeff2.y = cut_r, coeff2.z = cutsq + +__kernel void k_mdpd(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global numtyp *restrict sp_sqrt, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp4 *restrict v_, + const __global numtyp *restrict cutsq, + const numtyp dtinvsqrt, const int seed, + const int timestep, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_pair(); + + acctyp3 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii tag2) { + tag1 = jtag; tag2 = itag; + } + + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + // conservative force = A_att * wc + B_rep*(rhoi+rhoj)*wc_r + // drag force = -gamma * wr^2 * (delx dot delv) / r + // random force = sigma * wr * rnd * dtinvsqrt; + + numtyp force = A_attij*wc + B_repij*(rhoi+rhoj)*wc_r; + force -= gammaij*wr*wr*dot*rinv; + force += sigmaij*wr*randnum*dtinvsqrt; + force *= factor_dpd*rinv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + // unshifted eng of conservative term: + // eng shifted to 0.0 at cutoff + numtyp e = (numtyp)0.5*A_attij*cutij * wr*wr + (numtyp)0.5*B_repij*cut_rij*(rhoi+rhoj)*wc_r*wc_r; + energy+=factor_dpd*e; + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); +} + +__kernel void k_mdpd_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global numtyp *restrict sp_sqrt_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp4 *restrict v_, + const __global numtyp *restrict cutsq, + const numtyp dtinvsqrt, const int seed, + const int timestep, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + #ifndef ONETYPE + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + __local numtyp sp_sqrt[4]; + if (tid<4) { + sp_lj[tid]=sp_lj_in[tid]; + sp_sqrt[tid]=sp_sqrt_in[tid]; + } + if (tid tag2) { + tag1 = jtag; tag2 = itag; + } + + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + // conservative force = A_att * wc + B_rep*(rhoi+rhoj)*wc_r + // drag force = -gamma * wr^2 * (delx dot delv) / r + // random force = sigma * wr * rnd * dtinvsqrt; + + numtyp force = A_attij*wc + B_repij*(rhoi+rhoj)*wc_r; + force -= gammaij*wr*wr*dot*rinv; + force += sigmaij*wr*randnum*dtinvsqrt; + #ifndef ONETYPE + force *= factor_dpd*rinv; + #else + force*=rinv; + #endif + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + // unshifted eng of conservative term: + // eng shifted to 0.0 at cutoff + numtyp e = (numtyp)0.5*A_attij*cutij * wr*wr + (numtyp)0.5*B_repij*cut_rij*(rhoi+rhoj)*wc_r*wc_r; + #ifndef ONETYPE + energy+=factor_dpd*e; + #else + energy+=e; + #endif + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); +} + diff --git a/lib/gpu/lal_mdpd.h b/lib/gpu/lal_mdpd.h new file mode 100644 index 00000000000..ca702dba73c --- /dev/null +++ b/lib/gpu/lal_mdpd.h @@ -0,0 +1,88 @@ +/*************************************************************************** + mdpd.h + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the mdpd pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#ifndef LAL_DPD_H +#define LAL_DPD_H + +#include "lal_base_dpd.h" + +namespace LAMMPS_AL { + +template +class MDPD : public BaseDPD { + public: + MDPD(); + ~MDPD(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_A_att, double **host_B_rep, + double **host_gamma, double **host_sigma, + double **host_cut, double **host_cut_r, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, + FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + void get_extra_data(double *host_rho); + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = A_att, coeff.x = B_rep, coeff.z = gamma, coeff.w = sigma + UCL_D_Vec coeff; + /// coeff2.x = cut, coeff2.y = cut_r, coeff2.z = cutsq + UCL_D_Vec coeff2; + + UCL_D_Vec cutsq; + + /// Special LJ values + UCL_D_Vec sp_lj, sp_sqrt; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// pointer to host data + double *mdpd_rho; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_mdpd_ext.cpp b/lib/gpu/lal_mdpd_ext.cpp new file mode 100644 index 00000000000..def6adb1f60 --- /dev/null +++ b/lib/gpu/lal_mdpd_ext.cpp @@ -0,0 +1,133 @@ +/*************************************************************************** + mdpd_ext.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Functions for LAMMPS access to mdpd acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_mdpd.h" + +using namespace std; +using namespace LAMMPS_AL; + +static MDPD MDPDMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int mdpd_gpu_init(const int ntypes, double **cutsq, + double **host_A_att, double **host_B_rep, + double **host_gamma, double **host_sigma, + double **host_cut, double **host_cut_r, + double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + MDPDMF.clear(); + gpu_mode=MDPDMF.device->gpu_mode(); + double gpu_split=MDPDMF.device->particle_split(); + int first_gpu=MDPDMF.device->first_device(); + int last_gpu=MDPDMF.device->last_device(); + int world_me=MDPDMF.device->world_me(); + int gpu_rank=MDPDMF.device->gpu_rank(); + int procs_per_gpu=MDPDMF.device->procs_per_gpu(); + + MDPDMF.device->init_message(screen,"mdpd",first_gpu,last_gpu); + + bool message=false; + if (MDPDMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=MDPDMF.init(ntypes, cutsq, host_A_att, host_B_rep, host_gamma, host_sigma, + host_cut, host_cut_r, special_lj, inum, nall, max_nbors, + maxspecial, cell_size, gpu_split, screen); + + MDPDMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + MDPDMF.estimate_gpu_overhead(); + return init_ok; +} + +void mdpd_gpu_clear() { + MDPDMF.clear(); +} + +int ** mdpd_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + double *boxlo, double *prd) { + return MDPDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_v, dtinvsqrt, seed, timestep, boxlo, prd); +} + +void mdpd_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + const int nlocal, double *boxlo, double *prd) { + MDPDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); +} + +void mdpd_gpu_get_extra_data(double *host_rho) { + MDPDMF.get_extra_data(host_rho); +} + +double mdpd_gpu_bytes() { + return MDPDMF.host_memory_usage(); +} + + diff --git a/src/GPU/pair_edpd_gpu.cpp b/src/GPU/pair_edpd_gpu.cpp new file mode 100644 index 00000000000..5bee0cadb81 --- /dev/null +++ b/src/GPU/pair_edpd_gpu.cpp @@ -0,0 +1,195 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_edpd_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" +#include "update.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int edpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, double **host_gamma, + double **host_cut, double **host_power, double **host_kappa, + double **host_powerT, double** host_cutT, double*** host_sc, double ***host_kc, + double *host_mass, double *special_lj, const int power_flag, const int kappa_flag, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); +void edpd_gpu_clear(); +int **edpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double **host_v, + const double dtinvsqrt, const int seed, const int timestep, double *boxlo, + double *prd); +void edpd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, double **host_v, + const double dtinvsqrt, const int seed, const int timestep, const int nlocal, + double *boxlo, double *prd); +void edpd_gpu_get_extra_data(double *host_T, double *host_cv); +void edpd_gpu_update_flux(void **flux_ptr); +double edpd_gpu_bytes(); + +#define EPSILON 1.0e-10 + +/* ---------------------------------------------------------------------- */ + +PairEDPDGPU::PairEDPDGPU(LAMMPS *lmp) : PairEDPD(lmp), gpu_mode(GPU_FORCE) +{ + flux_pinned = nullptr; + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairEDPDGPU::~PairEDPDGPU() +{ + edpd_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairEDPDGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + double dtinvsqrt = 1.0 / sqrt(update->dt); + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double *T = atom->edpd_temp; + double *cv = atom->edpd_cv; + edpd_gpu_get_extra_data(T, cv); + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = edpd_gpu_compute_n( + neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v, dtinvsqrt, seed, update->ntimestep, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + edpd_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->tag, + atom->v, dtinvsqrt, seed, update->ntimestep, atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + // get the heat flux from device + + double *Q = atom->edpd_flux; + edpd_gpu_update_flux(&flux_pinned); + + int nlocal = atom->nlocal; + if (acc_float) { + auto flux_ptr = (float *)flux_pinned; + for (int i = 0; i < nlocal; i++) + Q[i] = flux_ptr[i]; + + } else { + auto flux_ptr = (double *)flux_pinned; + for (int i = 0; i < nlocal; i++) + Q[i] = flux_ptr[i]; + } + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairEDPDGPU::init_style() +{ + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i, j); + mcut *= mcut; + if (mcut > maxcut) maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + edpd_gpu_init(atom->ntypes + 1, cutsq, a0, gamma, cut, power, kappa, + powerT, cutT, sc, kc, atom->mass, force->special_lj, + power_flag, kappa_flag, atom->nlocal, atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairEDPDGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + edpd_gpu_bytes(); +} diff --git a/src/GPU/pair_edpd_gpu.h b/src/GPU/pair_edpd_gpu.h new file mode 100644 index 00000000000..75495b2ca4a --- /dev/null +++ b/src/GPU/pair_edpd_gpu.h @@ -0,0 +1,48 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(edpd/gpu,PairEDPDGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_EDPD_GPU_H +#define LMP_PAIR_EDPD_GPU_H + +#include "pair_edpd.h" + +namespace LAMMPS_NS { + +class PairEDPDGPU : public PairEDPD { + public: + PairEDPDGPU(LAMMPS *lmp); + ~PairEDPDGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void *flux_pinned; + bool acc_float; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_mdpd_gpu.cpp b/src/GPU/pair_mdpd_gpu.cpp new file mode 100644 index 00000000000..bebe1e97360 --- /dev/null +++ b/src/GPU/pair_mdpd_gpu.cpp @@ -0,0 +1,171 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_mdpd_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" +#include "update.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int mdpd_gpu_init(const int ntypes, double **cutsq, double **host_A_att, double **host_B_rep, + double **host_gamma, double **host_sigma, double **host_cut, double **host_cut_r, + double *special_lj, const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); +void mdpd_gpu_clear(); +int **mdpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double **host_v, + const double dtinvsqrt, const int seed, const int timestep, double *boxlo, + double *prd); +void mdpd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, double **host_v, + const double dtinvsqrt, const int seed, const int timestep, const int nlocal, + double *boxlo, double *prd); +void mdpd_gpu_get_extra_data(double *host_rho); +double mdpd_gpu_bytes(); + +#define EPSILON 1.0e-10 + +/* ---------------------------------------------------------------------- */ + +PairMDPDGPU::PairMDPDGPU(LAMMPS *lmp) : PairMDPD(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairMDPDGPU::~PairMDPDGPU() +{ + mdpd_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairMDPDGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + double dtinvsqrt = 1.0 / sqrt(update->dt); + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double *rho = atom->rho; + mdpd_gpu_get_extra_data(rho); + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = mdpd_gpu_compute_n( + neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v, dtinvsqrt, seed, update->ntimestep, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + mdpd_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->tag, + atom->v, dtinvsqrt, seed, update->ntimestep, atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairMDPDGPU::init_style() +{ + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i, j); + mcut *= mcut; + if (mcut > maxcut) maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + mdpd_gpu_init(atom->ntypes + 1, cutsq, A_att, B_rep, gamma, sigma, + cut, cut_r, force->special_lj, + atom->nlocal, atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairMDPDGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + mdpd_gpu_bytes(); +} diff --git a/src/GPU/pair_mdpd_gpu.h b/src/GPU/pair_mdpd_gpu.h new file mode 100644 index 00000000000..5f27c4014e8 --- /dev/null +++ b/src/GPU/pair_mdpd_gpu.h @@ -0,0 +1,45 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(mdpd/gpu,PairMDPDGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_MDPD_GPU_H +#define LMP_PAIR_MDPD_GPU_H + +#include "pair_mdpd.h" + +namespace LAMMPS_NS { + +class PairMDPDGPU : public PairMDPD { + public: + PairMDPDGPU(LAMMPS *lmp); + ~PairMDPDGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif From e53b3c76f5982f7f4f27f175fc3937081d38063e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 5 Dec 2023 15:32:57 -0600 Subject: [PATCH 04/30] Updated doc pages for the added pair styles, added a cmake preset for gpu-cuda, added "comm_modify vel yes" to in.mdpd --- cmake/presets/gpu-cuda.cmake | 12 ++++++++++++ doc/src/pair_coul_slater.rst | 3 +++ doc/src/pair_fep_soft.rst | 6 ++++-- doc/src/pair_mesodpd.rst | 6 ++++++ examples/PACKAGES/dpd-meso/mdpd/in.mdpd | 1 + 5 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 cmake/presets/gpu-cuda.cmake diff --git a/cmake/presets/gpu-cuda.cmake b/cmake/presets/gpu-cuda.cmake new file mode 100644 index 00000000000..c5e967754a5 --- /dev/null +++ b/cmake/presets/gpu-cuda.cmake @@ -0,0 +1,12 @@ +# preset that enables GPU and selects CUDA API + +set(PKG_GPU ON CACHE BOOL "Build GPU package" FORCE) +set(GPU_API "cuda" CACHE STRING "APU used by GPU package" FORCE) +set(GPU_PREC "mixed" CACHE STRING "" FORCE) +set(GPU_ARCH "sm_60;sm_70;sm_80" CACHE STRING "LAMMPS GPU CUDA SM architectures" FORCE) + +set(CUDA_NVCC_FLAGS "-allow-unsupported-compiler" CACHE STRING "" FORCE) +set(CUDA_NVCC_FLAGS_DEBUG "-allow-unsupported-compiler" CACHE STRING "" FORCE) +set(CUDA_NVCC_FLAGS_MINSIZEREL "-allow-unsupported-compiler" CACHE STRING "" FORCE) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO "-allow-unsupported-compiler" CACHE STRING "" FORCE) +set(CUDA_NVCC_FLAGS_RELEASE "-allow-unsupported-compiler" CACHE STRING "" FORCE) diff --git a/doc/src/pair_coul_slater.rst b/doc/src/pair_coul_slater.rst index 443de4262b8..074f10ba834 100644 --- a/doc/src/pair_coul_slater.rst +++ b/doc/src/pair_coul_slater.rst @@ -1,6 +1,7 @@ .. index:: pair_style coul/slater .. index:: pair_style coul/slater/cut .. index:: pair_style coul/slater/long +.. index:: pair_style coul/slater/long/gpu pair_style coul/slater command ============================== @@ -11,6 +12,8 @@ pair_style coul/slater/cut command pair_style coul/slater/long command =================================== +Accelerator Variants: *coul/slater/gpu* + Syntax """""" diff --git a/doc/src/pair_fep_soft.rst b/doc/src/pair_fep_soft.rst index 400ad0cc4a4..20e17ce0b42 100644 --- a/doc/src/pair_fep_soft.rst +++ b/doc/src/pair_fep_soft.rst @@ -1,8 +1,10 @@ .. index:: pair_style lj/cut/soft .. index:: pair_style lj/cut/soft/omp .. index:: pair_style lj/cut/coul/cut/soft +.. index:: pair_style lj/cut/coul/cut/soft/gpu .. index:: pair_style lj/cut/coul/cut/soft/omp .. index:: pair_style lj/cut/coul/long/soft +.. index:: pair_style lj/cut/coul/long/soft/gpu .. index:: pair_style lj/cut/coul/long/soft/omp .. index:: pair_style lj/cut/tip4p/long/soft .. index:: pair_style lj/cut/tip4p/long/soft/omp @@ -27,12 +29,12 @@ Accelerator Variants: *lj/cut/soft/omp* pair_style lj/cut/coul/cut/soft command ======================================= -Accelerator Variants: *lj/cut/coul/cut/soft/omp* +Accelerator Variants: *lj/cut/coul/cut/soft/gpu*, *lj/cut/coul/cut/soft/omp* pair_style lj/cut/coul/long/soft command ======================================== -Accelerator Variants: *lj/cut/coul/long/soft/omp* +Accelerator Variants: *lj/cut/coul/long/soft/gpu*, *lj/cut/coul/long/soft/omp* pair_style lj/cut/tip4p/long/soft command ========================================= diff --git a/doc/src/pair_mesodpd.rst b/doc/src/pair_mesodpd.rst index 5d244f3b1d1..28a398754ff 100644 --- a/doc/src/pair_mesodpd.rst +++ b/doc/src/pair_mesodpd.rst @@ -1,14 +1,20 @@ .. index:: pair_style edpd +.. index:: pair_style edpd/gpu .. index:: pair_style mdpd +.. index:: pair_style mdpd/gpu .. index:: pair_style mdpd/rhosum .. index:: pair_style tdpd pair_style edpd command ======================= +Accelerator Variants: *edpd/gpu* + pair_style mdpd command ======================= +Accelerator Variants: *mdpd/gpu* + pair_style mdpd/rhosum command ============================== diff --git a/examples/PACKAGES/dpd-meso/mdpd/in.mdpd b/examples/PACKAGES/dpd-meso/mdpd/in.mdpd index b0740c82276..2c740f41275 100644 --- a/examples/PACKAGES/dpd-meso/mdpd/in.mdpd +++ b/examples/PACKAGES/dpd-meso/mdpd/in.mdpd @@ -16,6 +16,7 @@ neighbor 0.3 bin neigh_modify every 1 delay 0 check yes atom_style mdpd +comm_modify vel yes region mdpd block -25 25 -10 10 -10 10 units box create_box 1 mdpd From 0940793537b45aa16cd090c9a617769dba1b118d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 7 Dec 2023 09:23:49 -0600 Subject: [PATCH 05/30] Updated the coul_soft and coul_long_soft kernels with forces as acctyp3 --- lib/gpu/lal_lj_coul_long_soft.cu | 12 +++++++----- lib/gpu/lal_lj_coul_soft.cu | 9 +++++---- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/lib/gpu/lal_lj_coul_long_soft.cu b/lib/gpu/lal_lj_coul_long_soft.cu index 242b52fb791..7e55f34a45c 100644 --- a/lib/gpu/lal_lj_coul_long_soft.cu +++ b/lib/gpu/lal_lj_coul_long_soft.cu @@ -14,7 +14,7 @@ // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) -#include + #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE _texture( pos_tex,float4); @@ -36,7 +36,7 @@ __kernel void k_lj_coul_long_soft(const __global numtyp4 *restrict x_, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, + __global acctyp3 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, @@ -59,7 +59,7 @@ __kernel void k_lj_coul_long_soft(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp4 f; + acctyp3 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp energy, e_coul, virial[6]; if (EVFLAG) { @@ -79,6 +79,7 @@ __kernel void k_lj_coul_long_soft(const __global numtyp4 *restrict x_, int itype=ix.w; for ( ; nbor Date: Thu, 7 Dec 2023 10:16:40 -0600 Subject: [PATCH 06/30] Removed unused member functions in edpd --- lib/gpu/lal_base_dpd.h | 4 +--- lib/gpu/lal_edpd.cpp | 10 ---------- lib/gpu/lal_edpd.h | 8 ++------ lib/gpu/lal_mdpd.h | 4 ++-- 4 files changed, 5 insertions(+), 21 deletions(-) diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h index 6e9451d72a0..64ec725d95a 100644 --- a/lib/gpu/lal_base_dpd.h +++ b/lib/gpu/lal_base_dpd.h @@ -196,11 +196,9 @@ class BaseDPD { numtyp _dtinvsqrt; int _seed, _timestep; - int _extra_fields; - protected: bool _compiled; - int _block_size, _threads_per_atom, _onetype; + int _block_size, _threads_per_atom, _onetype, _extra_fields; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; diff --git a/lib/gpu/lal_edpd.cpp b/lib/gpu/lal_edpd.cpp index 2a32d01abb7..c85644a60bc 100644 --- a/lib/gpu/lal_edpd.cpp +++ b/lib/gpu/lal_edpd.cpp @@ -271,16 +271,6 @@ int EDPDT::loop(const int eflag, const int vflag) { return GX; } -template -void EDPDT::update_coeff(int ntypes, double **host_a0, double **host_gamma, - double **host_sigma, double **host_cut) -{ - UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), - UCL_WRITE_ONLY); - this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_a0,host_gamma, - host_sigma,host_cut); -} - // --------------------------------------------------------------------------- // Get the extra data pointers from host // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_edpd.h b/lib/gpu/lal_edpd.h index b7935bf2f4e..c293b86ee66 100644 --- a/lib/gpu/lal_edpd.h +++ b/lib/gpu/lal_edpd.h @@ -13,8 +13,8 @@ email : ndactrung@gmail.com ***************************************************************************/ -#ifndef LAL_DPD_H -#define LAL_DPD_H +#ifndef LAL_EDPD_H +#define LAL_EDPD_H #include "lal_base_dpd.h" @@ -56,10 +56,6 @@ class EDPD : public BaseDPD { /// Total host memory used by library for pair style double host_memory_usage() const; - /// Update coeff if needed (tstat only) - void update_coeff(int ntypes, double **host_a0, double **host_gamma, - double **host_sigma, double **host_cut); - void get_extra_data(double *host_T, double *host_cv); /// copy Q (flux) from device to host diff --git a/lib/gpu/lal_mdpd.h b/lib/gpu/lal_mdpd.h index ca702dba73c..0e95185714d 100644 --- a/lib/gpu/lal_mdpd.h +++ b/lib/gpu/lal_mdpd.h @@ -13,8 +13,8 @@ email : ndactrung@gmail.com ***************************************************************************/ -#ifndef LAL_DPD_H -#define LAL_DPD_H +#ifndef LAL_MDPD_H +#define LAL_MDPD_H #include "lal_base_dpd.h" From 26c7358a849eaadf956252c2e6a63a6e31f8440b Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 7 Dec 2023 10:24:25 -0600 Subject: [PATCH 07/30] Working on sph_lj kernels --- lib/gpu/lal_sph_lj.cpp | 212 ++++++++++++++++++ lib/gpu/lal_sph_lj.cu | 432 +++++++++++++++++++++++++++++++++++++ lib/gpu/lal_sph_lj.h | 88 ++++++++ lib/gpu/lal_sph_lj_ext.cpp | 132 ++++++++++++ 4 files changed, 864 insertions(+) create mode 100644 lib/gpu/lal_sph_lj.cpp create mode 100644 lib/gpu/lal_sph_lj.cu create mode 100644 lib/gpu/lal_sph_lj.h create mode 100644 lib/gpu/lal_sph_lj_ext.cpp diff --git a/lib/gpu/lal_sph_lj.cpp b/lib/gpu/lal_sph_lj.cpp new file mode 100644 index 00000000000..8367a81b1e2 --- /dev/null +++ b/lib/gpu/lal_sph_lj.cpp @@ -0,0 +1,212 @@ +/*************************************************************************** + sph_lj.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the sph_lj pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "sph_lj_cl.h" +#elif defined(USE_CUDART) +const char *sph_lj=0; +#else +#include "sph_lj_cubin.h" +#endif + +#include "lal_sph_lj.h" +#include +namespace LAMMPS_AL { +#define SPHLJT SPHLJ + +extern Device device; + +template +SPHLJT::SPHLJ() : BaseDPD(), _allocated(false) { + _max_drhoE_size = 0; +} + +template +SPHLJT::~SPHLJ() { + clear(); +} + +template +int SPHLJT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int SPHLJT::init(const int ntypes, + double **host_cutsq, double **host_viscosity, + double *host_special_lj, + const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + int extra_fields = 4; // round up to accomodate quadruples of numtyp values + // rho, cv, mass + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,sph_lj,"k_sph_lj",onetype,extra_fields); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,lj_types,coeff,host_write,host_viscosity, + host_cutsq); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + // allocate per-atom array Q + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_drhoE_size=static_cast(static_cast(ef_nall)*1.10); + drhoE.alloc(_max_drhoE_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+drhoE.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void SPHLJT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + drhoE.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double SPHLJT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(SPHLJ); +} + +template +void SPHLJT::update_drhoE(void **drhoE_ptr) { + *drhoE_ptr=drhoE.host.begin(); + drhoE.update_host(_max_drhoE_size,false); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int SPHLJT::loop(const int eflag, const int vflag) { + + int nall = this->atom->nall(); + + // Resize drhoE array if necessary + if (nall > _max_drhoE_size) { + _max_drhoE_size=static_cast(static_cast(nall)*1.10); + drhoE.resize(_max_drhoE_size); + } + + // signal that we need to transfer extra data from the host + + this->atom->extra_data_unavail(); + + numtyp4 *pextra=reinterpret_cast(&(this->atom->extra[0])); + + int n = 0; + int nstride = 1; + for (int i = 0; i < nall; i++) { + int idx = n+i*nstride; + numtyp4 v; + v.x = rho[i]; + v.y = cv[i]; + v.z = mass[i]; + v.w = 0; + pextra[idx] = v; + } + this->atom->add_extra_data(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &this->_threads_per_atom); + } + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Get the extra data pointers from host +// --------------------------------------------------------------------------- + +template +void SPHLJT::get_extra_data(double *host_rho, double *host_cv, double* host_mass) { + rho = host_rho; + cv = host_cv; + mass = host_mass; +} + +template class SPHLJ; +} diff --git a/lib/gpu/lal_sph_lj.cu b/lib/gpu/lal_sph_lj.cu new file mode 100644 index 00000000000..c6fe0713991 --- /dev/null +++ b/lib/gpu/lal_sph_lj.cu @@ -0,0 +1,432 @@ +// ************************************************************************** +// edpd.cu +// ------------------- +// Trung Dac Nguyen (U Chicago) +// +// Device code for acceleration of the edpd pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : September 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( vel_tex,float4); +#else +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); +#endif +#else +#define pos_tex x_ +#define vel_tex v_ +#endif + +#if (SHUFFLE_AVAIL == 0) + +#define store_drhoE(drhoI, deltaE, ii, inum, tid, t_per_atom, offset, \ + drhoE) \ + if (t_per_atom>1) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, rhoEi, deltaE); \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add2(t_per_atom,drhoI,deltaE); \ + } \ + if (offset==0 && ii tag2) { + tag1 = jtag; tag2 = itag; + } + + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + numtyp T_ij=(numtyp)0.5*(Ti+Tj); + numtyp4 T_pow; + T_pow.x = T_ij - (numtyp)1.0; + T_pow.y = T_pow.x*T_pow.x; + T_pow.z = T_pow.x*T_pow.y; + T_pow.w = T_pow.x*T_pow.z; + + numtyp coeff2x = coeff2[mtype].x; //power[itype][jtype] + numtyp coeff2y = coeff2[mtype].y; //kappa[itype][jtype] + numtyp coeff2z = coeff2[mtype].z; //powerT[itype][jtype] + numtyp coeff2w = coeff2[mtype].w; //cutT[itype][jtype] + numtyp power_d = coeff2x; + if (power_flag) { + numtyp factor = (numtyp)1.0; + factor += sc[mtype].x*T_pow.x + sc[mtype].y*T_pow.y + + sc[mtype].z*T_pow.z + sc[mtype].w*T_pow.w; + power_d *= factor; + } + + power_d = MAX((numtyp)0.01,power_d); + numtyp wc = (numtyp)1.0 - r/coeffz; // cut[itype][jtype] + wc = MAX((numtyp)0.0,MIN((numtyp)1.0,wc)); + numtyp wr = ucl_pow(wc, (numtyp)0.5*power_d); + + numtyp kboltz = (numtyp)1.0; + numtyp GammaIJ = coeffy; // gamma[itype][jtype] + numtyp SigmaIJ = (numtyp)4.0*GammaIJ*kboltz*Ti*Tj/(Ti+Tj); + SigmaIJ = ucl_sqrt(SigmaIJ); + + numtyp force = coeffx*T_ij*wc; // a0[itype][jtype] + force -= GammaIJ *wr*wr *dot*rinv; + force += SigmaIJ * wr *randnum * dtinvsqrt; + force *= factor_dpd*rinv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + // heat transfer + + if (r < coeff2w) { + numtyp wrT = (numtyp)1.0 - r/coeff2w; + wrT = MAX((numtyp)0.0,MIN((numtyp)1.0,wrT)); + wrT = ucl_pow(wrT, (numtyp)0.5*coeff2z); // powerT[itype][jtype] + numtyp randnumT = (numtyp)0; + saru(tag1, tag2, seed+tag1+tag2, timestep, randnumT); // randomT->gaussian(); + randnumT = MAX((numtyp)-5.0,MIN(randnum,(numtyp)5.0)); + + numtyp kappaT = coeff2y; // kappa[itype][jtype] + if (kappa_flag) { + numtyp factor = (numtyp)1.0; + factor += kc[mtype].x*T_pow.x + kc[mtype].y*T_pow.y + + kc[mtype].z*T_pow.z + kc[mtype].w*T_pow.w; + kappaT *= factor; + } + + numtyp kij = cvi*cvj*kappaT * T_ij*T_ij; + numtyp alphaij = ucl_sqrt((numtyp)2.0*kboltz*kij); + + numtyp dQc = kij * wrT*wrT * (Tj - Ti)/(Ti*Tj); + numtyp dQd = wr*wr*( GammaIJ * vijeij*vijeij - SigmaIJ*SigmaIJ/mass_itype ) - SigmaIJ * wr *vijeij *randnum; + dQd /= (cvi+cvj); + numtyp dQr = alphaij * wrT * dtinvsqrt * randnumT; + Qi += (dQc + dQd + dQr ); + } + + if (EVFLAG && eflag) { + numtyp e = (numtyp)0.5*coeffx*T_ij*coeffz * wc*wc; + energy+=factor_dpd*e; + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + } // for nbor + } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + store_drhoE(Qi,ii,inum,tid,t_per_atom,offset,Q); +} + +__kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict extra, + const __global numtyp2 *restrict coeff_in, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + __global acctyp *restrict drhoE, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp4 *restrict v_, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + #ifndef ONETYPE + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) { + sp_lj[tid]=sp_lj_in[tid]; + } + if (tid tag2) { + tag1 = jtag; tag2 = itag; + } + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + numtyp T_ij=(numtyp)0.5*(Ti+Tj); + numtyp4 T_pow; + T_pow.x = T_ij - (numtyp)1.0; + T_pow.y = T_pow.x*T_pow.x; + T_pow.z = T_pow.x*T_pow.y; + T_pow.w = T_pow.x*T_pow.z; + + numtyp power_d = coeff2x; // power[itype][jtype] + if (power_flag) { + numtyp factor = (numtyp)1.0; + factor += scx*T_pow.x + scy*T_pow.y + scz*T_pow.z + scw*T_pow.w; + power_d *= factor; + } + + power_d = MAX((numtyp)0.01,power_d); + numtyp wc = (numtyp)1.0 - r/coeffz; // cut[itype][jtype] + wc = MAX((numtyp)0.0,MIN((numtyp)1.0,wc)); + numtyp wr = ucl_pow((numtyp)wc, (numtyp)0.5*power_d); + + numtyp kboltz = (numtyp)1.0; + numtyp GammaIJ = coeffy; // gamma[itype][jtype] + numtyp SigmaIJ = (numtyp)4.0*GammaIJ*kboltz*Ti*Tj/(Ti+Tj); + SigmaIJ = ucl_sqrt(SigmaIJ); + + numtyp force = coeffx*T_ij*wc; // a0[itype][jtype] + force -= GammaIJ *wr*wr *dot*rinv; + force += SigmaIJ* wr *randnum * dtinvsqrt; + #ifndef ONETYPE + force *= factor_dpd*rinv; + #else + force *= rinv; + #endif + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + // heat transfer + + if (r < coeff2w) { + numtyp wrT = (numtyp)1.0 - r/coeff2w; + wrT = MAX((numtyp)0.0,MIN((numtyp)1.0,wrT)); + wrT = ucl_pow(wrT, (numtyp)0.5*coeff2z); // powerT[itype][jtype] + numtyp randnumT = (numtyp)0; + saru(tag1, tag2, seed+tag1+tag2, timestep, randnumT); // randomT->gaussian(); + randnumT = MAX((numtyp)-5.0,MIN(randnum,(numtyp)5.0)); + + numtyp kappaT = coeff2y; // kappa[itype][jtype] + if (kappa_flag) { + numtyp factor = (numtyp)1.0; + factor += kcx*T_pow.x + kcy*T_pow.y + kcz*T_pow.z + kcw*T_pow.w; + kappaT *= factor; + } + + numtyp kij = cvi*cvj*kappaT * T_ij*T_ij; + numtyp alphaij = ucl_sqrt((numtyp)2.0*kboltz*kij); + + numtyp dQc = kij * wrT*wrT * (Tj - Ti )/(Ti*Tj); + numtyp dQd = wr*wr*( GammaIJ * vijeij*vijeij - SigmaIJ*SigmaIJ/mass_itype ) - SigmaIJ * wr *vijeij *randnum; + dQd /= (cvi+cvj); + numtyp dQr = alphaij * wrT * dtinvsqrt * randnumT; + Qi += (dQc + dQd + dQr ); + } + + if (EVFLAG && eflag) { + numtyp e = (numtyp)0.5*coeffx*T_ij*coeffz * wc*wc; + #ifndef ONETYPE + energy+=factor_dpd*e; + #else + energy+=e; + #endif + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + + } + } // for nbor + } // if ii + + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); + store_drhoE(Qi,ii,inum,tid,t_per_atom,offset,Q); +} + diff --git a/lib/gpu/lal_sph_lj.h b/lib/gpu/lal_sph_lj.h new file mode 100644 index 00000000000..5e26d019b27 --- /dev/null +++ b/lib/gpu/lal_sph_lj.h @@ -0,0 +1,88 @@ +/*************************************************************************** + sph_lj.h + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the sph lj pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#ifndef LAL_SPH_LJ_H +#define LAL_SPH_LJ_H + +#include "lal_base_dpd.h" + +namespace LAMMPS_AL { + +template +class SPHLJ : public BaseDPD { + public: + SPHLJ(); + ~SPHLJ(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_viscosity, + double *host_special_lj, const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, + FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + void get_extra_data(double *host_rho, double *host_cv, double* host_mass); + + /// copy drho and desph from device to host + void update_drhoE(void **drhoE_ptr); + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = viscosity, coeff.y = cutsq + UCL_D_Vec coeff; + + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// Per-atom arrays + UCL_Vector drhoE; + int _max_drhoE_size; + + /// pointer to host data + double *rho, *cv, *mass; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_sph_lj_ext.cpp b/lib/gpu/lal_sph_lj_ext.cpp new file mode 100644 index 00000000000..7abaa10805f --- /dev/null +++ b/lib/gpu/lal_sph_lj_ext.cpp @@ -0,0 +1,132 @@ +/*************************************************************************** + sph_lj_ext.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Functions for LAMMPS access to sph lj acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_sph_lj.h" + +using namespace std; +using namespace LAMMPS_AL; + +static SPHLJ SPHLJMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int sph_lj_gpu_init(const int ntypes, double **cutsq, double **host_viscosity, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + SPHLJMF.clear(); + gpu_mode=SPHLJMF.device->gpu_mode(); + double gpu_split=SPHLJMF.device->particle_split(); + int first_gpu=SPHLJMF.device->first_device(); + int last_gpu=SPHLJMF.device->last_device(); + int world_me=SPHLJMF.device->world_me(); + int gpu_rank=SPHLJMF.device->gpu_rank(); + int procs_per_gpu=SPHLJMF.device->procs_per_gpu(); + + SPHLJMF.device->init_message(screen,"sph_lj",first_gpu,last_gpu); + + bool message=false; + if (SPHLJMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=SPHLJMF.init(ntypes, cutsq, host_viscosity, special_lj, + inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen); + + SPHLJMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + SPHLJMF.estimate_gpu_overhead(); + return init_ok; +} + +void sph_lj_gpu_clear() { + SPHLJMF.clear(); +} + +int ** sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + double *boxlo, double *prd) { + return SPHLJMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_v, dtinvsqrt, seed, timestep, boxlo, prd); +} + +void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + const int nlocal, double *boxlo, double *prd) { + SPHLJMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); +} + +void sph_lj_gpu_get_extra_data(double *host_rho, double *host_cv, double *host_mass) { + SPHLJMF.get_extra_data(host_rho, host_cv, host_mass); +} + +void sph_lj_gpu_update_drhoE(void **drhoE_ptr) { + SPHLJMF.update_drhoE(drhoE_ptr); +} + +double sph_lj_gpu_bytes() { + return SPHLJMF.host_memory_usage(); +} From 379d3c8e2073af5b4394eaf127fb9d6b18c5ba36 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 7 Dec 2023 11:06:17 -0600 Subject: [PATCH 08/30] Updated host_esph to extra data and cut to coeff --- lib/gpu/lal_sph_lj.cpp | 16 ++-- lib/gpu/lal_sph_lj.cu | 158 ++++++++++++++++--------------------- lib/gpu/lal_sph_lj.h | 10 ++- lib/gpu/lal_sph_lj_ext.cpp | 20 ++--- 4 files changed, 92 insertions(+), 112 deletions(-) diff --git a/lib/gpu/lal_sph_lj.cpp b/lib/gpu/lal_sph_lj.cpp index 8367a81b1e2..3abef71cfb8 100644 --- a/lib/gpu/lal_sph_lj.cpp +++ b/lib/gpu/lal_sph_lj.cpp @@ -45,8 +45,8 @@ int SPHLJT::bytes_per_atom(const int max_nbors) const { template int SPHLJT::init(const int ntypes, - double **host_cutsq, double **host_viscosity, - double *host_special_lj, + double **host_cutsq, double **host_cut, + double **host_viscosity, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -92,8 +92,8 @@ int SPHLJT::init(const int ntypes, host_write[i]=0.0; coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); - this->atom->type_pack2(ntypes,lj_types,coeff,host_write,host_viscosity, - host_cutsq); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_viscosity, + host_cut, host_cutsq); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -163,9 +163,9 @@ int SPHLJT::loop(const int eflag, const int vflag) { int idx = n+i*nstride; numtyp4 v; v.x = rho[i]; + v.x = esph[i]; v.y = cv[i]; - v.z = mass[i]; - v.w = 0; + v.w = mass[i]; pextra[idx] = v; } this->atom->add_extra_data(); @@ -202,8 +202,10 @@ int SPHLJT::loop(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template -void SPHLJT::get_extra_data(double *host_rho, double *host_cv, double* host_mass) { +void SPHLJT::get_extra_data(double *host_rho, double *host_esph, + double *host_cv, double* host_mass) { rho = host_rho; + esph = host_esph; cv = host_cv; mass = host_mass; } diff --git a/lib/gpu/lal_sph_lj.cu b/lib/gpu/lal_sph_lj.cu index c6fe0713991..4d21c093a8f 100644 --- a/lib/gpu/lal_sph_lj.cu +++ b/lib/gpu/lal_sph_lj.cu @@ -77,7 +77,8 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, energy=(acctyp)0; for (int i=0; i<6; i++) virial[i]=(acctyp)0; } - acctyp Qi = (acctyp)0; + acctyp2 drhoEacc; + drhoEacc.x = drhoEacc.x = (acctyp)0; if (iidimension == 3 Lucy Kernel, 3d + wfd = -25.066903536973515383e0 * wfd * wfd * ihsq * ihsq * ihsq * ih; + + // Lucy Kernel, 2d + //wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; + + // function call to LJ EOS + numtyp fj, cj; + //LJEOS2(rho[j], esph[j], cv[j], &fj, &cj); + //fj /= (rho[j] * rho[j]); + + // apply long-range correction to model a LJ fluid with cutoff + // this implies that the modelled LJ fluid has cutoff == SPH cutoff + numtyp lrc = (numtyp)-11.1701 * (ihcub * ihcub * ihcub - (numtyp)1.5 * ihcub); + fi += lrc; + fj += lrc; + + // dot product of velocity delta and distance vector numtyp delvx = iv.x - jv.x; numtyp delvy = iv.y - jv.y; numtyp delvz = iv.z - jv.z; - numtyp dot = delx*delvx + dely*delvy + delz*delvz; - numtyp vijeij = dot*rinv; - - const numtyp coeffx=coeff[mtype].x; // a0[itype][jtype] - const numtyp coeffy=coeff[mtype].y; // gamma[itype][jtype] - const numtyp coeffz=coeff[mtype].z; // cut[itype][jtype] + numtyp delVdotDelR = delx*delvx + dely*delvy + delz*delvz; - const numtyp4 Tcvj = extra[j]; - numtyp Tj = Tcvj.x; - numtyp cvj = Tcvj.y; - - unsigned int tag1=itag, tag2=jtag; - if (tag1 > tag2) { - tag1 = jtag; tag2 = itag; - } - - numtyp randnum = (numtyp)0.0; - saru(tag1, tag2, seed, timestep, randnum); - - numtyp T_ij=(numtyp)0.5*(Ti+Tj); - numtyp4 T_pow; - T_pow.x = T_ij - (numtyp)1.0; - T_pow.y = T_pow.x*T_pow.x; - T_pow.z = T_pow.x*T_pow.y; - T_pow.w = T_pow.x*T_pow.z; - - numtyp coeff2x = coeff2[mtype].x; //power[itype][jtype] - numtyp coeff2y = coeff2[mtype].y; //kappa[itype][jtype] - numtyp coeff2z = coeff2[mtype].z; //powerT[itype][jtype] - numtyp coeff2w = coeff2[mtype].w; //cutT[itype][jtype] - numtyp power_d = coeff2x; - if (power_flag) { - numtyp factor = (numtyp)1.0; - factor += sc[mtype].x*T_pow.x + sc[mtype].y*T_pow.y + - sc[mtype].z*T_pow.z + sc[mtype].w*T_pow.w; - power_d *= factor; + // artificial viscosity (Monaghan 1992) + numtyp fvisc = (numtyp)0; + if (delVdotDelR < (numyp)0) { + mu = h * delVdotDelR / (rsq + (numyp)0.01 * h * h); + fvisc = -coeffx * (ci + cj) * mu / (rhoi + rhoj); // viscosity[itype][jtype] } - power_d = MAX((numtyp)0.01,power_d); - numtyp wc = (numtyp)1.0 - r/coeffz; // cut[itype][jtype] - wc = MAX((numtyp)0.0,MIN((numtyp)1.0,wc)); - numtyp wr = ucl_pow(wc, (numtyp)0.5*power_d); - - numtyp kboltz = (numtyp)1.0; - numtyp GammaIJ = coeffy; // gamma[itype][jtype] - numtyp SigmaIJ = (numtyp)4.0*GammaIJ*kboltz*Ti*Tj/(Ti+Tj); - SigmaIJ = ucl_sqrt(SigmaIJ); - - numtyp force = coeffx*T_ij*wc; // a0[itype][jtype] - force -= GammaIJ *wr*wr *dot*rinv; - force += SigmaIJ * wr *randnum * dtinvsqrt; - force *= factor_dpd*rinv; + // total pair force & thermal energy increment + numtyp force = -massi * massj * (fi + fj + fvisc) * wfd; + numtyp deltaE = (numtyp)-0.5 * force * delVdotDelR; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; - // heat transfer - - if (r < coeff2w) { - numtyp wrT = (numtyp)1.0 - r/coeff2w; - wrT = MAX((numtyp)0.0,MIN((numtyp)1.0,wrT)); - wrT = ucl_pow(wrT, (numtyp)0.5*coeff2z); // powerT[itype][jtype] - numtyp randnumT = (numtyp)0; - saru(tag1, tag2, seed+tag1+tag2, timestep, randnumT); // randomT->gaussian(); - randnumT = MAX((numtyp)-5.0,MIN(randnum,(numtyp)5.0)); - - numtyp kappaT = coeff2y; // kappa[itype][jtype] - if (kappa_flag) { - numtyp factor = (numtyp)1.0; - factor += kc[mtype].x*T_pow.x + kc[mtype].y*T_pow.y + - kc[mtype].z*T_pow.z + kc[mtype].w*T_pow.w; - kappaT *= factor; - } + // and change in density, drho[i] + drhoEacc.x += massj * delVdotDelR * wfd; - numtyp kij = cvi*cvj*kappaT * T_ij*T_ij; - numtyp alphaij = ucl_sqrt((numtyp)2.0*kboltz*kij); - - numtyp dQc = kij * wrT*wrT * (Tj - Ti)/(Ti*Tj); - numtyp dQd = wr*wr*( GammaIJ * vijeij*vijeij - SigmaIJ*SigmaIJ/mass_itype ) - SigmaIJ * wr *vijeij *randnum; - dQd /= (cvi+cvj); - numtyp dQr = alphaij * wrT * dtinvsqrt * randnumT; - Qi += (dQc + dQd + dQr ); - } + // change in thermal energy, desph[i] + drhoEacc.y += deltaE; if (EVFLAG && eflag) { - numtyp e = (numtyp)0.5*coeffx*T_ij*coeffz * wc*wc; - energy+=factor_dpd*e; + numtyp e = (numtyp)0; + energy+=e; } if (EVFLAG && vflag) { virial[0] += delx*delx*force; @@ -224,12 +198,12 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, } // if ii store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); - store_drhoE(Qi,ii,inum,tid,t_per_atom,offset,Q); + store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,drhoE); } __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict extra, - const __global numtyp2 *restrict coeff_in, + const __global numtyp4 *restrict coeff_in, const __global numtyp *restrict sp_lj_in, const __global int * dev_nbor, const __global int * dev_packed, diff --git a/lib/gpu/lal_sph_lj.h b/lib/gpu/lal_sph_lj.h index 5e26d019b27..04423f85fac 100644 --- a/lib/gpu/lal_sph_lj.h +++ b/lib/gpu/lal_sph_lj.h @@ -37,7 +37,8 @@ class SPHLJ : public BaseDPD { * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_viscosity, + int init(const int ntypes, double **host_cutsq, + double** host_cut, double **host_viscosity, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); @@ -52,14 +53,15 @@ class SPHLJ : public BaseDPD { /// Total host memory used by library for pair style double host_memory_usage() const; - void get_extra_data(double *host_rho, double *host_cv, double* host_mass); + void get_extra_data(double *host_rho, double *host_esph, + double *host_cv, double* host_mass); /// copy drho and desph from device to host void update_drhoE(void **drhoE_ptr); // --------------------------- TYPE DATA -------------------------- - /// coeff.x = viscosity, coeff.y = cutsq + /// coeff.x = viscosity, coeff.y = cut, coeff.z = cutsq UCL_D_Vec coeff; /// Special LJ values @@ -76,7 +78,7 @@ class SPHLJ : public BaseDPD { int _max_drhoE_size; /// pointer to host data - double *rho, *cv, *mass; + double *rho, *esph, *cv, *mass; private: bool _allocated; diff --git a/lib/gpu/lal_sph_lj_ext.cpp b/lib/gpu/lal_sph_lj_ext.cpp index 7abaa10805f..7db601c9eb0 100644 --- a/lib/gpu/lal_sph_lj_ext.cpp +++ b/lib/gpu/lal_sph_lj_ext.cpp @@ -27,8 +27,9 @@ static SPHLJ SPHLJMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int sph_lj_gpu_init(const int ntypes, double **cutsq, double **host_viscosity, - double *special_lj, const int inum, const int nall, +int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, + double **host_viscosity, double *special_lj, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { SPHLJMF.clear(); @@ -53,8 +54,8 @@ int sph_lj_gpu_init(const int ntypes, double **cutsq, double **host_viscosity, int init_ok=0; if (world_me==0) - init_ok=SPHLJMF.init(ntypes, cutsq, host_viscosity, special_lj, - inum, nall, max_nbors, maxspecial, + init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); SPHLJMF.device->world_barrier(); @@ -71,9 +72,9 @@ int sph_lj_gpu_init(const int ntypes, double **cutsq, double **host_viscosity, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=SPHLJMF.init(ntypes, cutsq, host_viscosity, special_lj, - inum, nall, max_nbors, maxspecial, - cell_size, gpu_split, screen); + init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, + special_lj, inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen); SPHLJMF.device->serialize_init(); if (message) @@ -119,8 +120,9 @@ void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); } -void sph_lj_gpu_get_extra_data(double *host_rho, double *host_cv, double *host_mass) { - SPHLJMF.get_extra_data(host_rho, host_cv, host_mass); +void sph_lj_gpu_get_extra_data(double *host_rho, double *host_esph, + double *host_cv, double *host_mass) { + SPHLJMF.get_extra_data(host_rho, host_esph, host_cv, host_mass); } void sph_lj_gpu_update_drhoE(void **drhoE_ptr) { From fef28c9daaac21caddffcefe06aed378e3cf2411 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 7 Dec 2023 11:41:37 -0600 Subject: [PATCH 09/30] Updated sph_lj kernels --- lib/gpu/lal_sph_lj.cu | 297 ++++++++++++++++++++++-------------------- 1 file changed, 157 insertions(+), 140 deletions(-) diff --git a/lib/gpu/lal_sph_lj.cu b/lib/gpu/lal_sph_lj.cu index 4d21c093a8f..b965df25a29 100644 --- a/lib/gpu/lal_sph_lj.cu +++ b/lib/gpu/lal_sph_lj.cu @@ -29,41 +29,86 @@ _texture_2d( vel_tex,int4); #if (SHUFFLE_AVAIL == 0) -#define store_drhoE(drhoI, deltaE, ii, inum, tid, t_per_atom, offset, \ - drhoE) \ +#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, drhoE) \ if (t_per_atom>1) { \ simdsync(); \ - simd_reduce_add2(t_per_atom, red_acc, offset, tid, rhoEi, deltaE); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, \ + drhoEacc.x, drhoEacc.y); \ } \ if (offset==0 && ii1) { \ - simd_reduce_add2(t_per_atom,drhoI,deltaE); \ - } \ - if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + drhoEacc.x += shfl_down(drhoEacc.x, s, t_per_atom); \ + drhoEacc.y += shfl_down(drhoEacc.y, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii (numtyp)0.0) { + pc[1] = ucl_sqrt(csq); // soundspeed + } else { + pc[1] = (numtyp)0.0; + } +} + + __kernel void k_sph_lj(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict extra, - const __global numtyp4 *restrict coeff, - const int lj_types, - const __global numtyp *restrict sp_lj, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp3 *restrict ans, - __global acctyp *restrict engv, - __global acctyp *restrict drhoE, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const __global numtyp4 *restrict v_, - const int t_per_atom) { + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + __global acctyp *restrict drhoE, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp4 *restrict v_, + const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -87,7 +132,6 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; - numtyp mass_itype = mass[itype]; numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i]; const numtyp4 extrai = extra[i]; @@ -97,11 +141,12 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, numtyp massi= extrai.w; // compute pressure of particle i with LJ EOS - numtyp fi, ci; - //LJEOS2(rho[i], esph[i], cv[i], &fi, &ci); - //fi /= (rho[i] * rho[i]); + numtyp fci[2]; + LJEOS2(rhoi, esphi, cvi, fci); + numtyp fi = fci[0]; + numtyp ci = fci[1]; + fi /= (rhoi * rhoi); - numtyp factor_lj; for ( ; nbordimension == 3 Lucy Kernel, 3d - wfd = -25.066903536973515383e0 * wfd * wfd * ihsq * ihsq * ihsq * ih; + wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; // Lucy Kernel, 2d //wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; // function call to LJ EOS - numtyp fj, cj; - //LJEOS2(rho[j], esph[j], cv[j], &fj, &cj); - //fj /= (rho[j] * rho[j]); + numtyp fcj[2]; + LJEOS2(rhoj, esphj, cvj, fcj); + numtyp fj = fcj[0]; + numtyp cj = fcj[1]; + fj /= (rhoj * rhoj); // apply long-range correction to model a LJ fluid with cutoff // this implies that the modelled LJ fluid has cutoff == SPH cutoff @@ -163,7 +210,7 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, // artificial viscosity (Monaghan 1992) numtyp fvisc = (numtyp)0; if (delVdotDelR < (numyp)0) { - mu = h * delVdotDelR / (rsq + (numyp)0.01 * h * h); + numtyp mu = h * delVdotDelR / (rsq + (numyp)0.01 * h * h); fvisc = -coeffx * (ci + cj) * mu / (rhoi + rhoj); // viscosity[itype][jtype] } @@ -202,18 +249,18 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, } __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict extra, - const __global numtyp4 *restrict coeff_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp3 *restrict ans, - __global acctyp *restrict engv, - __global acctyp *restrict drhoE, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const __global numtyp4 *restrict v_, - const int t_per_atom) { + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff_in, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + __global acctyp *restrict drhoE, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp4 *restrict v_, + const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -229,7 +276,8 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, __syncthreads(); #else const numtyp coeffx=coeff_in[ONETYPE].x; // viscosity[itype][jtype] - const numtyp coeffy=coeff_in[ONETYPE].y; // cutsq[itype][jtype] + const numtyp coeffy=coeff_in[ONETYPE].y; // cut[itype][jtype] + const numtyp cutsq_p=coeff_in[ONETYPE].z; // cutsq[itype][jtype] #endif int n_stride; @@ -242,7 +290,8 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, energy=(acctyp)0; for (int i=0; i<6; i++) virial[i]=(acctyp)0; } - acctyp Qi = (acctyp)0; + acctyp2 drhoEacc; + drhoEacc.x = drhoEacc.x = (acctyp)0; if (iidimension == 3 Lucy Kernel, 3d + wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; + + // Lucy Kernel, 2d + //wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; + + // function call to LJ EOS + numtyp fcj[2]; + LJEOS2(rhoj, esphj, cvj, fcj); + numtyp fj = fcj[0]; + numtyp cj = fcj[1]; + fj /= (rhoj * rhoj); - unsigned int tag1=itag, tag2=jtag; - if (tag1 > tag2) { - tag1 = jtag; tag2 = itag; - } - numtyp randnum = (numtyp)0.0; - saru(tag1, tag2, seed, timestep, randnum); - - numtyp T_ij=(numtyp)0.5*(Ti+Tj); - numtyp4 T_pow; - T_pow.x = T_ij - (numtyp)1.0; - T_pow.y = T_pow.x*T_pow.x; - T_pow.z = T_pow.x*T_pow.y; - T_pow.w = T_pow.x*T_pow.z; - - numtyp power_d = coeff2x; // power[itype][jtype] - if (power_flag) { - numtyp factor = (numtyp)1.0; - factor += scx*T_pow.x + scy*T_pow.y + scz*T_pow.z + scw*T_pow.w; - power_d *= factor; - } + // apply long-range correction to model a LJ fluid with cutoff + // this implies that the modelled LJ fluid has cutoff == SPH cutoff + numtyp lrc = (numtyp)-11.1701 * (ihcub * ihcub * ihcub - (numtyp)1.5 * ihcub); + fi += lrc; + fj += lrc; - power_d = MAX((numtyp)0.01,power_d); - numtyp wc = (numtyp)1.0 - r/coeffz; // cut[itype][jtype] - wc = MAX((numtyp)0.0,MIN((numtyp)1.0,wc)); - numtyp wr = ucl_pow((numtyp)wc, (numtyp)0.5*power_d); + // dot product of velocity delta and distance vector + numtyp delvx = iv.x - jv.x; + numtyp delvy = iv.y - jv.y; + numtyp delvz = iv.z - jv.z; + numtyp delVdotDelR = delx*delvx + dely*delvy + delz*delvz; - numtyp kboltz = (numtyp)1.0; - numtyp GammaIJ = coeffy; // gamma[itype][jtype] - numtyp SigmaIJ = (numtyp)4.0*GammaIJ*kboltz*Ti*Tj/(Ti+Tj); - SigmaIJ = ucl_sqrt(SigmaIJ); + // artificial viscosity (Monaghan 1992) + numtyp fvisc = (numtyp)0; + if (delVdotDelR < (numyp)0) { + numtyp mu = h * delVdotDelR / (rsq + (numyp)0.01 * h * h); + fvisc = -coeffx * (ci + cj) * mu / (rhoi + rhoj); // viscosity[itype][jtype] + } - numtyp force = coeffx*T_ij*wc; // a0[itype][jtype] - force -= GammaIJ *wr*wr *dot*rinv; - force += SigmaIJ* wr *randnum * dtinvsqrt; - #ifndef ONETYPE - force *= factor_dpd*rinv; - #else - force *= rinv; - #endif + // total pair force & thermal energy increment + numtyp force = -massi * massj * (fi + fj + fvisc) * wfd; + numtyp deltaE = (numtyp)-0.5 * force * delVdotDelR; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; - // heat transfer - - if (r < coeff2w) { - numtyp wrT = (numtyp)1.0 - r/coeff2w; - wrT = MAX((numtyp)0.0,MIN((numtyp)1.0,wrT)); - wrT = ucl_pow(wrT, (numtyp)0.5*coeff2z); // powerT[itype][jtype] - numtyp randnumT = (numtyp)0; - saru(tag1, tag2, seed+tag1+tag2, timestep, randnumT); // randomT->gaussian(); - randnumT = MAX((numtyp)-5.0,MIN(randnum,(numtyp)5.0)); - - numtyp kappaT = coeff2y; // kappa[itype][jtype] - if (kappa_flag) { - numtyp factor = (numtyp)1.0; - factor += kcx*T_pow.x + kcy*T_pow.y + kcz*T_pow.z + kcw*T_pow.w; - kappaT *= factor; - } - - numtyp kij = cvi*cvj*kappaT * T_ij*T_ij; - numtyp alphaij = ucl_sqrt((numtyp)2.0*kboltz*kij); - - numtyp dQc = kij * wrT*wrT * (Tj - Ti )/(Ti*Tj); - numtyp dQd = wr*wr*( GammaIJ * vijeij*vijeij - SigmaIJ*SigmaIJ/mass_itype ) - SigmaIJ * wr *vijeij *randnum; - dQd /= (cvi+cvj); - numtyp dQr = alphaij * wrT * dtinvsqrt * randnumT; - Qi += (dQc + dQd + dQr ); - } - if (EVFLAG && eflag) { - numtyp e = (numtyp)0.5*coeffx*T_ij*coeffz * wc*wc; - #ifndef ONETYPE - energy+=factor_dpd*e; - #else + numtyp e = (numtyp)0; energy+=e; - #endif } if (EVFLAG && vflag) { virial[0] += delx*delx*force; @@ -401,6 +418,6 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, } // if ii store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); - store_drhoE(Qi,ii,inum,tid,t_per_atom,offset,Q); + store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,drhoE); } From e51a8ccfbf96a54b2eca8eba94d5d767f74d9312 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 7 Dec 2023 12:00:24 -0600 Subject: [PATCH 10/30] Removed the unnecessary line since CUDA_BUILD_MULTIARCH is ON by default --- cmake/presets/gpu-cuda.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/presets/gpu-cuda.cmake b/cmake/presets/gpu-cuda.cmake index c5e967754a5..2ac6bd9ea65 100644 --- a/cmake/presets/gpu-cuda.cmake +++ b/cmake/presets/gpu-cuda.cmake @@ -3,7 +3,6 @@ set(PKG_GPU ON CACHE BOOL "Build GPU package" FORCE) set(GPU_API "cuda" CACHE STRING "APU used by GPU package" FORCE) set(GPU_PREC "mixed" CACHE STRING "" FORCE) -set(GPU_ARCH "sm_60;sm_70;sm_80" CACHE STRING "LAMMPS GPU CUDA SM architectures" FORCE) set(CUDA_NVCC_FLAGS "-allow-unsupported-compiler" CACHE STRING "" FORCE) set(CUDA_NVCC_FLAGS_DEBUG "-allow-unsupported-compiler" CACHE STRING "" FORCE) From 6fe16c7606c66226637303fcae139ba6b6a8b917 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 7 Dec 2023 14:33:47 -0600 Subject: [PATCH 11/30] Added pair style sph/lj/gpu in src/GPU, removed commented codes --- lib/gpu/lal_amoeba.cpp | 8 +- lib/gpu/lal_hippo.cpp | 8 +- lib/gpu/lal_sph_lj.cpp | 6 +- lib/gpu/lal_sph_lj_ext.cpp | 30 +++--- src/GPU/pair_sph_lj_gpu.cpp | 199 ++++++++++++++++++++++++++++++++++++ src/GPU/pair_sph_lj_gpu.h | 48 +++++++++ 6 files changed, 269 insertions(+), 30 deletions(-) create mode 100644 src/GPU/pair_sph_lj_gpu.cpp create mode 100644 src/GPU/pair_sph_lj_gpu.h diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 5e199979135..805c4c4b26b 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -281,13 +281,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { const int BX=this->block_size(); const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - /* - const int cus = this->device->gpu->cus(); - while (GX < cus && GX > 1) { - BX /= 2; - GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - } - */ + this->time_pair.start(); // Build the short neighbor list if not done yet diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 8d6ad5dfb29..3511d82b000 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -603,13 +603,7 @@ int HippoT::polar_real(const int eflag, const int vflag) { const int BX=this->block_size(); const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - /* - const int cus = this->device->gpu->cus(); - while (GX < cus && GX > 1) { - BX /= 2; - GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - } - */ + this->time_pair.start(); // Build the short neighbor list if not done yet diff --git a/lib/gpu/lal_sph_lj.cpp b/lib/gpu/lal_sph_lj.cpp index 3abef71cfb8..02b5cc0765e 100644 --- a/lib/gpu/lal_sph_lj.cpp +++ b/lib/gpu/lal_sph_lj.cpp @@ -107,7 +107,7 @@ int SPHLJT::init(const int ntypes, ef_nall=2000; _max_drhoE_size=static_cast(static_cast(ef_nall)*1.10); - drhoE.alloc(_max_drhoE_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + drhoE.alloc(_max_drhoE_size*2,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); _allocated=true; this->_max_bytes=coeff.row_bytes()+drhoE.row_bytes()+sp_lj.row_bytes(); @@ -134,7 +134,7 @@ double SPHLJT::host_memory_usage() const { template void SPHLJT::update_drhoE(void **drhoE_ptr) { *drhoE_ptr=drhoE.host.begin(); - drhoE.update_host(_max_drhoE_size,false); + drhoE.update_host(_max_drhoE_size*2,false); } // --------------------------------------------------------------------------- @@ -148,7 +148,7 @@ int SPHLJT::loop(const int eflag, const int vflag) { // Resize drhoE array if necessary if (nall > _max_drhoE_size) { _max_drhoE_size=static_cast(static_cast(nall)*1.10); - drhoE.resize(_max_drhoE_size); + drhoE.resize(_max_drhoE_size*2); } // signal that we need to transfer extra data from the host diff --git a/lib/gpu/lal_sph_lj_ext.cpp b/lib/gpu/lal_sph_lj_ext.cpp index 7db601c9eb0..f1483c1db84 100644 --- a/lib/gpu/lal_sph_lj_ext.cpp +++ b/lib/gpu/lal_sph_lj_ext.cpp @@ -94,13 +94,15 @@ void sph_lj_gpu_clear() { int ** sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, + double *subhi, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, const double dtinvsqrt, - const int seed, const int timestep, - double *boxlo, double *prd) { + double **host_v, double *boxlo, double *prd) { + double dtinvsqrt = 1.0; + int seed = 0; + int timestep = 0; + tagint* tag = nullptr; return SPHLJMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, @@ -108,16 +110,18 @@ int ** sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, } void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, tagint *tag, - double **host_v, const double dtinvsqrt, - const int seed, const int timestep, - const int nlocal, double *boxlo, double *prd) { + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, + double **host_v, const int nlocal, double *boxlo, double *prd) { + double dtinvsqrt = 1.0; + int seed = 0; + int timestep = 0; + tagint* tag = nullptr; SPHLJMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, - firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, - tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); } void sph_lj_gpu_get_extra_data(double *host_rho, double *host_esph, diff --git a/src/GPU/pair_sph_lj_gpu.cpp b/src/GPU/pair_sph_lj_gpu.cpp new file mode 100644 index 00000000000..b73153c1ac5 --- /dev/null +++ b/src/GPU/pair_sph_lj_gpu.cpp @@ -0,0 +1,199 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_sph_lj_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" +#include "update.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, + double **host_viscosity, double *special_lj, + const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); +void sph_lj_gpu_clear(); +int **sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double **host_v, + double *boxlo, double *prd); +void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **host_v, + const int nlocal, double *boxlo, double *prd); +void sph_lj_gpu_get_extra_data(double *host_rho, double *host_esph, + double *host_cv, double *host_mass); +void sph_lj_gpu_update_drhoE(void **drhoE_ptr); +double sph_lj_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairSPHLJGPU::PairSPHLJGPU(LAMMPS *lmp) : PairSPHLJ(lmp), gpu_mode(GPU_FORCE) +{ + drhoE_pinned = nullptr; + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairSPHLJGPU::~PairSPHLJGPU() +{ + sph_lj_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairSPHLJGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double *rho = atom->rho; + double *esph = atom->esph; + double *cv = atom->cv; + double *mass = atom->mass; + sph_lj_gpu_get_extra_data(rho, esph, cv, mass); + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = sph_lj_gpu_compute_n( + neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + sph_lj_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, + atom->v, atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + // get the drho and dE from device + + double *drho = atom->drho; + double *desph = atom->desph; + sph_lj_gpu_update_drhoE(&drhoE_pinned); + + int nlocal = atom->nlocal; + if (acc_float) { + auto drhoE_ptr = (float *)drhoE_pinned; + int idx = 0; + for (int i = 0; i < nlocal; i++) { + drho[i] = drhoE_ptr[idx]; + desph[i] = drhoE_ptr[idx+1]; + idx += 2; + } + + } else { + auto drhoE_ptr = (double *)drhoE_pinned; + int idx = 0; + for (int i = 0; i < nlocal; i++) { + drho[i] = drhoE_ptr[idx]; + desph[i] = drhoE_ptr[idx+1]; + idx += 2; + } + } + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairSPHLJGPU::init_style() +{ + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i, j); + mcut *= mcut; + if (mcut > maxcut) maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + sph_lj_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, + force->special_lj, atom->nlocal, atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairSPHLJGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + sph_lj_gpu_bytes(); +} diff --git a/src/GPU/pair_sph_lj_gpu.h b/src/GPU/pair_sph_lj_gpu.h new file mode 100644 index 00000000000..9aae3c2d6ab --- /dev/null +++ b/src/GPU/pair_sph_lj_gpu.h @@ -0,0 +1,48 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(sph/lj/gpu,PairSPHLJGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_SPH_LJ_GPU_H +#define LMP_PAIR_SPH_LJ_GPU_H + +#include "pair_sph_lj.h" + +namespace LAMMPS_NS { + +class PairSPHLJGPU : public PairSPHLJ { + public: + PairSPHLJGPU(LAMMPS *lmp); + ~PairSPHLJGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void *drhoE_pinned; + bool acc_float; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif From cad4c25750a539e61c27ada926515057bd531c29 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 7 Dec 2023 17:22:45 -0600 Subject: [PATCH 12/30] Adding sph/taitwater in lib/gpu --- lib/gpu/lal_edpd.cpp | 2 +- lib/gpu/lal_sph_lj.cpp | 4 +- lib/gpu/lal_sph_lj.cu | 4 +- lib/gpu/lal_sph_taitwater.cpp | 221 ++++++++++++++++++ lib/gpu/lal_sph_taitwater.cu | 374 ++++++++++++++++++++++++++++++ lib/gpu/lal_sph_taitwater.h | 93 ++++++++ lib/gpu/lal_sph_taitwater_ext.cpp | 140 +++++++++++ src/GPU/pair_sph_lj_gpu.cpp | 4 +- 8 files changed, 835 insertions(+), 7 deletions(-) create mode 100644 lib/gpu/lal_sph_taitwater.cpp create mode 100644 lib/gpu/lal_sph_taitwater.cu create mode 100644 lib/gpu/lal_sph_taitwater.h create mode 100644 lib/gpu/lal_sph_taitwater_ext.cpp diff --git a/lib/gpu/lal_edpd.cpp b/lib/gpu/lal_edpd.cpp index c85644a60bc..c03591b9ed5 100644 --- a/lib/gpu/lal_edpd.cpp +++ b/lib/gpu/lal_edpd.cpp @@ -104,7 +104,7 @@ int EDPDT::init(const int ntypes, this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_power,host_kappa, host_powerT,host_cutT); - UCL_H_Vec dview_mass(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); + UCL_H_Vec dview_mass(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < ntypes; i++) dview_mass[i] = host_mass[i]; mass.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); diff --git a/lib/gpu/lal_sph_lj.cpp b/lib/gpu/lal_sph_lj.cpp index 02b5cc0765e..e5184b9e918 100644 --- a/lib/gpu/lal_sph_lj.cpp +++ b/lib/gpu/lal_sph_lj.cpp @@ -163,8 +163,8 @@ int SPHLJT::loop(const int eflag, const int vflag) { int idx = n+i*nstride; numtyp4 v; v.x = rho[i]; - v.x = esph[i]; - v.y = cv[i]; + v.y = esph[i]; + v.z = cv[i]; v.w = mass[i]; pextra[idx] = v; } diff --git a/lib/gpu/lal_sph_lj.cu b/lib/gpu/lal_sph_lj.cu index b965df25a29..0281d9c16aa 100644 --- a/lib/gpu/lal_sph_lj.cu +++ b/lib/gpu/lal_sph_lj.cu @@ -1,9 +1,9 @@ // ************************************************************************** -// edpd.cu +// sph_lj.cu // ------------------- // Trung Dac Nguyen (U Chicago) // -// Device code for acceleration of the edpd pair style +// Device code for acceleration of the sph/lj pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) diff --git a/lib/gpu/lal_sph_taitwater.cpp b/lib/gpu/lal_sph_taitwater.cpp new file mode 100644 index 00000000000..1548502ffa0 --- /dev/null +++ b/lib/gpu/lal_sph_taitwater.cpp @@ -0,0 +1,221 @@ +/*************************************************************************** + sph_taitwater.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the sph_taitwater pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "sph_taitwater_cl.h" +#elif defined(USE_CUDART) +const char *sph_taitwater=0; +#else +#include "sph_taitwater_cubin.h" +#endif + +#include "lal_sph_taitwater.h" +#include +namespace LAMMPS_AL { +#define SPHTaitwaterT SPHTaitwater + +extern Device device; + +template +SPHTaitwaterT::SPHTaitwater() : BaseDPD(), _allocated(false) { + _max_drhoE_size = 0; +} + +template +SPHTaitwaterT::~SPHTaitwater() { + clear(); +} + +template +int SPHTaitwaterT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int SPHTaitwaterT::init(const int ntypes, + double **host_cutsq, double **host_cut, + double **host_viscosity, double* host_rho0, + double* host_soundspeed, double *host_special_lj, + const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + int extra_fields = 4; // round up to accomodate quadruples of numtyp values + // rho, mass + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,sph_taitwater,"k_sph_taitwater",onetype,extra_fields); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=taitwater_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_viscosity, + host_cut, host_cutsq); + + UCL_H_Vec dview_rho0ss(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < ntypes; i++) { + dview_rho0ss[i].x = host_rho0[i]; + dview_rho0ss[i].y = host_soundspeed[i]; + } + rho0sspeed.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(rho0sspeed,dview_rho0ss,false); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + // allocate per-atom array Q + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_drhoE_size=static_cast(static_cast(ef_nall)*1.10); + drhoE.alloc(_max_drhoE_size*2,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+corho0sspeedeff.row_bytes()+drhoE.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void SPHTaitwaterT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + dview_rho0ss.clear(); + drhoE.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double SPHTaitwaterT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(SPHTaitwater); +} + +template +void SPHTaitwaterT::update_drhoE(void **drhoE_ptr) { + *drhoE_ptr=drhoE.host.begin(); + drhoE.update_host(_max_drhoE_size*2,false); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int SPHTaitwaterT::loop(const int eflag, const int vflag) { + + int nall = this->atom->nall(); + + // Resize drhoE array if necessary + if (nall > _max_drhoE_size) { + _max_drhoE_size=static_cast(static_cast(nall)*1.10); + drhoE.resize(_max_drhoE_size*2); + } + + // signal that we need to transfer extra data from the host + + this->atom->extra_data_unavail(); + + numtyp4 *pextra=reinterpret_cast(&(this->atom->extra[0])); + + int n = 0; + int nstride = 1; + for (int i = 0; i < nall; i++) { + int idx = n+i*nstride; + numtyp4 v; + v.x = rho[i]; + v.y = mass[i]; + v.z = 0; + v.w = 0; + pextra[idx] = v; + } + this->atom->add_extra_data(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &rho0sspeed, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &rho0sspeed, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &this->_threads_per_atom); + } + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Get the extra data pointers from host +// --------------------------------------------------------------------------- + +template +void SPHTaitwaterT::get_extra_data(double *host_rho, double* host_mass) { + rho = host_rho; + mass = host_mass; +} + +template class SPHTaitwater; +} diff --git a/lib/gpu/lal_sph_taitwater.cu b/lib/gpu/lal_sph_taitwater.cu new file mode 100644 index 00000000000..d9e809bd451 --- /dev/null +++ b/lib/gpu/lal_sph_taitwater.cu @@ -0,0 +1,374 @@ +// ************************************************************************** +// sph_taitwater.cu +// ------------------- +// Trung Dac Nguyen (U Chicago) +// +// Device code for acceleration of the sph/taitwater pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : September 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( vel_tex,float4); +#else +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); +#endif +#else +#define pos_tex x_ +#define vel_tex v_ +#endif + +#if (SHUFFLE_AVAIL == 0) + +#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, drhoE) \ + if (t_per_atom>1) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, \ + drhoEacc.x, drhoEacc.y); \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + drhoEacc.x += shfl_down(drhoEacc.x, s, t_per_atom); \ + drhoEacc.y += shfl_down(drhoEacc.y, s, t_per_atom); \ + } \ + } \ + if (offset==0 && iidimension == 3 Lucy Kernel, 3d + wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; + + // Lucy Kernel, 2d + //wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; + + // compute pressure of atom j with Tait EOS + numtyp tmp = rho[j] / rho0[jtype]; + numtyp fj = tmp * tmp * tmp; + fj = B[jtype] * (fj * fj * tmp - 1.0); + fj /= (rhoj * rhoj); + + // dot product of velocity delta and distance vector + numtyp delvx = iv.x - jv.x; + numtyp delvy = iv.y - jv.y; + numtyp delvz = iv.z - jv.z; + numtyp delVdotDelR = delx*delvx + dely*delvy + delz*delvz; + + // artificial viscosity (Monaghan 1992) + numtyp fvisc = (numtyp)0; + if (delVdotDelR < (numyp)0) { + numtyp mu = h * delVdotDelR / (rsq + (numyp)0.01 * h * h); + //fvisc = -coeffx * (ci + cj) * mu / (rhoi + rhoj); // viscosity[itype][jtype] + + fvisc = -coeffx * (soundspeed[itype] + + soundspeed[jtype]) * mu / (rhoi + rhoj); + } + + // total pair force & thermal energy increment + numtyp force = -massi * massj * (fi + fj + fvisc) * wfd; + numtyp deltaE = (numtyp)-0.5 * force * delVdotDelR; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + // and change in density, drho[i] + drhoEacc.x += massj * delVdotDelR * wfd; + + // change in thermal energy, desph[i] + drhoEacc.y += deltaE; + + if (EVFLAG && eflag) { + numtyp e = (numtyp)0; + energy+=e; + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + } // for nbor + } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,drhoE); +} + +__kernel void k_sph_taitwater_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff_in, + const __global numtyp2 *restrict rho0sspeed_in, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + __global acctyp *restrict drhoE, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp4 *restrict v_, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + #ifndef ONETYPE + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) { + sp_lj[tid]=sp_lj_in[tid]; + } + if (tiddimension == 3 Lucy Kernel, 3d + wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; + + // Lucy Kernel, 2d + //wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; + + // function call to LJ EOS + numtyp fcj[2]; + LJEOS2(rhoj, esphj, cvj, fcj); + numtyp fj = fcj[0]; + numtyp cj = fcj[1]; + fj /= (rhoj * rhoj); + + // apply long-range correction to model a LJ fluid with cutoff + // this implies that the modelled LJ fluid has cutoff == SPH cutoff + numtyp lrc = (numtyp)-11.1701 * (ihcub * ihcub * ihcub - (numtyp)1.5 * ihcub); + fi += lrc; + fj += lrc; + + // dot product of velocity delta and distance vector + numtyp delvx = iv.x - jv.x; + numtyp delvy = iv.y - jv.y; + numtyp delvz = iv.z - jv.z; + numtyp delVdotDelR = delx*delvx + dely*delvy + delz*delvz; + + // artificial viscosity (Monaghan 1992) + numtyp fvisc = (numtyp)0; + if (delVdotDelR < (numyp)0) { + numtyp mu = h * delVdotDelR / (rsq + (numyp)0.01 * h * h); + fvisc = -coeffx * (ci + cj) * mu / (rhoi + rhoj); // viscosity[itype][jtype] + } + + // total pair force & thermal energy increment + numtyp force = -massi * massj * (fi + fj + fvisc) * wfd; + numtyp deltaE = (numtyp)-0.5 * force * delVdotDelR; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + numtyp e = (numtyp)0; + energy+=e; + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + + } + } // for nbor + } // if ii + + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); + store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,drhoE); +} + diff --git a/lib/gpu/lal_sph_taitwater.h b/lib/gpu/lal_sph_taitwater.h new file mode 100644 index 00000000000..cae5a0b09d4 --- /dev/null +++ b/lib/gpu/lal_sph_taitwater.h @@ -0,0 +1,93 @@ +/*************************************************************************** + sph_lj.h + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the sph lj pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#ifndef LAL_SPH_TAITWATER_H +#define LAL_SPH_TaitLAL_SPH_TAITWATER_Hwater_H + +#include "lal_base_dpd.h" + +namespace LAMMPS_AL { + +template +class SPHTaitwater : public BaseDPD { + public: + SPHTaitwater(); + ~SPHTaitwater(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double** host_cut, double **host_viscosity, + double* host_rho0, double* host_soundspeed, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + void get_extra_data(double *host_rho, double* host_mass); + + /// copy drho and desph from device to host + void update_drhoE(void **drhoE_ptr); + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = viscosity, coeff.y = cut, coeff.z = cutsq + UCL_D_Vec coeff; + + UCL_D_Vec rho0sspeed; + + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// Per-atom arrays + UCL_Vector drhoE; + int _max_drhoE_size; + + /// pointer to host data + double *rho, *mass; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_sph_taitwater_ext.cpp b/lib/gpu/lal_sph_taitwater_ext.cpp new file mode 100644 index 00000000000..ef4bd54ab4c --- /dev/null +++ b/lib/gpu/lal_sph_taitwater_ext.cpp @@ -0,0 +1,140 @@ +/*************************************************************************** + sph_taitwater_ext.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Functions for LAMMPS access to sph lj acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_sph_taitwater.h" + +using namespace std; +using namespace LAMMPS_AL; + +static SPHTaitwater SPHTaitwaterMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, + double **host_viscosity, double* host_rho0, + double* host_soundspeed, double *special_lj, + const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + SPHTaitwaterMF.clear(); + gpu_mode=SPHTaitwaterMF.device->gpu_mode(); + double gpu_split=SPHTaitwaterMF.device->particle_split(); + int first_gpu=SPHTaitwaterMF.device->first_device(); + int last_gpu=SPHTaitwaterMF.device->last_device(); + int world_me=SPHTaitwaterMF.device->world_me(); + int gpu_rank=SPHTaitwaterMF.device->gpu_rank(); + int procs_per_gpu=SPHTaitwaterMF.device->procs_per_gpu(); + + SPHTaitwaterMF.device->init_message(screen,"sph_taitwater",first_gpu,last_gpu); + + bool message=false; + if (SPHTaitwaterMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=SPHTaitwaterMF.init(ntypes, cutsq, host_cut, host_viscosity, + host_rho0, host_soundspeed, special_lj, + inum, nall, max_nbors, maxspecial, cell_size, + gpu_split, screen); + + SPHTaitwaterMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + SPHTaitwaterMF.estimate_gpu_overhead(); + return init_ok; +} + +void sph_taitwater_gpu_clear() { + SPHTaitwaterMF.clear(); +} + +int ** sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v, double *boxlo, double *prd) { + double dtinvsqrt = 1.0; + int seed = 0; + int timestep = 0; + tagint* tag = nullptr; + return SPHTaitwaterMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_v, dtinvsqrt, seed, timestep, boxlo, prd); +} + +void sph_taitwater_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, + double **host_v, const int nlocal, double *boxlo, double *prd) { + double dtinvsqrt = 1.0; + int seed = 0; + int timestep = 0; + tagint* tag = nullptr; + SPHTaitwaterMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); +} + +void sph_taitwater_gpu_get_extra_data(double *host_rho, double *host_mass) { + SPHTaitwaterMF.get_extra_data(host_rho, host_mass); +} + +void sph_taitwater_gpu_update_drhoE(void **drhoE_ptr) { + SPHTaitwaterMF.update_drhoE(drhoE_ptr); +} + +double sph_taitwater_gpu_bytes() { + return SPHTaitwaterMF.host_memory_usage(); +} diff --git a/src/GPU/pair_sph_lj_gpu.cpp b/src/GPU/pair_sph_lj_gpu.cpp index b73153c1ac5..0b6a7ee72d6 100644 --- a/src/GPU/pair_sph_lj_gpu.cpp +++ b/src/GPU/pair_sph_lj_gpu.cpp @@ -118,8 +118,8 @@ void PairSPHLJGPU::compute(int eflag, int vflag) numneigh = list->numneigh; firstneigh = list->firstneigh; sph_lj_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, - atom->v, atom->nlocal, domain->boxlo, domain->prd); + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, + atom->v, atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); From 7fad795141d1cb34b8fba0f41fd8119685f20044 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 8 Dec 2023 00:10:01 -0600 Subject: [PATCH 13/30] Added pair sph/taitwater/gpu in src/GPU --- lib/gpu/lal_sph_lj.cpp | 9 +- lib/gpu/lal_sph_lj.cu | 30 +++-- lib/gpu/lal_sph_lj.h | 4 +- lib/gpu/lal_sph_lj_ext.cpp | 8 +- lib/gpu/lal_sph_taitwater.cpp | 47 ++++--- lib/gpu/lal_sph_taitwater.cu | 155 +++++++++++----------- lib/gpu/lal_sph_taitwater.h | 13 +- lib/gpu/lal_sph_taitwater_ext.cpp | 15 ++- src/GPU/pair_sph_lj_gpu.cpp | 14 +- src/GPU/pair_sph_taitwater_gpu.cpp | 198 +++++++++++++++++++++++++++++ src/GPU/pair_sph_taitwater_gpu.h | 48 +++++++ 11 files changed, 405 insertions(+), 136 deletions(-) create mode 100644 src/GPU/pair_sph_taitwater_gpu.cpp create mode 100644 src/GPU/pair_sph_taitwater_gpu.h diff --git a/lib/gpu/lal_sph_lj.cpp b/lib/gpu/lal_sph_lj.cpp index e5184b9e918..85c144e2401 100644 --- a/lib/gpu/lal_sph_lj.cpp +++ b/lib/gpu/lal_sph_lj.cpp @@ -46,7 +46,8 @@ int SPHLJT::bytes_per_atom(const int max_nbors) const { template int SPHLJT::init(const int ntypes, double **host_cutsq, double **host_cut, - double **host_viscosity, double *host_special_lj, + double **host_viscosity, const int dimension, + double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -109,6 +110,8 @@ int SPHLJT::init(const int ntypes, _max_drhoE_size=static_cast(static_cast(ef_nall)*1.10); drhoE.alloc(_max_drhoE_size*2,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _dimension = dimension; + _allocated=true; this->_max_bytes=coeff.row_bytes()+drhoE.row_bytes()+sp_lj.row_bytes(); return 0; @@ -184,13 +187,13 @@ int SPHLJT::loop(const int eflag, const int vflag) { this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->v, &this->_threads_per_atom); + &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->v, &this->_threads_per_atom); + &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_sph_lj.cu b/lib/gpu/lal_sph_lj.cu index 0281d9c16aa..ab41477d2f0 100644 --- a/lib/gpu/lal_sph_lj.cu +++ b/lib/gpu/lal_sph_lj.cu @@ -108,7 +108,7 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp4 *restrict v_, - const int t_per_atom) { + const int dimension, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -182,12 +182,14 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, numtyp ihcub = ihsq * ih; numtyp wfd = h - ucl_sqrt(rsq); - // domain->dimension == 3 Lucy Kernel, 3d - wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; - - // Lucy Kernel, 2d - //wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; - + if (dimension == 3) { + // Lucy Kernel, 3d + wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; + } else { + // Lucy Kernel, 2d + wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; + } + // function call to LJ EOS numtyp fcj[2]; LJEOS2(rhoj, esphj, cvj, fcj); @@ -260,7 +262,7 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp4 *restrict v_, - const int t_per_atom) { + const int dimension, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -360,11 +362,13 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, numtyp ihcub = ihsq * ih; numtyp wfd = h - ucl_sqrt(rsq); - // domain->dimension == 3 Lucy Kernel, 3d - wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; - - // Lucy Kernel, 2d - //wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; + if (dimension == 3) { + // Lucy Kernel, 3d + wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; + } else { + // Lucy Kernel, 2d + wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; + } // function call to LJ EOS numtyp fcj[2]; diff --git a/lib/gpu/lal_sph_lj.h b/lib/gpu/lal_sph_lj.h index 04423f85fac..b2f78ddeca6 100644 --- a/lib/gpu/lal_sph_lj.h +++ b/lib/gpu/lal_sph_lj.h @@ -38,7 +38,7 @@ class SPHLJ : public BaseDPD { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double** host_cut, double **host_viscosity, + double** host_cut, double **host_viscosity, const int dimension, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); @@ -77,6 +77,8 @@ class SPHLJ : public BaseDPD { UCL_Vector drhoE; int _max_drhoE_size; + int _dimension; + /// pointer to host data double *rho, *esph, *cv, *mass; diff --git a/lib/gpu/lal_sph_lj_ext.cpp b/lib/gpu/lal_sph_lj_ext.cpp index f1483c1db84..b23a398db7c 100644 --- a/lib/gpu/lal_sph_lj_ext.cpp +++ b/lib/gpu/lal_sph_lj_ext.cpp @@ -28,8 +28,8 @@ static SPHLJ SPHLJMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, - double **host_viscosity, double *special_lj, - const int inum, const int nall, + double **host_viscosity, const int dimension, + double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { SPHLJMF.clear(); @@ -54,7 +54,7 @@ int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, int init_ok=0; if (world_me==0) - init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, + init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, dimension, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); @@ -72,7 +72,7 @@ int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, + init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, dimension, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); diff --git a/lib/gpu/lal_sph_taitwater.cpp b/lib/gpu/lal_sph_taitwater.cpp index 1548502ffa0..84a25aab5c1 100644 --- a/lib/gpu/lal_sph_taitwater.cpp +++ b/lib/gpu/lal_sph_taitwater.cpp @@ -44,14 +44,14 @@ int SPHTaitwaterT::bytes_per_atom(const int max_nbors) const { } template -int SPHTaitwaterT::init(const int ntypes, - double **host_cutsq, double **host_cut, - double **host_viscosity, double* host_rho0, - double* host_soundspeed, double *host_special_lj, - const int nlocal, const int nall, - const int max_nbors, const int maxspecial, - const double cell_size, - const double gpu_split, FILE *_screen) { +int SPHTaitwaterT::init(const int ntypes, double **host_cutsq, + double **host_cut, double **host_viscosity, + double* host_rho0, double* host_soundspeed, + double* host_B, const int dimension, + double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { const int max_shared_types=this->device->max_shared_types(); int onetype=0; @@ -72,7 +72,8 @@ int SPHTaitwaterT::init(const int ntypes, int extra_fields = 4; // round up to accomodate quadruples of numtyp values // rho, mass success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, - gpu_split,_screen,sph_taitwater,"k_sph_taitwater",onetype,extra_fields); + gpu_split,_screen,sph_taitwater,"k_sph_taitwater", + onetype,extra_fields); if (success!=0) return success; @@ -83,7 +84,7 @@ int SPHTaitwaterT::init(const int ntypes, lj_types=max_shared_types; shared_types=true; } - _lj_types=taitwater_types; + _lj_types=lj_types; // Allocate a host write buffer for data initialization UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), @@ -96,13 +97,15 @@ int SPHTaitwaterT::init(const int ntypes, this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_viscosity, host_cut, host_cutsq); - UCL_H_Vec dview_rho0ss(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); + UCL_H_Vec dview_coeff2(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < ntypes; i++) { - dview_rho0ss[i].x = host_rho0[i]; - dview_rho0ss[i].y = host_soundspeed[i]; + dview_coeff2[i].x = host_rho0[i]; + dview_coeff2[i].y = host_soundspeed[i]; + dview_coeff2[i].z = host_B[i]; + dview_coeff2[i].w = 0; } - rho0sspeed.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); - ucl_copy(rho0sspeed,dview_rho0ss,false); + coeff2.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff2,dview_coeff2,false); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -118,8 +121,10 @@ int SPHTaitwaterT::init(const int ntypes, _max_drhoE_size=static_cast(static_cast(ef_nall)*1.10); drhoE.alloc(_max_drhoE_size*2,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _dimension = dimension; + _allocated=true; - this->_max_bytes=coeff.row_bytes()+corho0sspeedeff.row_bytes()+drhoE.row_bytes()+sp_lj.row_bytes(); + this->_max_bytes=coeff.row_bytes()+coeff2.row_bytes()+drhoE.row_bytes()+sp_lj.row_bytes(); return 0; } @@ -130,7 +135,7 @@ void SPHTaitwaterT::clear() { _allocated=false; coeff.clear(); - dview_rho0ss.clear(); + coeff2.clear(); drhoE.clear(); sp_lj.clear(); this->clear_atomic(); @@ -191,16 +196,16 @@ int SPHTaitwaterT::loop(const int eflag, const int vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_sel->set_size(GX,BX); - this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &rho0sspeed, &sp_lj, + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->v, &this->_threads_per_atom); + &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &rho0sspeed, + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->v, &this->_threads_per_atom); + &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_sph_taitwater.cu b/lib/gpu/lal_sph_taitwater.cu index d9e809bd451..d07defc42b1 100644 --- a/lib/gpu/lal_sph_taitwater.cu +++ b/lib/gpu/lal_sph_taitwater.cu @@ -52,20 +52,20 @@ _texture_2d( vel_tex,int4); #endif __kernel void k_sph_taitwater(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict extra, - const __global numtyp4 *restrict coeff, - const __global numtyp2 *restrict rho0sspeed, - const int lj_types, - const __global numtyp *restrict sp_lj, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp3 *restrict ans, - __global acctyp *restrict engv, - __global acctyp *restrict drhoE, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const __global numtyp4 *restrict v_, - const int t_per_atom) { + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + __global acctyp *restrict drhoE, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp4 *restrict v_, + const int dimension, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -89,6 +89,9 @@ __kernel void k_sph_taitwater(const __global numtyp4 *restrict x_, numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; + numtyp rho0_itype = coeff2[itype].x; + numtyp soundspeed_itype = coeff2[itype].y; + numtyp B_itype = coeff2[itype].z; numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i]; const numtyp4 extrai = extra[i]; @@ -96,9 +99,9 @@ __kernel void k_sph_taitwater(const __global numtyp4 *restrict x_, numtyp massi= extrai.y; // compute pressure of atom i with Tait EOS - numtyp tmp = rho[i] / rho0[itype]; + numtyp tmp = rhoi / rho0_itype; numtyp fi = tmp * tmp * tmp; - fi = B[itype] * (fi * fi * tmp - 1.0); + fi = B_itype * (fi * fi * tmp - (numtyp)1.0); fi /= (rhoi * rhoi); for ( ; nbordimension == 3 Lucy Kernel, 3d - wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; + if (dimension == 3) { + // Lucy Kernel, 3d + wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; + } else { + // Lucy Kernel, 2d + wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; + } - // Lucy Kernel, 2d - //wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; - // compute pressure of atom j with Tait EOS - numtyp tmp = rho[j] / rho0[jtype]; + + numtyp tmp = rhoj / rho0_jtype; numtyp fj = tmp * tmp * tmp; - fj = B[jtype] * (fj * fj * tmp - 1.0); + fj = B_jtype * (fj * fj * tmp - (numtyp)1.0); fj /= (rhoj * rhoj); // dot product of velocity delta and distance vector @@ -158,10 +165,8 @@ __kernel void k_sph_taitwater(const __global numtyp4 *restrict x_, numtyp fvisc = (numtyp)0; if (delVdotDelR < (numyp)0) { numtyp mu = h * delVdotDelR / (rsq + (numyp)0.01 * h * h); - //fvisc = -coeffx * (ci + cj) * mu / (rhoi + rhoj); // viscosity[itype][jtype] - - fvisc = -coeffx * (soundspeed[itype] - + soundspeed[jtype]) * mu / (rhoi + rhoj); + fvisc = -coeffx * (soundspeed_itype + + soundspeed_jtype) * mu / (rhoi + rhoj); } // total pair force & thermal energy increment @@ -199,19 +204,20 @@ __kernel void k_sph_taitwater(const __global numtyp4 *restrict x_, } __kernel void k_sph_taitwater_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict extra, - const __global numtyp4 *restrict coeff_in, - const __global numtyp2 *restrict rho0sspeed_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp3 *restrict ans, - __global acctyp *restrict engv, - __global acctyp *restrict drhoE, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const __global numtyp4 *restrict v_, - const int t_per_atom) { + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp2 *restrict rho0sspeed_in, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + __global acctyp *restrict drhoE, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp4 *restrict v_, + const int dimension, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -251,7 +257,9 @@ __kernel void k_sph_taitwater_fast(const __global numtyp4 *restrict x_, numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; - numtyp mass_itype = mass[iw]; + numtyp rho0_itype = coeff2[iw].x; + numtyp soundspeed_itype = coeff2[iw].y; + numtyp B_itype = coeff2[iw].z; #ifndef ONETYPE int itype=fast_mul((int)MAX_SHARED_TYPES,iw); #endif @@ -260,15 +268,12 @@ __kernel void k_sph_taitwater_fast(const __global numtyp4 *restrict x_, const numtyp4 extrai = extra[i]; numtyp rhoi = extrai.x; - numtyp esphi = extrai.y; - numtyp cvi = extrai.z; - numtyp massi= extrai.w; - - // compute pressure of particle i with LJ EOS - numtyp fci[2]; - LJEOS2(rhoi, esphi, cvi, fci); - numtyp fi = fci[0]; - numtyp ci = fci[1]; + numtyp massi= extrai.y; + + // compute pressure of atom i with Tait EOS + numtyp tmp = rhoi / rho0_itype; + numtyp fi = tmp * tmp * tmp; + fi = B_itype * (fi * fi * tmp - (numtyp)1.0); fi /= (rhoi * rhoi); for ( ; nbordimension == 3 Lucy Kernel, 3d - wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; - - // Lucy Kernel, 2d - //wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; - - // function call to LJ EOS - numtyp fcj[2]; - LJEOS2(rhoj, esphj, cvj, fcj); - numtyp fj = fcj[0]; - numtyp cj = fcj[1]; - fj /= (rhoj * rhoj); + if (dimension == 3) { + // Lucy Kernel, 3d + wfd = (numtyp)-25.066903536973515383 * wfd * wfd * ihsq * ihsq * ihsq * ih; + } else { + // Lucy Kernel, 2d + wfd = -19.098593171027440292e0 * wfd * wfd * ihsq * ihsq * ihsq; + } - // apply long-range correction to model a LJ fluid with cutoff - // this implies that the modelled LJ fluid has cutoff == SPH cutoff - numtyp lrc = (numtyp)-11.1701 * (ihcub * ihcub * ihcub - (numtyp)1.5 * ihcub); - fi += lrc; - fj += lrc; + // compute pressure of atom j with Tait EOS + numtyp tmp = rhoj / rho0_jtype; + numtyp fj = tmp * tmp * tmp; + fj = B_jtype * (fj * fj * tmp - (numtyp)1.0); + fj /= (rhoj * rhoj); // dot product of velocity delta and distance vector numtyp delvx = iv.x - jv.x; @@ -340,7 +343,8 @@ __kernel void k_sph_taitwater_fast(const __global numtyp4 *restrict x_, numtyp fvisc = (numtyp)0; if (delVdotDelR < (numyp)0) { numtyp mu = h * delVdotDelR / (rsq + (numyp)0.01 * h * h); - fvisc = -coeffx * (ci + cj) * mu / (rhoi + rhoj); // viscosity[itype][jtype] + fvisc = -coeffx * (soundspeed_itype + + soundspeed_jtype) * mu / (rhoi + rhoj); } // total pair force & thermal energy increment @@ -363,7 +367,6 @@ __kernel void k_sph_taitwater_fast(const __global numtyp4 *restrict x_, virial[4] += delx*delz*force; virial[5] += dely*delz*force; } - } } // for nbor } // if ii diff --git a/lib/gpu/lal_sph_taitwater.h b/lib/gpu/lal_sph_taitwater.h index cae5a0b09d4..3b209da3692 100644 --- a/lib/gpu/lal_sph_taitwater.h +++ b/lib/gpu/lal_sph_taitwater.h @@ -40,8 +40,8 @@ class SPHTaitwater : public BaseDPD { int init(const int ntypes, double **host_cutsq, double** host_cut, double **host_viscosity, double* host_rho0, double* host_soundspeed, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, + double* host_B, const int dimension, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); @@ -62,10 +62,11 @@ class SPHTaitwater : public BaseDPD { // --------------------------- TYPE DATA -------------------------- - /// coeff.x = viscosity, coeff.y = cut, coeff.z = cutsq - UCL_D_Vec coeff; + /// per-pair coeffs: coeff.x = viscosity, coeff.y = cut, coeff.z = cutsq + UCL_D_Vec coeff; - UCL_D_Vec rho0sspeed; + /// per-type coeffs + UCL_D_Vec coeff2; /// Special LJ values UCL_D_Vec sp_lj; @@ -80,6 +81,8 @@ class SPHTaitwater : public BaseDPD { UCL_Vector drhoE; int _max_drhoE_size; + int _dimension; + /// pointer to host data double *rho, *mass; diff --git a/lib/gpu/lal_sph_taitwater_ext.cpp b/lib/gpu/lal_sph_taitwater_ext.cpp index ef4bd54ab4c..fdf84dfe779 100644 --- a/lib/gpu/lal_sph_taitwater_ext.cpp +++ b/lib/gpu/lal_sph_taitwater_ext.cpp @@ -29,7 +29,8 @@ static SPHTaitwater SPHTaitwaterMF; // --------------------------------------------------------------------------- int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, double **host_viscosity, double* host_rho0, - double* host_soundspeed, double *special_lj, + double* host_soundspeed, double* host_B, + const int dimension, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -56,9 +57,9 @@ int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, int init_ok=0; if (world_me==0) init_ok=SPHTaitwaterMF.init(ntypes, cutsq, host_cut, host_viscosity, - host_rho0, host_soundspeed, special_lj, - inum, nall, max_nbors, maxspecial, cell_size, - gpu_split, screen); + host_rho0, host_soundspeed, host_B, dimension, + special_lj, inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen); SPHTaitwaterMF.device->world_barrier(); if (message) @@ -75,9 +76,9 @@ int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, } if (gpu_rank==i && world_me!=0) init_ok=SPHTaitwaterMF.init(ntypes, cutsq, host_cut, host_viscosity, - host_rho0, host_soundspeed, special_lj, - inum, nall, max_nbors, maxspecial, cell_size, - gpu_split, screen); + host_rho0, host_soundspeed, host_B, dimension, + special_lj, inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen); SPHTaitwaterMF.device->serialize_init(); if (message) diff --git a/src/GPU/pair_sph_lj_gpu.cpp b/src/GPU/pair_sph_lj_gpu.cpp index 0b6a7ee72d6..8ba90a1abf4 100644 --- a/src/GPU/pair_sph_lj_gpu.cpp +++ b/src/GPU/pair_sph_lj_gpu.cpp @@ -35,7 +35,7 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, - double **host_viscosity, double *special_lj, + double **host_viscosity, const int dimension, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); @@ -109,16 +109,18 @@ void PairSPHLJGPU::compute(int eflag, int vflag) } inum = atom->nlocal; firstneigh = sph_lj_gpu_compute_n( - neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + neighbor->ago, inum, nall, atom->x, atom->type, + sublo, subhi, atom->nspecial, atom->special, eflag, vflag, + eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, success, atom->v, domain->boxlo, domain->prd); } else { inum = list->inum; ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; - sph_lj_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, + sph_lj_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, + eflag_atom, vflag_atom, host_start, cpu_time, success, atom->v, atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); @@ -180,7 +182,7 @@ void PairSPHLJGPU::init_style() if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; int mnf = 5e-2 * neighbor->oneatom; int success = - sph_lj_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, + sph_lj_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, domain->dimension, force->special_lj, atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success, error, world); diff --git a/src/GPU/pair_sph_taitwater_gpu.cpp b/src/GPU/pair_sph_taitwater_gpu.cpp new file mode 100644 index 00000000000..1322197b89d --- /dev/null +++ b/src/GPU/pair_sph_taitwater_gpu.cpp @@ -0,0 +1,198 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_sph_taitwater_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" +#include "update.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, + double **host_viscosity, double* host_rho0, + double* host_soundspeed, double* host_B, const int dimension, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); +void sph_taitwater_gpu_clear(); +int **sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double **host_v, + double *boxlo, double *prd); +void sph_taitwater_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **host_v, + const int nlocal, double *boxlo, double *prd); +void sph_taitwater_gpu_get_extra_data(double *host_rho, double *host_mass); +void sph_taitwater_gpu_update_drhoE(void **drhoE_ptr); +double sph_taitwater_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairSPHTaitwaterGPU::PairSPHTaitwaterGPU(LAMMPS *lmp) : PairSPHTaitwater(lmp), gpu_mode(GPU_FORCE) +{ + drhoE_pinned = nullptr; + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairSPHTaitwaterGPU::~PairSPHTaitwaterGPU() +{ + sph_taitwater_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairSPHTaitwaterGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double *rho = atom->rho; + double *mass = atom->mass; + sph_taitwater_gpu_get_extra_data(rho, mass); + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = sph_taitwater_gpu_compute_n( + neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + sph_taitwater_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, + atom->v, atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + // get the drho and dE from device + + double *drho = atom->drho; + double *desph = atom->desph; + sph_taitwater_gpu_update_drhoE(&drhoE_pinned); + + int nlocal = atom->nlocal; + if (acc_float) { + auto drhoE_ptr = (float *)drhoE_pinned; + int idx = 0; + for (int i = 0; i < nlocal; i++) { + drho[i] = drhoE_ptr[idx]; + desph[i] = drhoE_ptr[idx+1]; + idx += 2; + } + + } else { + auto drhoE_ptr = (double *)drhoE_pinned; + int idx = 0; + for (int i = 0; i < nlocal; i++) { + drho[i] = drhoE_ptr[idx]; + desph[i] = drhoE_ptr[idx+1]; + idx += 2; + } + } + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairSPHTaitwaterGPU::init_style() +{ + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i, j); + mcut *= mcut; + if (mcut > maxcut) maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + sph_taitwater_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, + rho0, soundspeed, B, domain->dimension, force->special_lj, + atom->nlocal, atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairSPHTaitwaterGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + sph_taitwater_gpu_bytes(); +} diff --git a/src/GPU/pair_sph_taitwater_gpu.h b/src/GPU/pair_sph_taitwater_gpu.h new file mode 100644 index 00000000000..df8119a3c00 --- /dev/null +++ b/src/GPU/pair_sph_taitwater_gpu.h @@ -0,0 +1,48 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(sph/taitwater/gpu,PairSPHTaitwaterGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_SPH_TAITWATER_GPU_H +#define LMP_PAIR_SPH_TAITWATER_GPU_H + +#include "pair_sph_taitwater.h" + +namespace LAMMPS_NS { + +class PairSPHTaitwaterGPU : public PairSPHTaitwater { + public: + PairSPHTaitwaterGPU(LAMMPS *lmp); + ~PairSPHTaitwaterGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void *drhoE_pinned; + bool acc_float; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif From 0f78afc66f4f37411f29620744789409bd87e9b8 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 8 Dec 2023 00:20:53 -0600 Subject: [PATCH 14/30] Added atom tag for dpd to work with tag and v --- lib/gpu/lal_sph_lj_ext.cpp | 8 +++----- lib/gpu/lal_sph_taitwater_ext.cpp | 6 ++---- src/GPU/pair_sph_lj_gpu.cpp | 8 ++++---- src/GPU/pair_sph_taitwater_gpu.cpp | 8 ++++---- 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/lib/gpu/lal_sph_lj_ext.cpp b/lib/gpu/lal_sph_lj_ext.cpp index b23a398db7c..0b8ea0e2633 100644 --- a/lib/gpu/lal_sph_lj_ext.cpp +++ b/lib/gpu/lal_sph_lj_ext.cpp @@ -94,15 +94,14 @@ void sph_lj_gpu_clear() { int ** sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, int **nspecial, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_v, double *boxlo, double *prd) { double dtinvsqrt = 1.0; int seed = 0; - int timestep = 0; - tagint* tag = nullptr; + int timestep = 0; return SPHLJMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, @@ -113,12 +112,11 @@ void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, + const double cpu_time, bool &success, tagint *tag, double **host_v, const int nlocal, double *boxlo, double *prd) { double dtinvsqrt = 1.0; int seed = 0; int timestep = 0; - tagint* tag = nullptr; SPHLJMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); diff --git a/lib/gpu/lal_sph_taitwater_ext.cpp b/lib/gpu/lal_sph_taitwater_ext.cpp index fdf84dfe779..e5b1d439b82 100644 --- a/lib/gpu/lal_sph_taitwater_ext.cpp +++ b/lib/gpu/lal_sph_taitwater_ext.cpp @@ -98,7 +98,7 @@ void sph_taitwater_gpu_clear() { int ** sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, int **nspecial, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, @@ -106,7 +106,6 @@ int ** sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int double dtinvsqrt = 1.0; int seed = 0; int timestep = 0; - tagint* tag = nullptr; return SPHTaitwaterMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, @@ -117,12 +116,11 @@ void sph_taitwater_gpu_compute(const int ago, const int inum_full, const int nal double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, + const double cpu_time, bool &success, tagint *tag, double **host_v, const int nlocal, double *boxlo, double *prd) { double dtinvsqrt = 1.0; int seed = 0; int timestep = 0; - tagint* tag = nullptr; SPHTaitwaterMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); diff --git a/src/GPU/pair_sph_lj_gpu.cpp b/src/GPU/pair_sph_lj_gpu.cpp index 8ba90a1abf4..33366f09a62 100644 --- a/src/GPU/pair_sph_lj_gpu.cpp +++ b/src/GPU/pair_sph_lj_gpu.cpp @@ -41,7 +41,7 @@ int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, const double cell_size, int &gpu_mode, FILE *screen); void sph_lj_gpu_clear(); int **sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, - int *host_type, double *sublo, double *subhi, int **nspecial, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_v, @@ -49,7 +49,7 @@ int **sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, d void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double **host_v, + const double cpu_time, bool &success, tagint *tag, double **host_v, const int nlocal, double *boxlo, double *prd); void sph_lj_gpu_get_extra_data(double *host_rho, double *host_esph, double *host_cv, double *host_mass); @@ -110,7 +110,7 @@ void PairSPHLJGPU::compute(int eflag, int vflag) inum = atom->nlocal; firstneigh = sph_lj_gpu_compute_n( neighbor->ago, inum, nall, atom->x, atom->type, - sublo, subhi, atom->nspecial, atom->special, eflag, vflag, + sublo, subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, success, atom->v, domain->boxlo, domain->prd); } else { @@ -121,7 +121,7 @@ void PairSPHLJGPU::compute(int eflag, int vflag) sph_lj_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, - atom->v, atom->nlocal, domain->boxlo, domain->prd); + atom->tag, atom->v, atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); diff --git a/src/GPU/pair_sph_taitwater_gpu.cpp b/src/GPU/pair_sph_taitwater_gpu.cpp index 1322197b89d..b98a89c3ff3 100644 --- a/src/GPU/pair_sph_taitwater_gpu.cpp +++ b/src/GPU/pair_sph_taitwater_gpu.cpp @@ -42,7 +42,7 @@ int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, const double cell_size, int &gpu_mode, FILE *screen); void sph_taitwater_gpu_clear(); int **sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, - int *host_type, double *sublo, double *subhi, int **nspecial, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_v, @@ -50,7 +50,7 @@ int **sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int void sph_taitwater_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double **host_v, + const double cpu_time, bool &success, tagint *tag, double **host_v, const int nlocal, double *boxlo, double *prd); void sph_taitwater_gpu_get_extra_data(double *host_rho, double *host_mass); void sph_taitwater_gpu_update_drhoE(void **drhoE_ptr); @@ -107,7 +107,7 @@ void PairSPHTaitwaterGPU::compute(int eflag, int vflag) } inum = atom->nlocal; firstneigh = sph_taitwater_gpu_compute_n( - neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->nspecial, + neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, success, atom->v, domain->boxlo, domain->prd); } else { @@ -117,7 +117,7 @@ void PairSPHTaitwaterGPU::compute(int eflag, int vflag) firstneigh = list->firstneigh; sph_taitwater_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, - atom->v, atom->nlocal, domain->boxlo, domain->prd); + atom->tag, atom->v, atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); From 46a670889dd0f472672f37499c52edd04dcd064c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 8 Dec 2023 14:09:39 -0600 Subject: [PATCH 15/30] Added BaseSPH for sph pair styles, fixed bugs with per-type arrays --- lib/gpu/lal_base_sph.cpp | 362 +++++++++++++++++++++++++++++ lib/gpu/lal_base_sph.h | 209 +++++++++++++++++ lib/gpu/lal_sph_lj.cpp | 25 +- lib/gpu/lal_sph_lj.cu | 27 ++- lib/gpu/lal_sph_lj.h | 16 +- lib/gpu/lal_sph_lj_ext.cpp | 45 ++-- lib/gpu/lal_sph_taitwater.cpp | 25 +- lib/gpu/lal_sph_taitwater.cu | 74 +++--- lib/gpu/lal_sph_taitwater.h | 20 +- lib/gpu/lal_sph_taitwater_ext.cpp | 34 ++- src/GPU/pair_sph_lj_gpu.cpp | 41 ++-- src/GPU/pair_sph_taitwater_gpu.cpp | 37 +-- 12 files changed, 750 insertions(+), 165 deletions(-) create mode 100644 lib/gpu/lal_base_sph.cpp create mode 100644 lib/gpu/lal_base_sph.h diff --git a/lib/gpu/lal_base_sph.cpp b/lib/gpu/lal_base_sph.cpp new file mode 100644 index 00000000000..c6876a7dcd0 --- /dev/null +++ b/lib/gpu/lal_base_sph.cpp @@ -0,0 +1,362 @@ +/*************************************************************************** + base_sph.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Base class for SPH pair styles needing per-particle data for position, + velocity, and type. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include "lal_base_sph.h" +namespace LAMMPS_AL { +#define BaseSPHT BaseSPH + +extern Device global_device; + +template +BaseSPHT::BaseSPH() : _compiled(false), _max_bytes(0) { + device=&global_device; + ans=new Answer(); + nbor=new Neighbor(); + pair_program=nullptr; + ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif +} + +template +BaseSPHT::~BaseSPH() { + delete ans; + delete nbor; + k_pair_fast.clear(); + k_pair.clear(); + if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif +} + +template +int BaseSPHT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template +int BaseSPHT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name, const int onetype, + const int extra_fields) { + screen=_screen; + + int gpu_nbor=0; + if (device->gpu_mode()==Device::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_atom(); + + bool charge = false; + bool rot = false; + bool vel = true; + _extra_fields = extra_fields; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4); + if (success!=0) + return success; + + if (ucl_device!=device->gpu) _compiled=false; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + compile_kernels(*ucl_device,pair_program,k_name,onetype); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->x,4); + vel_tex.bind_float(atom->v,4); + + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + + return success; +} + +template +void BaseSPHT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +} + +template +void BaseSPHT::clear_atomic() { + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); + + time_pair.clear(); + hd_balancer.clear(); + + nbor->clear(); + ans->clear(); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template +int * BaseSPHT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(inum,numj,ilist); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + return nullptr; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template +inline void BaseSPHT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + bool &success) { + success=true; + resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. +// --------------------------------------------------------------------------- +template +void BaseSPHT::compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const int nlocal) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + atom->cast_x_data(host_x,host_type); + atom->cast_v_data(host_v,tag); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + atom->add_v_data(host_v,tag); + + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, virials, energies +// --------------------------------------------------------------------------- +template +int** BaseSPHT::compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return nullptr; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, success); + if (!success) + return nullptr; + atom->cast_v_data(host_v,tag); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_v_data(host_v,tag); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + atom->add_v_data(host_v,tag); + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + return nbor->host_jlist.begin()-host_start; +} + +template +double BaseSPHT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseSPH); +} + +template +void BaseSPHT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname, const int onetype) { + if (_compiled && _onetype==onetype) + return; + + _onetype=onetype; + + std::string s_fast=std::string(kname)+"_fast"; + if (pair_program) delete pair_program; + pair_program=new UCL_Program(dev); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_fast.set_function(*pair_program,s_fast.c_str()); + k_pair.set_function(*pair_program,kname); + pos_tex.get_texture(*pair_program,"pos_tex"); + vel_tex.get_texture(*pair_program,"vel_tex"); + + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.has_subgroup_support()) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > (int)mx_subgroup_sz) _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + +} + +template class BaseSPH; +} diff --git a/lib/gpu/lal_base_sph.h b/lib/gpu/lal_base_sph.h new file mode 100644 index 00000000000..950462e9989 --- /dev/null +++ b/lib/gpu/lal_base_sph.h @@ -0,0 +1,209 @@ +/*************************************************************************** + base_sph.h + ------------------- + Trung Dac Nguyen (U Chicago) + + Base class for SPH pair styles needing per-particle data for position, + velocity, and type. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#ifndef LAL_BASE_SPH_H +#define LAL_BASE_DPD_H + +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#ifdef USE_OPENCL +#include "geryon/ocl_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +namespace LAMMPS_AL { + +template +class BaseSPH { + public: + BaseSPH(); + virtual ~BaseSPH(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \param k_name name for the kernel for force calculation + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const void *pair_program, const char *k_name, + const int onetype=0, const int extra_fields=0); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) { + pos_tex.bind_float(atom->x,4); + vel_tex.bind_float(atom->v,4); + } + ans->resize(inum,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(screen); + time_pair.add_to_total(); + atom->acc_timers(); + ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_pair.zero(); + atom->zero_timers(); + ans->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, bool &success); + + /// Pair loop with host neighboring + void compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **v, const int nlocal); + + /// Pair loop with device neighboring + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double **v); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + Balance hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + Atom *atom; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + Answer *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + Neighbor *nbor; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; + inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture vel_tex; + + // ------------------------- COMMON VARS ---------------------------- + + protected: + bool _compiled; + int _block_size, _threads_per_atom, _onetype, _extra_fields; + double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; + + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *k, const int onetype); + virtual int loop(const int eflag, const int vflag) = 0; +}; + +} + +#endif diff --git a/lib/gpu/lal_sph_lj.cpp b/lib/gpu/lal_sph_lj.cpp index 85c144e2401..dbe44bf30a9 100644 --- a/lib/gpu/lal_sph_lj.cpp +++ b/lib/gpu/lal_sph_lj.cpp @@ -29,7 +29,7 @@ namespace LAMMPS_AL { extern Device device; template -SPHLJT::SPHLJ() : BaseDPD(), _allocated(false) { +SPHLJT::SPHLJ() : BaseSPH(), _allocated(false) { _max_drhoE_size = 0; } @@ -46,8 +46,8 @@ int SPHLJT::bytes_per_atom(const int max_nbors) const { template int SPHLJT::init(const int ntypes, double **host_cutsq, double **host_cut, - double **host_viscosity, const int dimension, - double *host_special_lj, + double **host_viscosity, double* host_mass, + const int dimension, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -70,7 +70,7 @@ int SPHLJT::init(const int ntypes, int success; int extra_fields = 4; // round up to accomodate quadruples of numtyp values - // rho, cv, mass + // rho, cv success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,sph_lj,"k_sph_lj",onetype,extra_fields); if (success!=0) @@ -96,6 +96,12 @@ int SPHLJT::init(const int ntypes, this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_viscosity, host_cut, host_cutsq); + UCL_H_Vec dview_mass(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < ntypes; i++) + dview_mass[i] = host_mass[i]; + mass.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(mass,dview_mass,false); + UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); @@ -124,6 +130,7 @@ void SPHLJT::clear() { _allocated=false; coeff.clear(); + mass.clear(); drhoE.clear(); sp_lj.clear(); this->clear_atomic(); @@ -168,7 +175,7 @@ int SPHLJT::loop(const int eflag, const int vflag) { v.x = rho[i]; v.y = esph[i]; v.z = cv[i]; - v.w = mass[i]; + v.w = 0; pextra[idx] = v; } this->atom->add_extra_data(); @@ -184,13 +191,13 @@ int SPHLJT::loop(const int eflag, const int vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_sel->set_size(GX,BX); - this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &sp_lj, + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &mass, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &mass, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); @@ -205,12 +212,10 @@ int SPHLJT::loop(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template -void SPHLJT::get_extra_data(double *host_rho, double *host_esph, - double *host_cv, double* host_mass) { +void SPHLJT::get_extra_data(double *host_rho, double *host_esph, double *host_cv) { rho = host_rho; esph = host_esph; cv = host_cv; - mass = host_mass; } template class SPHLJ; diff --git a/lib/gpu/lal_sph_lj.cu b/lib/gpu/lal_sph_lj.cu index ab41477d2f0..b105632330e 100644 --- a/lib/gpu/lal_sph_lj.cu +++ b/lib/gpu/lal_sph_lj.cu @@ -47,7 +47,7 @@ _texture_2d( vel_tex,int4); } \ } \ if (offset==0 && ii -class SPHLJ : public BaseDPD { +class SPHLJ : public BaseSPH { public: SPHLJ(); ~SPHLJ(); @@ -38,7 +38,8 @@ class SPHLJ : public BaseDPD { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double** host_cut, double **host_viscosity, const int dimension, + double** host_cut, double **host_viscosity, double *host_mass, + const int dimension, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); @@ -54,7 +55,7 @@ class SPHLJ : public BaseDPD { double host_memory_usage() const; void get_extra_data(double *host_rho, double *host_esph, - double *host_cv, double* host_mass); + double *host_cv); /// copy drho and desph from device to host void update_drhoE(void **drhoE_ptr); @@ -62,7 +63,10 @@ class SPHLJ : public BaseDPD { // --------------------------- TYPE DATA -------------------------- /// coeff.x = viscosity, coeff.y = cut, coeff.z = cutsq - UCL_D_Vec coeff; + UCL_D_Vec coeff; + + /// per-type coeffs + UCL_D_Vec mass; /// Special LJ values UCL_D_Vec sp_lj; @@ -80,7 +84,7 @@ class SPHLJ : public BaseDPD { int _dimension; /// pointer to host data - double *rho, *esph, *cv, *mass; + double *rho, *esph, *cv; private: bool _allocated; diff --git a/lib/gpu/lal_sph_lj_ext.cpp b/lib/gpu/lal_sph_lj_ext.cpp index 0b8ea0e2633..488f7b12007 100644 --- a/lib/gpu/lal_sph_lj_ext.cpp +++ b/lib/gpu/lal_sph_lj_ext.cpp @@ -28,7 +28,7 @@ static SPHLJ SPHLJMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, - double **host_viscosity, const int dimension, + double **host_viscosity, double* host_mass, const int dimension, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -54,8 +54,8 @@ int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, int init_ok=0; if (world_me==0) - init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, dimension, - special_lj, inum, nall, max_nbors, maxspecial, + init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, host_mass, + dimension, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); SPHLJMF.device->world_barrier(); @@ -72,8 +72,8 @@ int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, dimension, - special_lj, inum, nall, max_nbors, maxspecial, + init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, host_mass, + dimension, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); SPHLJMF.device->serialize_init(); @@ -93,38 +93,31 @@ void sph_lj_gpu_clear() { } int ** sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, double *boxlo, double *prd) { - double dtinvsqrt = 1.0; - int seed = 0; - int timestep = 0; + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *host_tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v) { return SPHLJMF.compute(ago, inum_full, nall, host_x, host_type, sublo, - subhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, ilist, jnum, cpu_time, success, - host_v, dtinvsqrt, seed, timestep, boxlo, prd); + subhi, host_tag, nspecial, special, eflag, vflag, + eatom, vatom, host_start, ilist, jnum, cpu_time, success, + host_v); } void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, tagint *tag, - double **host_v, const int nlocal, double *boxlo, double *prd) { - double dtinvsqrt = 1.0; - int seed = 0; - int timestep = 0; + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal) { SPHLJMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, - tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); + host_tag, host_v, nlocal); } -void sph_lj_gpu_get_extra_data(double *host_rho, double *host_esph, - double *host_cv, double *host_mass) { - SPHLJMF.get_extra_data(host_rho, host_esph, host_cv, host_mass); +void sph_lj_gpu_get_extra_data(double *host_rho, double *host_esph, double *host_cv) { + SPHLJMF.get_extra_data(host_rho, host_esph, host_cv); } void sph_lj_gpu_update_drhoE(void **drhoE_ptr) { diff --git a/lib/gpu/lal_sph_taitwater.cpp b/lib/gpu/lal_sph_taitwater.cpp index 84a25aab5c1..7a584d435ec 100644 --- a/lib/gpu/lal_sph_taitwater.cpp +++ b/lib/gpu/lal_sph_taitwater.cpp @@ -3,7 +3,7 @@ ------------------- Trung Dac Nguyen (U Chicago) - Class for acceleration of the sph_taitwater pair style. + Class for acceleration of the sph/taitwater pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) @@ -29,7 +29,7 @@ namespace LAMMPS_AL { extern Device device; template -SPHTaitwaterT::SPHTaitwater() : BaseDPD(), _allocated(false) { +SPHTaitwaterT::SPHTaitwater() : BaseSPH(), _allocated(false) { _max_drhoE_size = 0; } @@ -46,8 +46,8 @@ int SPHTaitwaterT::bytes_per_atom(const int max_nbors) const { template int SPHTaitwaterT::init(const int ntypes, double **host_cutsq, double **host_cut, double **host_viscosity, - double* host_rho0, double* host_soundspeed, - double* host_B, const int dimension, + double* host_mass, double* host_rho0, + double* host_soundspeed, double* host_B, const int dimension, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -70,7 +70,7 @@ int SPHTaitwaterT::init(const int ntypes, double **host_cutsq, int success; int extra_fields = 4; // round up to accomodate quadruples of numtyp values - // rho, mass + // rho success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,sph_taitwater,"k_sph_taitwater", onetype,extra_fields); @@ -99,10 +99,10 @@ int SPHTaitwaterT::init(const int ntypes, double **host_cutsq, UCL_H_Vec dview_coeff2(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < ntypes; i++) { - dview_coeff2[i].x = host_rho0[i]; - dview_coeff2[i].y = host_soundspeed[i]; - dview_coeff2[i].z = host_B[i]; - dview_coeff2[i].w = 0; + dview_coeff2[i].x = host_mass[i]; + dview_coeff2[i].y = host_rho0[i]; + dview_coeff2[i].z = host_soundspeed[i]; + dview_coeff2[i].w = host_B[i]; } coeff2.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); ucl_copy(coeff2,dview_coeff2,false); @@ -122,7 +122,7 @@ int SPHTaitwaterT::init(const int ntypes, double **host_cutsq, drhoE.alloc(_max_drhoE_size*2,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); _dimension = dimension; - + _allocated=true; this->_max_bytes=coeff.row_bytes()+coeff2.row_bytes()+drhoE.row_bytes()+sp_lj.row_bytes(); return 0; @@ -178,7 +178,7 @@ int SPHTaitwaterT::loop(const int eflag, const int vflag) { int idx = n+i*nstride; numtyp4 v; v.x = rho[i]; - v.y = mass[i]; + v.y = 0; v.z = 0; v.w = 0; pextra[idx] = v; @@ -217,9 +217,8 @@ int SPHTaitwaterT::loop(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template -void SPHTaitwaterT::get_extra_data(double *host_rho, double* host_mass) { +void SPHTaitwaterT::get_extra_data(double *host_rho) { rho = host_rho; - mass = host_mass; } template class SPHTaitwater; diff --git a/lib/gpu/lal_sph_taitwater.cu b/lib/gpu/lal_sph_taitwater.cu index d07defc42b1..cc32e4a3c92 100644 --- a/lib/gpu/lal_sph_taitwater.cu +++ b/lib/gpu/lal_sph_taitwater.cu @@ -47,7 +47,7 @@ _texture_2d( vel_tex,int4); } \ } \ if (offset==0 && ii -class SPHTaitwater : public BaseDPD { +class SPHTaitwater : public BaseSPH { public: SPHTaitwater(); ~SPHTaitwater(); @@ -38,9 +38,9 @@ class SPHTaitwater : public BaseDPD { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double** host_cut, double **host_viscosity, - double* host_rho0, double* host_soundspeed, - double* host_B, const int dimension, double *host_special_lj, + double** host_cut, double **host_viscosity, double *host_mass, + double* host_rho0, double* host_soundspeed, double* host_B, + const int dimension, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); @@ -55,7 +55,7 @@ class SPHTaitwater : public BaseDPD { /// Total host memory used by library for pair style double host_memory_usage() const; - void get_extra_data(double *host_rho, double* host_mass); + void get_extra_data(double *host_rho); /// copy drho and desph from device to host void update_drhoE(void **drhoE_ptr); @@ -84,7 +84,7 @@ class SPHTaitwater : public BaseDPD { int _dimension; /// pointer to host data - double *rho, *mass; + double *rho; private: bool _allocated; diff --git a/lib/gpu/lal_sph_taitwater_ext.cpp b/lib/gpu/lal_sph_taitwater_ext.cpp index e5b1d439b82..9d125a63958 100644 --- a/lib/gpu/lal_sph_taitwater_ext.cpp +++ b/lib/gpu/lal_sph_taitwater_ext.cpp @@ -3,7 +3,7 @@ ------------------- Trung Dac Nguyen (U Chicago) - Functions for LAMMPS access to sph lj acceleration routines. + Functions for LAMMPS access to sph taitwater acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) @@ -28,8 +28,8 @@ static SPHTaitwater SPHTaitwaterMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, - double **host_viscosity, double* host_rho0, - double* host_soundspeed, double* host_B, + double **host_viscosity, double* host_mass, + double* host_rho0, double* host_soundspeed, double* host_B, const int dimension, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -56,7 +56,7 @@ int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, int init_ok=0; if (world_me==0) - init_ok=SPHTaitwaterMF.init(ntypes, cutsq, host_cut, host_viscosity, + init_ok=SPHTaitwaterMF.init(ntypes, cutsq, host_cut, host_viscosity, host_mass, host_rho0, host_soundspeed, host_B, dimension, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); @@ -75,7 +75,7 @@ int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=SPHTaitwaterMF.init(ntypes, cutsq, host_cut, host_viscosity, + init_ok=SPHTaitwaterMF.init(ntypes, cutsq, host_cut, host_viscosity, host_mass, host_rho0, host_soundspeed, host_B, dimension, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); @@ -98,36 +98,30 @@ void sph_taitwater_gpu_clear() { int ** sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, + double *subhi, tagint *host_tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, double *boxlo, double *prd) { - double dtinvsqrt = 1.0; - int seed = 0; - int timestep = 0; + double **host_v) { return SPHTaitwaterMF.compute(ago, inum_full, nall, host_x, host_type, sublo, - subhi, tag, nspecial, special, eflag, vflag, eatom, + subhi, host_tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, - host_v, dtinvsqrt, seed, timestep, boxlo, prd); + host_v); } void sph_taitwater_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, tagint *tag, - double **host_v, const int nlocal, double *boxlo, double *prd) { - double dtinvsqrt = 1.0; - int seed = 0; - int timestep = 0; + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal) { SPHTaitwaterMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, - tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); + host_tag, host_v, nlocal); } -void sph_taitwater_gpu_get_extra_data(double *host_rho, double *host_mass) { - SPHTaitwaterMF.get_extra_data(host_rho, host_mass); +void sph_taitwater_gpu_get_extra_data(double *host_rho) { + SPHTaitwaterMF.get_extra_data(host_rho); } void sph_taitwater_gpu_update_drhoE(void **drhoE_ptr) { diff --git a/src/GPU/pair_sph_lj_gpu.cpp b/src/GPU/pair_sph_lj_gpu.cpp index 33366f09a62..942a3c33bd4 100644 --- a/src/GPU/pair_sph_lj_gpu.cpp +++ b/src/GPU/pair_sph_lj_gpu.cpp @@ -35,24 +35,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, - double **host_viscosity, const int dimension, double *special_lj, + double **host_viscosity, double* host_mass, + const int dimension, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void sph_lj_gpu_clear(); -int **sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, - int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double **host_v, - double *boxlo, double *prd); -void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, - int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, tagint *tag, double **host_v, - const int nlocal, double *boxlo, double *prd); +int **sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *host_tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v); +void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal); void sph_lj_gpu_get_extra_data(double *host_rho, double *host_esph, - double *host_cv, double *host_mass); + double *host_cv); void sph_lj_gpu_update_drhoE(void **drhoE_ptr); double sph_lj_gpu_bytes(); @@ -92,8 +95,7 @@ void PairSPHLJGPU::compute(int eflag, int vflag) double *rho = atom->rho; double *esph = atom->esph; double *cv = atom->cv; - double *mass = atom->mass; - sph_lj_gpu_get_extra_data(rho, esph, cv, mass); + sph_lj_gpu_get_extra_data(rho, esph, cv); if (gpu_mode != GPU_FORCE) { double sublo[3], subhi[3]; @@ -112,7 +114,7 @@ void PairSPHLJGPU::compute(int eflag, int vflag) neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, - cpu_time, success, atom->v, domain->boxlo, domain->prd); + cpu_time, success, atom->v); } else { inum = list->inum; ilist = list->ilist; @@ -121,7 +123,7 @@ void PairSPHLJGPU::compute(int eflag, int vflag) sph_lj_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, - atom->tag, atom->v, atom->nlocal, domain->boxlo, domain->prd); + atom->tag, atom->v, atom->nlocal); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); @@ -182,8 +184,9 @@ void PairSPHLJGPU::init_style() if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; int mnf = 5e-2 * neighbor->oneatom; int success = - sph_lj_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, domain->dimension, - force->special_lj, atom->nlocal, atom->nlocal + atom->nghost, + sph_lj_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, atom->mass, + domain->dimension, force->special_lj, atom->nlocal, + atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success, error, world); diff --git a/src/GPU/pair_sph_taitwater_gpu.cpp b/src/GPU/pair_sph_taitwater_gpu.cpp index b98a89c3ff3..37a1b0feb5b 100644 --- a/src/GPU/pair_sph_taitwater_gpu.cpp +++ b/src/GPU/pair_sph_taitwater_gpu.cpp @@ -35,24 +35,26 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, - double **host_viscosity, double* host_rho0, + double **host_viscosity, double* host_mass, double* host_rho0, double* host_soundspeed, double* host_B, const int dimension, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void sph_taitwater_gpu_clear(); -int **sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, - int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double **host_v, - double *boxlo, double *prd); -void sph_taitwater_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, - int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, tagint *tag, double **host_v, - const int nlocal, double *boxlo, double *prd); -void sph_taitwater_gpu_get_extra_data(double *host_rho, double *host_mass); +int **sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v); +void sph_taitwater_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const int nlocal); +void sph_taitwater_gpu_get_extra_data(double *host_rho); void sph_taitwater_gpu_update_drhoE(void **drhoE_ptr); double sph_taitwater_gpu_bytes(); @@ -90,8 +92,7 @@ void PairSPHTaitwaterGPU::compute(int eflag, int vflag) int *ilist, *numneigh, **firstneigh; double *rho = atom->rho; - double *mass = atom->mass; - sph_taitwater_gpu_get_extra_data(rho, mass); + sph_taitwater_gpu_get_extra_data(rho); if (gpu_mode != GPU_FORCE) { double sublo[3], subhi[3]; @@ -109,7 +110,7 @@ void PairSPHTaitwaterGPU::compute(int eflag, int vflag) firstneigh = sph_taitwater_gpu_compute_n( neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, - cpu_time, success, atom->v, domain->boxlo, domain->prd); + cpu_time, success, atom->v); } else { inum = list->inum; ilist = list->ilist; @@ -117,7 +118,7 @@ void PairSPHTaitwaterGPU::compute(int eflag, int vflag) firstneigh = list->firstneigh; sph_taitwater_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, - atom->tag, atom->v, atom->nlocal, domain->boxlo, domain->prd); + atom->tag, atom->v, atom->nlocal); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); @@ -178,7 +179,7 @@ void PairSPHTaitwaterGPU::init_style() if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; int mnf = 5e-2 * neighbor->oneatom; int success = - sph_taitwater_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, + sph_taitwater_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, atom->mass, rho0, soundspeed, B, domain->dimension, force->special_lj, atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); From 54a6143e9e1fad717faf8ac96e3b1bf019770cfb Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 8 Dec 2023 14:27:00 -0600 Subject: [PATCH 16/30] Updated the doc pages for sph/lj/gpu and sph/taitwater/gpu --- doc/src/pair_coul_slater.rst | 2 +- doc/src/pair_sph_lj.rst | 3 +++ doc/src/pair_sph_taitwater.rst | 3 +++ lib/gpu/lal_base_sph.cpp | 2 +- lib/gpu/lal_base_sph.h | 2 +- lib/gpu/lal_coul_slater_long.cpp | 2 +- lib/gpu/lal_coul_slater_long.cu | 2 +- lib/gpu/lal_coul_slater_long_ext.cpp | 2 +- lib/gpu/lal_edpd.h | 2 +- lib/gpu/lal_lj_coul_long_soft.cpp | 2 +- lib/gpu/lal_lj_coul_long_soft.cu | 2 +- lib/gpu/lal_lj_coul_long_soft.h | 2 +- lib/gpu/lal_lj_coul_long_soft_ext.cpp | 4 ++-- lib/gpu/lal_sph_lj.cpp | 2 +- lib/gpu/lal_sph_lj.h | 2 +- lib/gpu/lal_sph_lj_ext.cpp | 2 +- 16 files changed, 21 insertions(+), 15 deletions(-) diff --git a/doc/src/pair_coul_slater.rst b/doc/src/pair_coul_slater.rst index 074f10ba834..bde14276db6 100644 --- a/doc/src/pair_coul_slater.rst +++ b/doc/src/pair_coul_slater.rst @@ -12,7 +12,7 @@ pair_style coul/slater/cut command pair_style coul/slater/long command =================================== -Accelerator Variants: *coul/slater/gpu* +Accelerator Variants: *coul/slater/long/gpu* Syntax """""" diff --git a/doc/src/pair_sph_lj.rst b/doc/src/pair_sph_lj.rst index b5c02c41ff4..5ac7ab9c6b2 100644 --- a/doc/src/pair_sph_lj.rst +++ b/doc/src/pair_sph_lj.rst @@ -1,8 +1,11 @@ .. index:: pair_style sph/lj +.. index:: pair_style sph/lj/gpu pair_style sph/lj command ========================= +Accelerator Variants: *sph/lj/gpu* + Syntax """""" diff --git a/doc/src/pair_sph_taitwater.rst b/doc/src/pair_sph_taitwater.rst index 34eb65f0051..79972660c42 100644 --- a/doc/src/pair_sph_taitwater.rst +++ b/doc/src/pair_sph_taitwater.rst @@ -1,8 +1,11 @@ .. index:: pair_style sph/taitwater +.. index:: pair_style sph/taitwater/gpu pair_style sph/taitwater command ================================ +Accelerator Variants: *sph/taitwater/gpu* + Syntax """""" diff --git a/lib/gpu/lal_base_sph.cpp b/lib/gpu/lal_base_sph.cpp index c6876a7dcd0..f373c0ebb6b 100644 --- a/lib/gpu/lal_base_sph.cpp +++ b/lib/gpu/lal_base_sph.cpp @@ -1,7 +1,7 @@ /*************************************************************************** base_sph.cpp ------------------- - Trung Dac Nguyen (ORNL) + Trung Nguyen (U Chicago) Base class for SPH pair styles needing per-particle data for position, velocity, and type. diff --git a/lib/gpu/lal_base_sph.h b/lib/gpu/lal_base_sph.h index 950462e9989..e1e57315732 100644 --- a/lib/gpu/lal_base_sph.h +++ b/lib/gpu/lal_base_sph.h @@ -1,7 +1,7 @@ /*************************************************************************** base_sph.h ------------------- - Trung Dac Nguyen (U Chicago) + Trung Nguyen (U Chicago) Base class for SPH pair styles needing per-particle data for position, velocity, and type. diff --git a/lib/gpu/lal_coul_slater_long.cpp b/lib/gpu/lal_coul_slater_long.cpp index 98b73ab7535..42eb86e8ffb 100644 --- a/lib/gpu/lal_coul_slater_long.cpp +++ b/lib/gpu/lal_coul_slater_long.cpp @@ -1,6 +1,6 @@ /*************************************************************************** coul_slater_long_ext.cpp - ------------------- + ------------------------ Trung Nguyen (U Chicago) Class for acceleration of the coul/slater/long pair style. diff --git a/lib/gpu/lal_coul_slater_long.cu b/lib/gpu/lal_coul_slater_long.cu index f8902c59d74..eccc5942112 100644 --- a/lib/gpu/lal_coul_slater_long.cu +++ b/lib/gpu/lal_coul_slater_long.cu @@ -1,6 +1,6 @@ // ************************************************************************** // coul_slater_long.cu -// ------------------- +// ------------------- // Trung Nguyen (U Chicago) // // Device code for acceleration of the coul/slater/long pair style diff --git a/lib/gpu/lal_coul_slater_long_ext.cpp b/lib/gpu/lal_coul_slater_long_ext.cpp index fb5d9725c91..8c34cc55529 100644 --- a/lib/gpu/lal_coul_slater_long_ext.cpp +++ b/lib/gpu/lal_coul_slater_long_ext.cpp @@ -1,6 +1,6 @@ /*************************************************************************** coul_slater_long_ext.cpp - ------------------- + ------------------------ Trung Nguyen (U Chicago) Functions for LAMMPS access to coul/slater/long acceleration routines. diff --git a/lib/gpu/lal_edpd.h b/lib/gpu/lal_edpd.h index c293b86ee66..e5f7b0633bf 100644 --- a/lib/gpu/lal_edpd.h +++ b/lib/gpu/lal_edpd.h @@ -3,7 +3,7 @@ ------------------- Trung Dac Nguyen (U Chicago) - Class for acceleration of the dpd pair style. + Class for acceleration of the edpd pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) diff --git a/lib/gpu/lal_lj_coul_long_soft.cpp b/lib/gpu/lal_lj_coul_long_soft.cpp index 2b90333bae0..80eaaca94a0 100644 --- a/lib/gpu/lal_lj_coul_long_soft.cpp +++ b/lib/gpu/lal_lj_coul_long_soft.cpp @@ -10,7 +10,7 @@ __________________________________________________________________________ begin : - email : ndtrung@uchicago.edu + email : ndactrung@gmail.com ***************************************************************************/ #if defined(USE_OPENCL) diff --git a/lib/gpu/lal_lj_coul_long_soft.cu b/lib/gpu/lal_lj_coul_long_soft.cu index 7e55f34a45c..e311bb5d3b3 100644 --- a/lib/gpu/lal_lj_coul_long_soft.cu +++ b/lib/gpu/lal_lj_coul_long_soft.cu @@ -10,7 +10,7 @@ // __________________________________________________________________________ // // begin : -// email : ndtrung@uchicago.edu +// email : ndactrung@gmail.com // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) diff --git a/lib/gpu/lal_lj_coul_long_soft.h b/lib/gpu/lal_lj_coul_long_soft.h index 1ef5f8134da..b3d4bff4a49 100644 --- a/lib/gpu/lal_lj_coul_long_soft.h +++ b/lib/gpu/lal_lj_coul_long_soft.h @@ -10,7 +10,7 @@ __________________________________________________________________________ begin : - email : ndtrung@uchicago.edu + email : ndactrung@gmail.com ***************************************************************************/ #ifndef LAL_LJ_COUL_LONG_SOFT_H diff --git a/lib/gpu/lal_lj_coul_long_soft_ext.cpp b/lib/gpu/lal_lj_coul_long_soft_ext.cpp index efccbd6ab86..cb2657c03bd 100644 --- a/lib/gpu/lal_lj_coul_long_soft_ext.cpp +++ b/lib/gpu/lal_lj_coul_long_soft_ext.cpp @@ -1,6 +1,6 @@ /*************************************************************************** lj_coul_long_soft_ext.cpp - ------------------- + ------------------------- Trung Nguyen (U Chicago) Functions for LAMMPS access to lj/cut/coul/long/soft acceleration routines. @@ -10,7 +10,7 @@ __________________________________________________________________________ begin : - email : ndtrung@uchicago.edu + email : ndactrung@gmail.com ***************************************************************************/ #include diff --git a/lib/gpu/lal_sph_lj.cpp b/lib/gpu/lal_sph_lj.cpp index dbe44bf30a9..66c2a5c3027 100644 --- a/lib/gpu/lal_sph_lj.cpp +++ b/lib/gpu/lal_sph_lj.cpp @@ -1,7 +1,7 @@ /*************************************************************************** sph_lj.cpp ------------------- - Trung Dac Nguyen (U Chicago) + Trung Nguyen (U Chicago) Class for acceleration of the sph_lj pair style. diff --git a/lib/gpu/lal_sph_lj.h b/lib/gpu/lal_sph_lj.h index 245bf872903..e79c2ba265e 100644 --- a/lib/gpu/lal_sph_lj.h +++ b/lib/gpu/lal_sph_lj.h @@ -1,7 +1,7 @@ /*************************************************************************** sph_lj.h ------------------- - Trung Dac Nguyen (U Chicago) + Trung Nguyen (U Chicago) Class for acceleration of the sph lj pair style. diff --git a/lib/gpu/lal_sph_lj_ext.cpp b/lib/gpu/lal_sph_lj_ext.cpp index 488f7b12007..55f85c030e2 100644 --- a/lib/gpu/lal_sph_lj_ext.cpp +++ b/lib/gpu/lal_sph_lj_ext.cpp @@ -3,7 +3,7 @@ ------------------- Trung Dac Nguyen (U Chicago) - Functions for LAMMPS access to sph lj acceleration routines. + Functions for LAMMPS access to sph/lj acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) From 3830711decd56c6cf304524c3cc56524ccb68223 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 9 Dec 2023 15:15:13 -0600 Subject: [PATCH 17/30] Added the GPU version of sph/heatconduction --- lib/gpu/lal_sph_heatconduction.cpp | 222 ++++++++++++++++++++ lib/gpu/lal_sph_heatconduction.cu | 265 ++++++++++++++++++++++++ lib/gpu/lal_sph_heatconduction.h | 95 +++++++++ lib/gpu/lal_sph_heatconduction_ext.cpp | 129 ++++++++++++ lib/gpu/lal_sph_lj.cu | 10 +- src/GPU/pair_sph_heatconduction_gpu.cpp | 197 ++++++++++++++++++ src/GPU/pair_sph_heatconduction_gpu.h | 48 +++++ 7 files changed, 961 insertions(+), 5 deletions(-) create mode 100644 lib/gpu/lal_sph_heatconduction.cpp create mode 100644 lib/gpu/lal_sph_heatconduction.cu create mode 100644 lib/gpu/lal_sph_heatconduction.h create mode 100644 lib/gpu/lal_sph_heatconduction_ext.cpp create mode 100644 src/GPU/pair_sph_heatconduction_gpu.cpp create mode 100644 src/GPU/pair_sph_heatconduction_gpu.h diff --git a/lib/gpu/lal_sph_heatconduction.cpp b/lib/gpu/lal_sph_heatconduction.cpp new file mode 100644 index 00000000000..e8e366e93a6 --- /dev/null +++ b/lib/gpu/lal_sph_heatconduction.cpp @@ -0,0 +1,222 @@ +/*************************************************************************** + sph_heatconduction.cpp + ------------------- + Trung Nguyen (U Chicago) + + Class for acceleration of the sph_heatconduction pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "sph_heatconduction_cl.h" +#elif defined(USE_CUDART) +const char *sph_heatconduction=0; +#else +#include "sph_heatconduction_cubin.h" +#endif + +#include "lal_sph_heatconduction.h" +#include +namespace LAMMPS_AL { +#define SPHHeatConductionT SPHHeatConduction + +extern Device device; + +template +SPHHeatConductionT::SPHHeatConduction() : BaseSPH(), _allocated(false) { + _max_dE_size = 0; +} + +template +SPHHeatConductionT::~SPHHeatConduction() { + clear(); +} + +template +int SPHHeatConductionT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int SPHHeatConductionT::init(const int ntypes, + double **host_cutsq, double **host_cut, + double **host_alpha, double* host_mass, + const int dimension, double *host_special_lj, + const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + int extra_fields = 4; // round up to accomodate quadruples of numtyp values + // rho, esph + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,sph_heatconduction,"k_sph_heatconduction", + onetype,extra_fields); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_alpha, + host_cut, host_cutsq); + + UCL_H_Vec dview_mass(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < ntypes; i++) + dview_mass[i] = host_mass[i]; + mass.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(mass,dview_mass,false); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + // allocate per-atom array Q + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_dE_size=static_cast(static_cast(ef_nall)*1.10); + dE.alloc(_max_dE_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _dimension = dimension; + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+dE.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void SPHHeatConductionT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + mass.clear(); + dE.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double SPHHeatConductionT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(SPHHeatConduction); +} + +template +void SPHHeatConductionT::update_dE(void **dE_ptr) { + *dE_ptr=dE.host.begin(); + dE.update_host(_max_dE_size,false); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int SPHHeatConductionT::loop(const int eflag, const int vflag) { + + int nall = this->atom->nall(); + + // Resize dE array if necessary + if (nall > _max_dE_size) { + _max_dE_size=static_cast(static_cast(nall)*1.10); + dE.resize(_max_dE_size); + } + + // signal that we need to transfer extra data from the host + + this->atom->extra_data_unavail(); + + numtyp4 *pextra=reinterpret_cast(&(this->atom->extra[0])); + + int n = 0; + int nstride = 1; + for (int i = 0; i < nall; i++) { + int idx = n+i*nstride; + numtyp4 v; + v.x = rho[i]; + v.y = esph[i]; + v.z = 0; + v.w = 0; + pextra[idx] = v; + } + this->atom->add_extra_data(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &mass, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &dE, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &mass, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &dE, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); + } + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Get the extra data pointers from host +// --------------------------------------------------------------------------- + +template +void SPHHeatConductionT::get_extra_data(double *host_rho, double *host_esph) { + rho = host_rho; + esph = host_esph; +} + +template class SPHHeatConduction; +} diff --git a/lib/gpu/lal_sph_heatconduction.cu b/lib/gpu/lal_sph_heatconduction.cu new file mode 100644 index 00000000000..c88853e2cf0 --- /dev/null +++ b/lib/gpu/lal_sph_heatconduction.cu @@ -0,0 +1,265 @@ +// ************************************************************************** +// sph_heatconduction.cu +// --------------------- +// Trung Dac Nguyen (U Chicago) +// +// Device code for acceleration of the sph/heatconduction pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : September 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( vel_tex,float4); +#else +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); +#endif +#else +#define pos_tex x_ +#define vel_tex v_ +#endif + +#if (SHUFFLE_AVAIL == 0) + +#define store_dE(dEacc, ii, inum, tid, t_per_atom, offset, dE) \ + if (t_per_atom>1) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, dEacc); \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + dEacc += shfl_down(dEacc, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii +class SPHHeatConduction : public BaseSPH { + public: + SPHHeatConduction(); + ~SPHHeatConduction(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double** host_cut, double **host_alpha, double *host_mass, + const int dimension, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + void get_extra_data(double *host_rho, double *host_esph); + + /// copy desph from device to host + void update_dE(void **dE_ptr); + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = alpha, coeff.y = cut, coeff.z = cutsq + UCL_D_Vec coeff; + + /// per-type coeffs + UCL_D_Vec mass; + + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// Per-atom arrays + UCL_Vector dE; + int _max_dE_size; + + int _dimension; + + /// pointer to host data + double *rho, *esph, *cv; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_sph_heatconduction_ext.cpp b/lib/gpu/lal_sph_heatconduction_ext.cpp new file mode 100644 index 00000000000..1317ecaccc2 --- /dev/null +++ b/lib/gpu/lal_sph_heatconduction_ext.cpp @@ -0,0 +1,129 @@ +/*************************************************************************** + sph_heatconduction_ext.cpp + -------------------------- + Trung Dac Nguyen (U Chicago) + + Functions for LAMMPS access to sph/heatconduction acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_sph_heatconduction.h" + +using namespace std; +using namespace LAMMPS_AL; + +static SPHHeatConduction SPHHeatConductionMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int sph_heatconduction_gpu_init(const int ntypes, double **cutsq, double** host_cut, + double **host_alpha, double* host_mass, const int dimension, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + SPHHeatConductionMF.clear(); + gpu_mode=SPHHeatConductionMF.device->gpu_mode(); + double gpu_split=SPHHeatConductionMF.device->particle_split(); + int first_gpu=SPHHeatConductionMF.device->first_device(); + int last_gpu=SPHHeatConductionMF.device->last_device(); + int world_me=SPHHeatConductionMF.device->world_me(); + int gpu_rank=SPHHeatConductionMF.device->gpu_rank(); + int procs_per_gpu=SPHHeatConductionMF.device->procs_per_gpu(); + + SPHHeatConductionMF.device->init_message(screen,"sph_lj",first_gpu,last_gpu); + + bool message=false; + if (SPHHeatConductionMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=SPHHeatConductionMF.init(ntypes, cutsq, host_cut, host_alpha, host_mass, + dimension, special_lj, inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen); + + SPHHeatConductionMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + SPHHeatConductionMF.estimate_gpu_overhead(); + return init_ok; +} + +void sph_heatconduction_gpu_clear() { + SPHHeatConductionMF.clear(); +} + +int ** sph_heatconduction_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *host_tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v) { + return SPHHeatConductionMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, host_tag, nspecial, special, eflag, vflag, + eatom, vatom, host_start, ilist, jnum, cpu_time, success, + host_v); +} + +void sph_heatconduction_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal) { + SPHHeatConductionMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + host_tag, host_v, nlocal); +} + +void sph_heatconduction_gpu_get_extra_data(double *host_rho, double *host_esph) { + SPHHeatConductionMF.get_extra_data(host_rho, host_esph); +} + +void sph_heatconduction_gpu_update_dE(void **dE_ptr) { + SPHHeatConductionMF.update_dE(dE_ptr); +} + +double sph_heatconduction_gpu_bytes() { + return SPHHeatConductionMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_sph_lj.cu b/lib/gpu/lal_sph_lj.cu index b105632330e..b9fdc5e433f 100644 --- a/lib/gpu/lal_sph_lj.cu +++ b/lib/gpu/lal_sph_lj.cu @@ -224,7 +224,7 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_, f.z+=delz*force; // and change in density, drho[i] - drhoEacc.x += massj * delVdotDelR * wfd; + drhoEacc.x += mass_jtype * delVdotDelR * wfd; // change in thermal energy, desph[i] drhoEacc.y += deltaE; @@ -313,7 +313,6 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, numtyp rhoi = extrai.x; numtyp esphi = extrai.y; numtyp cvi = extrai.z; - numtyp massi= extrai.w; // compute pressure of particle i with LJ EOS numtyp fci[2]; @@ -331,6 +330,7 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, #endif numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; + int jtype = jx.w; #ifndef ONETYPE int mtype=itype+jx.w; const numtyp cutsq_p=cutsq[mtype]; @@ -345,6 +345,7 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_, numtyp rsq = delx*delx+dely*dely+delz*delz; if (rsq + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int sph_heatconduction_gpu_init(const int ntypes, double **cutsq, double** host_cut, + double **host_alpha, double* host_mass, + const int dimension, double *special_lj, + const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); +void sph_heatconduction_gpu_clear(); +int **sph_heatconduction_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *host_tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v); +void sph_heatconduction_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal); +void sph_heatconduction_gpu_get_extra_data(double *host_rho, double *host_esph); +void sph_heatconduction_gpu_update_dE(void **dE_ptr); +double sph_heatconduction_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairSPHHeatConductionGPU::PairSPHHeatConductionGPU(LAMMPS *lmp) : + PairSPHHeatConduction(lmp), gpu_mode(GPU_FORCE) +{ + dE_pinned = nullptr; + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairSPHHeatConductionGPU::~PairSPHHeatConductionGPU() +{ + sph_heatconduction_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairSPHHeatConductionGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double *rho = atom->rho; + double *esph = atom->esph; + double *cv = atom->cv; + sph_heatconduction_gpu_get_extra_data(rho, esph); + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = sph_heatconduction_gpu_compute_n( + neighbor->ago, inum, nall, atom->x, atom->type, + sublo, subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag, + eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + sph_heatconduction_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, + eflag_atom, vflag_atom, host_start, cpu_time, success, + atom->tag, atom->v, atom->nlocal); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + // get the drho and dE from device + + double *desph = atom->desph; + sph_heatconduction_gpu_update_dE(&dE_pinned); + + int nlocal = atom->nlocal; + if (acc_float) { + auto dE_ptr = (float *)dE_pinned; + for (int i = 0; i < nlocal; i++) { + desph[i] = dE_ptr[i]; + } + + } else { + auto dE_ptr = (float *)dE_pinned; + for (int i = 0; i < nlocal; i++) { + desph[i] = dE_ptr[i]; + } + } + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairSPHHeatConductionGPU::init_style() +{ + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i, j); + mcut *= mcut; + if (mcut > maxcut) maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + sph_heatconduction_gpu_init(atom->ntypes + 1, cutsq, cut, alpha, atom->mass, + domain->dimension, force->special_lj, atom->nlocal, + atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairSPHHeatConductionGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + sph_heatconduction_gpu_bytes(); +} diff --git a/src/GPU/pair_sph_heatconduction_gpu.h b/src/GPU/pair_sph_heatconduction_gpu.h new file mode 100644 index 00000000000..571334017db --- /dev/null +++ b/src/GPU/pair_sph_heatconduction_gpu.h @@ -0,0 +1,48 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(sph/heatconduction/gpu,PairSPHHeatConductionGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_SPH_HEATCONDUCTION_GPU_H +#define LMP_PAIR_SPH_HEATCONDUCTION_GPU_H + +#include "pair_sph_heatconduction.h" + +namespace LAMMPS_NS { + +class PairSPHHeatConductionGPU : public PairSPHHeatConduction { + public: + PairSPHHeatConductionGPU(LAMMPS *lmp); + ~PairSPHHeatConductionGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void *dE_pinned; + bool acc_float; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif From 267e360bacf64a0ae8e3f2604bbebaea772e294f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 9 Dec 2023 15:39:54 -0600 Subject: [PATCH 18/30] Fixed bugs with acquiring depsh from lib/gpu, updated the doc page --- doc/src/pair_sph_heatconduction.rst | 3 +++ lib/gpu/lal_sph_heatconduction.cu | 18 +++++------------- lib/gpu/lal_sph_heatconduction_ext.cpp | 2 +- src/GPU/pair_sph_heatconduction_gpu.cpp | 3 +-- 4 files changed, 10 insertions(+), 16 deletions(-) diff --git a/doc/src/pair_sph_heatconduction.rst b/doc/src/pair_sph_heatconduction.rst index 4716ed54fb5..e9004cb5a48 100644 --- a/doc/src/pair_sph_heatconduction.rst +++ b/doc/src/pair_sph_heatconduction.rst @@ -1,8 +1,11 @@ .. index:: pair_style sph/heatconduction +.. index:: pair_style sph/heatconduction/gpu pair_style sph/heatconduction command ===================================== +Accelerator Variants: *sph/heatconduction/gpu* + Syntax """""" diff --git a/lib/gpu/lal_sph_heatconduction.cu b/lib/gpu/lal_sph_heatconduction.cu index c88853e2cf0..4cea1433e35 100644 --- a/lib/gpu/lal_sph_heatconduction.cu +++ b/lib/gpu/lal_sph_heatconduction.cu @@ -82,7 +82,6 @@ __kernel void k_sph_heatconduction(const __global numtyp4 *restrict x_, numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp mass_itype = mass[itype]; - numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i]; const numtyp4 extrai = extra[i]; numtyp rhoi = extrai.x; @@ -96,7 +95,6 @@ __kernel void k_sph_heatconduction(const __global numtyp4 *restrict x_, numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int jtype=jx.w; - numtyp4 jv; fetch4(jv,j,vel_tex); //v_[j]; // Compute r12 numtyp delx = ix.x-jx.x; @@ -115,9 +113,8 @@ __kernel void k_sph_heatconduction(const __global numtyp4 *restrict x_, numtyp esphj = extraj.y; numtyp h = coeffy; // cut[itype][jtype] - ih = ucl_recip(h); // (numtyp)1.0 / h; + numtyp ih = ucl_recip(h); // (numtyp)1.0 / h; numtyp ihsq = ih * ih; - numtyp ihcub = ihsq * ih; numtyp wfd = h - ucl_sqrt(rsq); if (dimension == 3) { @@ -141,7 +138,7 @@ __kernel void k_sph_heatconduction(const __global numtyp4 *restrict x_, } // for nbor } // if ii - store_drhoE(dEacc,ii,inum,tid,t_per_atom,offset,drhoE); + store_drhoE(dEacc,ii,inum,tid,t_per_atom,offset,dE); } __kernel void k_sph_heatconduction_fast(const __global numtyp4 *restrict x_, @@ -172,7 +169,7 @@ __kernel void k_sph_heatconduction_fast(const __global numtyp4 *restrict x_, } __syncthreads(); #else - const numtyp coeffx=coeff_in[ONETYPE].x; // viscosity[itype][jtype] + const numtyp coeffx=coeff_in[ONETYPE].x; // alpha[itype][jtype] const numtyp coeffy=coeff_in[ONETYPE].y; // cut[itype][jtype] const numtyp cutsq_p=coeff_in[ONETYPE].z; // cutsq[itype][jtype] #endif @@ -193,8 +190,6 @@ __kernel void k_sph_heatconduction_fast(const __global numtyp4 *restrict x_, #ifndef ONETYPE int itype=fast_mul((int)MAX_SHARED_TYPES,iw); #endif - numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i]; - int itag=iv.w; const numtyp4 extrai = extra[i]; numtyp rhoi = extrai.x; @@ -214,8 +209,6 @@ __kernel void k_sph_heatconduction_fast(const __global numtyp4 *restrict x_, int mtype=itype+jx.w; const numtyp cutsq_p=cutsq[mtype]; #endif - numtyp4 jv; fetch4(jv,j,vel_tex); //v_[j]; - int jtag=jv.w; // Compute r12 numtyp delx = ix.x-jx.x; @@ -226,7 +219,7 @@ __kernel void k_sph_heatconduction_fast(const __global numtyp4 *restrict x_, if (rsqgpu_rank(); int procs_per_gpu=SPHHeatConductionMF.device->procs_per_gpu(); - SPHHeatConductionMF.device->init_message(screen,"sph_lj",first_gpu,last_gpu); + SPHHeatConductionMF.device->init_message(screen,"sph_heatconduction",first_gpu,last_gpu); bool message=false; if (SPHHeatConductionMF.device->replica_me()==0 && screen) diff --git a/src/GPU/pair_sph_heatconduction_gpu.cpp b/src/GPU/pair_sph_heatconduction_gpu.cpp index c14a6df9376..0f0aa079c8b 100644 --- a/src/GPU/pair_sph_heatconduction_gpu.cpp +++ b/src/GPU/pair_sph_heatconduction_gpu.cpp @@ -94,7 +94,6 @@ void PairSPHHeatConductionGPU::compute(int eflag, int vflag) double *rho = atom->rho; double *esph = atom->esph; - double *cv = atom->cv; sph_heatconduction_gpu_get_extra_data(rho, esph); if (gpu_mode != GPU_FORCE) { @@ -140,7 +139,7 @@ void PairSPHHeatConductionGPU::compute(int eflag, int vflag) } } else { - auto dE_ptr = (float *)dE_pinned; + auto dE_ptr = (double *)dE_pinned; for (int i = 0; i < nlocal; i++) { desph[i] = dE_ptr[i]; } From e727ec1eace97dfb55bbe2a44c134a5bfbf0ecf1 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 9 Dec 2023 16:46:10 -0600 Subject: [PATCH 19/30] Fixed compiling bugs revealed by CUDA builds, removed unused variables --- lib/gpu/lal_sph_heatconduction.cu | 6 +----- lib/gpu/lal_sph_lj.cu | 24 +++++++++--------------- lib/gpu/lal_sph_taitwater.cu | 10 ---------- 3 files changed, 10 insertions(+), 30 deletions(-) diff --git a/lib/gpu/lal_sph_heatconduction.cu b/lib/gpu/lal_sph_heatconduction.cu index 4cea1433e35..21c936347a7 100644 --- a/lib/gpu/lal_sph_heatconduction.cu +++ b/lib/gpu/lal_sph_heatconduction.cu @@ -160,10 +160,6 @@ __kernel void k_sph_heatconduction_fast(const __global numtyp4 *restrict x_, #ifndef ONETYPE __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp sp_lj[4]; - if (tid<4) { - sp_lj[tid]=sp_lj_in[tid]; - } if (tid Date: Sun, 10 Dec 2023 14:42:27 -0600 Subject: [PATCH 20/30] Added -allow-unsupported-compiler to nvcc for both CMake and traditional CUDA builds --- cmake/Modules/Packages/GPU.cmake | 4 ++-- lib/gpu/Makefile.linux_multi | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 47be8b85388..4c3288df842 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -151,10 +151,10 @@ if(GPU_API STREQUAL "CUDA") endif() cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS ${CUDA_REQUEST_PIC} - -DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES}) + -DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -allow-unsupported-compiler -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES}) cuda_compile(GPU_OBJS ${GPU_LIB_CUDPP_CU} OPTIONS ${CUDA_REQUEST_PIC} - -DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES}) + -DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -allow-unsupported-compiler -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES}) foreach(CU_OBJ ${GPU_GEN_OBJS}) get_filename_component(CU_NAME ${CU_OBJ} NAME_WE) diff --git a/lib/gpu/Makefile.linux_multi b/lib/gpu/Makefile.linux_multi index 3299bbec3a4..005f6590794 100644 --- a/lib/gpu/Makefile.linux_multi +++ b/lib/gpu/Makefile.linux_multi @@ -65,7 +65,7 @@ CUDA_PRECISION = -D_SINGLE_DOUBLE CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs -CUDA_OPTS = -DUNIX -O3 --use_fast_math $(LMP_INC) -Xcompiler -fPIC +CUDA_OPTS = -DUNIX -O3 --use_fast_math $(LMP_INC) -Xcompiler -fPIC -allow-unsupported-compiler CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC -std=c++11 CUDR_OPTS = -O2 $(LMP_INC) # -xHost -no-prec-div -ansi-alias From 15f8488fc44bb259b67a55e148a9a3564ba59eeb Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 11 Dec 2023 13:43:13 -0500 Subject: [PATCH 21/30] update developer contact email --- src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp | 2 +- src/GPU/pair_lj_cut_coul_cut_soft_gpu.h | 2 +- src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp | 2 +- src/GPU/pair_lj_cut_coul_long_soft_gpu.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp index 7b0d115d96f..20cbafd6c1e 100644 --- a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp @@ -1,7 +1,7 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h index fcd5439f7a3..8298d778275 100644 --- a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h +++ b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h @@ -1,7 +1,7 @@ /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp b/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp index 45d26f5155c..4846b66a300 100644 --- a/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp @@ -1,7 +1,7 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/src/GPU/pair_lj_cut_coul_long_soft_gpu.h b/src/GPU/pair_lj_cut_coul_long_soft_gpu.h index d4ff0e375fd..cb6790d3332 100644 --- a/src/GPU/pair_lj_cut_coul_long_soft_gpu.h +++ b/src/GPU/pair_lj_cut_coul_long_soft_gpu.h @@ -1,7 +1,7 @@ /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains From 6e0e1dc44fba098782269b2150ccc990f0f8c022 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 11 Dec 2023 17:10:43 -0600 Subject: [PATCH 22/30] Fixed atom force issues with the coul/long/slater kernels --- lib/gpu/lal_coul_slater_long.cu | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/lib/gpu/lal_coul_slater_long.cu b/lib/gpu/lal_coul_slater_long.cu index eccc5942112..678c9703e55 100644 --- a/lib/gpu/lal_coul_slater_long.cu +++ b/lib/gpu/lal_coul_slater_long.cu @@ -72,6 +72,7 @@ __kernel void k_coul_slater_long(const __global numtyp4 *restrict x_, numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp qtmp; fetch(qtmp,i,q_tex); + numtyp lamdainv = ucl_recip(lamda); for ( ; nbor (numtyp)0) force -= factor_coul*prefactor*((numtyp)1.0-slater_term); + force *= r2inv; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (EVFLAG && eflag) { - numtyp e_slater = (1 + r/lamda)*ucl_exp(-2*r/lamda); - e_coul += prefactor*(_erfc-e_slater - factor_coul); + numtyp e_slater = ((numtyp)1.0 + rlamdainv)*ucl_exp((numtyp)-2.0*rlamdainv); + e_coul += prefactor*(_erfc-e_slater-factor_coul); } if (EVFLAG && vflag) { virial[0] += delx*delx*force; @@ -177,6 +180,7 @@ __kernel void k_coul_slater_long_fast(const __global numtyp4 *restrict x_, numtyp qtmp; fetch(qtmp,i,q_tex); int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + numtyp lamdainv = ucl_recip(lamda); for ( ; nbor (numtyp)0) force -= factor_coul*prefactor*((numtyp)1.0-slater_term); + force *= r2inv; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (EVFLAG && eflag) { - numtyp e_slater = (1 + r/lamda)*ucl_exp(-2*r/lamda); + numtyp e_slater = ((numtyp)1.0 + rlamdainv)*ucl_exp((numtyp)-2.0*rlamdainv); e_coul += prefactor*(_erfc-e_slater-factor_coul); } if (EVFLAG && vflag) { From 1651a7741afe2de30ab895f46722bcb7b7c2cdc8 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 11 Dec 2023 18:51:09 -0500 Subject: [PATCH 23/30] relax epsilon for coul/slater long test a little for GPU forces --- unittest/force-styles/tests/mol-pair-coul_slater_long.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unittest/force-styles/tests/mol-pair-coul_slater_long.yaml b/unittest/force-styles/tests/mol-pair-coul_slater_long.yaml index ba11503a2cd..51b04f301c4 100644 --- a/unittest/force-styles/tests/mol-pair-coul_slater_long.yaml +++ b/unittest/force-styles/tests/mol-pair-coul_slater_long.yaml @@ -1,7 +1,7 @@ --- lammps_version: 23 Jun 2022 date_generated: Thu Jul 7 09:00:39 2022 -epsilon: 2e-13 +epsilon: 1e-12 skip_tests: prerequisites: ! | atom full From 93f8ada8dc2053b6188d5aba27a6b6c28d302acd Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 11 Dec 2023 21:21:31 -0600 Subject: [PATCH 24/30] Fixed ecoul issue with the coul/slater/long kernels --- lib/gpu/lal_coul_slater_long.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/gpu/lal_coul_slater_long.cu b/lib/gpu/lal_coul_slater_long.cu index 678c9703e55..1817faf2a33 100644 --- a/lib/gpu/lal_coul_slater_long.cu +++ b/lib/gpu/lal_coul_slater_long.cu @@ -114,7 +114,9 @@ __kernel void k_coul_slater_long(const __global numtyp4 *restrict x_, if (EVFLAG && eflag) { numtyp e_slater = ((numtyp)1.0 + rlamdainv)*ucl_exp((numtyp)-2.0*rlamdainv); - e_coul += prefactor*(_erfc-e_slater-factor_coul); + numtyp e = prefactor*(_erfc-e_slater); + if (factor_coul > (numtyp)0) e -= factor_coul*prefactor*((numtyp)1.0 - e_slater); + e_coul += e; } if (EVFLAG && vflag) { virial[0] += delx*delx*force; @@ -222,7 +224,9 @@ __kernel void k_coul_slater_long_fast(const __global numtyp4 *restrict x_, if (EVFLAG && eflag) { numtyp e_slater = ((numtyp)1.0 + rlamdainv)*ucl_exp((numtyp)-2.0*rlamdainv); - e_coul += prefactor*(_erfc-e_slater-factor_coul); + numtyp e = prefactor*(_erfc-e_slater); + if (factor_coul > (numtyp)0) e -= factor_coul*prefactor*((numtyp)1.0 - e_slater); + e_coul += e; } if (EVFLAG && vflag) { virial[0] += delx*delx*force; From 669782cd5fe1f85698bba33ece6d382944ec91f8 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 11 Dec 2023 21:26:12 -0600 Subject: [PATCH 25/30] Saved some exp operations --- lib/gpu/lal_coul_slater_long.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/gpu/lal_coul_slater_long.cu b/lib/gpu/lal_coul_slater_long.cu index 1817faf2a33..1fc8ab8be4e 100644 --- a/lib/gpu/lal_coul_slater_long.cu +++ b/lib/gpu/lal_coul_slater_long.cu @@ -103,7 +103,8 @@ __kernel void k_coul_slater_long(const __global numtyp4 *restrict x_, _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; fetch(prefactor,j,q_tex); numtyp rlamdainv = r * lamdainv; - numtyp slater_term = ucl_exp(-(numtyp)2.0*rlamdainv)*((numtyp)1.0 + ((numtyp)2.0*rlamdainv*((numtyp)1.0+rlamdainv))); + numtyp exprlmdainv = ucl_exp((numtyp)-2.0*rlamdainv); + numtyp slater_term = exprlmdainv*((numtyp)1.0 + ((numtyp)2.0*rlamdainv*((numtyp)1.0+rlamdainv))); force = prefactor*(_erfc + EWALD_F*grij*expm2-slater_term); if (factor_coul > (numtyp)0) force -= factor_coul*prefactor*((numtyp)1.0-slater_term); force *= r2inv; @@ -113,7 +114,7 @@ __kernel void k_coul_slater_long(const __global numtyp4 *restrict x_, f.z+=delz*force; if (EVFLAG && eflag) { - numtyp e_slater = ((numtyp)1.0 + rlamdainv)*ucl_exp((numtyp)-2.0*rlamdainv); + numtyp e_slater = ((numtyp)1.0 + rlamdainv)*exprlmdainv; numtyp e = prefactor*(_erfc-e_slater); if (factor_coul > (numtyp)0) e -= factor_coul*prefactor*((numtyp)1.0 - e_slater); e_coul += e; @@ -213,7 +214,8 @@ __kernel void k_coul_slater_long_fast(const __global numtyp4 *restrict x_, fetch(prefactor,j,q_tex); prefactor *= qqrd2e * scale[mtype] * qtmp/r; numtyp rlamdainv = r * lamdainv; - numtyp slater_term = ucl_exp((numtyp)-2.0*rlamdainv)*((numtyp)1.0 + ((numtyp)2.0*rlamdainv*((numtyp)1.0+rlamdainv))); + numtyp exprlmdainv = ucl_exp((numtyp)-2.0*rlamdainv); + numtyp slater_term = exprlmdainv*((numtyp)1.0 + ((numtyp)2.0*rlamdainv*((numtyp)1.0+rlamdainv))); force = prefactor*(_erfc + EWALD_F*grij*expm2-slater_term); if (factor_coul > (numtyp)0) force -= factor_coul*prefactor*((numtyp)1.0-slater_term); force *= r2inv; @@ -223,7 +225,7 @@ __kernel void k_coul_slater_long_fast(const __global numtyp4 *restrict x_, f.z+=delz*force; if (EVFLAG && eflag) { - numtyp e_slater = ((numtyp)1.0 + rlamdainv)*ucl_exp((numtyp)-2.0*rlamdainv); + numtyp e_slater = ((numtyp)1.0 + rlamdainv)*exprlmdainv; numtyp e = prefactor*(_erfc-e_slater); if (factor_coul > (numtyp)0) e -= factor_coul*prefactor*((numtyp)1.0 - e_slater); e_coul += e; From 7881eeaa1c42b6adad32c2abeb5be193556a8df7 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 11 Dec 2023 23:15:13 -0500 Subject: [PATCH 26/30] silence warnings about unused variables --- src/GPU/pair_coul_slater_long_gpu.cpp | 3 +-- src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp | 4 ++-- src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/GPU/pair_coul_slater_long_gpu.cpp b/src/GPU/pair_coul_slater_long_gpu.cpp index d812ebe2004..4ace8bd7619 100644 --- a/src/GPU/pair_coul_slater_long_gpu.cpp +++ b/src/GPU/pair_coul_slater_long_gpu.cpp @@ -183,9 +183,8 @@ double PairCoulSlaterLongGPU::memory_usage() void PairCoulSlaterLongGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist, int *numneigh, int **firstneigh) { - int i, j, ii, jj, jnum, itable; + int i, j, ii, jj, jnum; double qtmp, xtmp, ytmp, ztmp, delx, dely, delz, ecoul, fpair; - double fraction, table; double r, r2inv, forcecoul, factor_coul; double grij, expm2, prefactor, t, erfc; int *jlist; diff --git a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp index 20cbafd6c1e..d65e3bd4cf8 100644 --- a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp @@ -166,9 +166,9 @@ double PairLJCutCoulCutSoftGPU::memory_usage() void PairLJCutCoulCutSoftGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist, int *numneigh, int **firstneigh) { - int i, j, ii, jj, jnum, itype, jtype, itable; + int i, j, ii, jj, jnum, itype, jtype; double qtmp, xtmp, ytmp, ztmp, delx, dely, delz, evdwl, ecoul, fpair; - double r, r2inv, r6inv, forcecoul, forcelj, factor_coul, factor_lj; + double forcecoul, forcelj, factor_coul, factor_lj; double denc, denlj, r4sig6; int *jlist; double rsq; diff --git a/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp b/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp index 4846b66a300..e8342b65308 100644 --- a/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp @@ -201,9 +201,9 @@ double PairLJCutCoulLongSoftGPU::memory_usage() void PairLJCutCoulLongSoftGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist, int *numneigh, int **firstneigh) { - int i, j, ii, jj, jnum, itype, jtype, itable; + int i, j, ii, jj, jnum, itype, jtype; double qtmp, xtmp, ytmp, ztmp, delx, dely, delz, evdwl, ecoul, fpair; - double r, r2inv, r6inv, forcecoul, forcelj, factor_coul, factor_lj; + double r, r2inv, forcecoul, forcelj, factor_coul, factor_lj; double denc, denlj, r4sig6; double grij, expm2, prefactor, t, erfc; int *jlist; From 7ed8779d2849ca479edcf3cea8eb2a823ca4d962 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 11 Dec 2023 23:18:08 -0500 Subject: [PATCH 27/30] tweak another epsilon for new GPU pair style --- unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml b/unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml index 8eca0650920..a1e89e54c0e 100644 --- a/unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml +++ b/unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml @@ -1,7 +1,7 @@ --- lammps_version: 17 Feb 2022 date_generated: Fri Mar 18 22:17:31 2022 -epsilon: 5e-12 +epsilon: 7.5e-12 skip_tests: prerequisites: ! | atom full From ecc460b3588d63d794eb798e4399476ab4d3afe3 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 12 Dec 2023 17:17:32 -0600 Subject: [PATCH 28/30] Added the new pair styles to Command_pairs.rst and pair_style.rst --- doc/src/Commands_pair.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index 828f0b10d94..e7761e7bee7 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -87,7 +87,7 @@ OPT. * :doc:`coul/long/soft (o) ` * :doc:`coul/msm (o) ` * :doc:`coul/slater/cut ` - * :doc:`coul/slater/long ` + * :doc:`coul/slater/long (g) ` * :doc:`coul/shield ` * :doc:`coul/streitz ` * :doc:`coul/tt ` @@ -110,7 +110,7 @@ OPT. * :doc:`eam/he ` * :doc:`edip (o) ` * :doc:`edip/multi ` - * :doc:`edpd ` + * :doc:`edpd (g) ` * :doc:`eff/cut ` * :doc:`eim (o) ` * :doc:`exp6/rx (k) ` @@ -158,14 +158,14 @@ OPT. * :doc:`lj/cut (gikot) ` * :doc:`lj/cut/coul/cut (gko) ` * :doc:`lj/cut/coul/cut/dielectric (o) ` - * :doc:`lj/cut/coul/cut/soft (o) ` + * :doc:`lj/cut/coul/cut/soft (go) ` * :doc:`lj/cut/coul/debye (gko) ` * :doc:`lj/cut/coul/debye/dielectric (o) ` * :doc:`lj/cut/coul/dsf (gko) ` * :doc:`lj/cut/coul/long (gikot) ` * :doc:`lj/cut/coul/long/cs ` * :doc:`lj/cut/coul/long/dielectric (o) ` - * :doc:`lj/cut/coul/long/soft (o) ` + * :doc:`lj/cut/coul/long/soft (go) ` * :doc:`lj/cut/coul/msm (go) ` * :doc:`lj/cut/coul/msm/dielectric ` * :doc:`lj/cut/coul/wolf (o) ` @@ -202,7 +202,7 @@ OPT. * :doc:`lubricate/poly (o) ` * :doc:`lubricateU ` * :doc:`lubricateU/poly ` - * :doc:`mdpd ` + * :doc:`mdpd (g) ` * :doc:`mdpd/rhosum ` * :doc:`meam (k) ` * :doc:`meam/ms (k) ` @@ -268,11 +268,11 @@ OPT. * :doc:`smtbq ` * :doc:`snap (ik) ` * :doc:`soft (go) ` - * :doc:`sph/heatconduction ` + * :doc:`sph/heatconduction (g) ` * :doc:`sph/idealgas ` - * :doc:`sph/lj ` + * :doc:`sph/lj (g) ` * :doc:`sph/rhosum ` - * :doc:`sph/taitwater ` + * :doc:`sph/taitwater (g) ` * :doc:`sph/taitwater/morris ` * :doc:`spin/dipole/cut ` * :doc:`spin/dipole/long ` From 8f7d7f9178ec9146977942004cf52d11d3796039 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 12 Dec 2023 20:15:05 -0600 Subject: [PATCH 29/30] Fixed typo in the pair style lj/cut/coul/cut/soft/gpu header --- src/GPU/pair_lj_cut_coul_cut_soft_gpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h index 8298d778275..0776695ba3d 100644 --- a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h +++ b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h @@ -13,7 +13,7 @@ #ifdef PAIR_CLASS // clang-format off -PairStyle(lj/cut/coul/soft/soft/gpu,PairLJCutCoulCutSoftGPU); +PairStyle(lj/cut/coul/cut/soft/gpu,PairLJCutCoulCutSoftGPU); // clang-format on #else From 569c23a1e60fd6cb447e26df5360cf59de3d4653 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 12 Dec 2023 22:54:50 -0600 Subject: [PATCH 30/30] Fixed typo with the kernel names in lj/cut/coul/cut/soft --- lib/gpu/lal_lj_coul_soft.cpp | 2 +- lib/gpu/lal_lj_coul_soft.cu | 4 ++-- lib/gpu/lal_lj_coul_soft_ext.cpp | 2 +- src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/gpu/lal_lj_coul_soft.cpp b/lib/gpu/lal_lj_coul_soft.cpp index 96bcf41aa36..9ee6486817e 100644 --- a/lib/gpu/lal_lj_coul_soft.cpp +++ b/lib/gpu/lal_lj_coul_soft.cpp @@ -56,7 +56,7 @@ int LJCoulSoftT::init(const int ntypes, double *host_special_coul, const double qqrd2e) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj_coul_soft,"lj_coul_soft"); + _screen,lj_coul_soft,"k_lj_coul_soft"); if (success!=0) return success; diff --git a/lib/gpu/lal_lj_coul_soft.cu b/lib/gpu/lal_lj_coul_soft.cu index 74da36e2c17..1fc564bde67 100644 --- a/lib/gpu/lal_lj_coul_soft.cu +++ b/lib/gpu/lal_lj_coul_soft.cu @@ -225,7 +225,7 @@ __kernel void k_lj_coul_soft_fast(const __global numtyp4 *restrict x_, numtyp forcecoul, force_lj, force; numtyp r4sig6, denlj, denc; - if (rsq < lj1[mtype].z) { + if (rsq < lj1[mtype].z) { // cut_ljsq[itype][jtype] r4sig6 = rsq*rsq / lj1[mtype].y; denlj = lj3[mtype].x + rsq*r4sig6; force_lj = lj1[mtype].x * lj3[mtype].w * @@ -234,7 +234,7 @@ __kernel void k_lj_coul_soft_fast(const __global numtyp4 *restrict x_, } else force_lj = (numtyp)0.0; - if (rsq < lj1[mtype].w) { + if (rsq < lj1[mtype].w) { // cut_coulsq[itype][jtype] fetch(forcecoul,j,q_tex); denc = sqrt(lj3[mtype].y + rsq); forcecoul *= qqrd2e * lj1[mtype].x * qtmp / (denc*denc*denc); diff --git a/lib/gpu/lal_lj_coul_soft_ext.cpp b/lib/gpu/lal_lj_coul_soft_ext.cpp index 8c10db14f69..02d367b3c74 100644 --- a/lib/gpu/lal_lj_coul_soft_ext.cpp +++ b/lib/gpu/lal_lj_coul_soft_ext.cpp @@ -43,7 +43,7 @@ int ljcs_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int gpu_rank=LJCSMF.device->gpu_rank(); int procs_per_gpu=LJCSMF.device->procs_per_gpu(); - LJCSMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu); + LJCSMF.device->init_message(screen,"lj/cut/coul/cut/soft",first_gpu,last_gpu); bool message=false; if (LJCSMF.device->replica_me()==0 && screen) diff --git a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp index d65e3bd4cf8..cfde3ab632d 100644 --- a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp @@ -110,6 +110,8 @@ void PairLJCutCoulCutSoftGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); @@ -227,13 +229,11 @@ void PairLJCutCoulCutSoftGPU::cpu_compute(int start, int inum, int eflag, int /* f[i][1] += dely * fpair; f[i][2] += delz * fpair; - if (eflag) { if (rsq < cut_coulsq[itype][jtype]) ecoul = factor_coul * qqrd2e * lj1[itype][jtype] * qtmp*q[j] / denc; else ecoul = 0.0; - if (rsq < cut_ljsq[itype][jtype]) { evdwl = lj1[itype][jtype] * 4.0 * epsilon[itype][jtype] * (1.0/(denlj*denlj) - 1.0/denlj) - offset[itype][jtype];