Skip to content

Commit

Permalink
Merge pull request #767 from brucefan1983/remove_syncwarp
Browse files Browse the repository at this point in the history
remove syncwarp
  • Loading branch information
brucefan1983 authored Oct 23, 2024
2 parents 208d4d1 + bf3a0cb commit dafd341
Show file tree
Hide file tree
Showing 10 changed files with 24 additions and 120 deletions.
16 changes: 2 additions & 14 deletions src/force/force.cu
Original file line number Diff line number Diff line change
Expand Up @@ -227,18 +227,12 @@ static __global__ void gpu_sum_force(int N, double* g_fx, double* g_fy, double*
__syncthreads();

#pragma unroll
for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_f[tid] += s_f[tid + offset];
}
__syncthreads();
}
for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
s_f[tid] += s_f[tid + offset];
}
__syncwarp();
}

if (tid == 0) {
g_f[bid] = s_f[0];
Expand Down Expand Up @@ -654,18 +648,12 @@ static __global__ void gpu_sum_tensor(int N, double* g_tensor, double* g_sum_ten
__syncthreads();

#pragma unroll
for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_t[tid] += s_t[tid + offset];
}
__syncthreads();
}
for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
s_t[tid] += s_t[tid + offset];
}
__syncwarp();
}

if (tid == 0) {
g_sum_tensor[bid] = s_t[0];
Expand Down
10 changes: 1 addition & 9 deletions src/integrate/ensemble_pimd.cu
Original file line number Diff line number Diff line change
Expand Up @@ -371,22 +371,14 @@ gpu_find_momentum_beads(const int number_of_atoms, const double* g_mass, double*
}
__syncthreads();

for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
for (int d = 0; d < 4; ++d) {
s_momentum[d][tid] += s_momentum[d][tid + offset];
}
}
__syncthreads();
}
for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
for (int d = 0; d < 4; ++d) {
s_momentum[d][tid] += s_momentum[d][tid + offset];
}
}
__syncwarp();
}

if (tid == 0) {
for (int d = 0; d < 4; ++d) {
Expand Down
8 changes: 1 addition & 7 deletions src/integrate/langevin_utilities.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -97,18 +97,12 @@ static __global__ void gpu_find_momentum(
s_momentum[tid] = momentum;
__syncthreads();

for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_momentum[tid] += s_momentum[tid + offset];
}
__syncthreads();
}
for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
s_momentum[tid] += s_momentum[tid + offset];
}
__syncwarp();
}

if (tid == 0) {
device_momentum[bid] = s_momentum[0];
Expand Down
8 changes: 1 addition & 7 deletions src/main_gpumd/add_random_force.cu
Original file line number Diff line number Diff line change
Expand Up @@ -89,18 +89,12 @@ static __global__ void gpu_sum_force(int N, double* g_fx, double* g_fy, double*
__syncthreads();

#pragma unroll
for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_f[tid] += s_f[tid + offset];
}
__syncthreads();
}
for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
s_f[tid] += s_f[tid + offset];
}
__syncwarp();
}

if (tid == 0) {
device_total_force[bid] = s_f[0];
Expand Down
8 changes: 1 addition & 7 deletions src/main_gpumd/electron_stop.cu
Original file line number Diff line number Diff line change
Expand Up @@ -94,18 +94,12 @@ static __global__ void find_force_average(int num_atoms, double* g_force)
s_f[tid] = f;
__syncthreads();

for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_f[tid] += s_f[tid + offset];
}
__syncthreads();
}
for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
s_f[tid] += s_f[tid + offset];
}
__syncwarp();
}

if (tid == 0) {
device_force_average[bid] = s_f[0] / num_atoms;
Expand Down
38 changes: 4 additions & 34 deletions src/main_nep/dataset.cu
Original file line number Diff line number Diff line change
Expand Up @@ -380,20 +380,13 @@ static __global__ void gpu_sum_force_error(
}
__syncthreads();

for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_error[tid] += s_error[tid + offset];
}
__syncthreads();
}

for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
s_error[tid] += s_error[tid + offset];
}
__syncwarp();
}

if (tid == 0) {
error_gpu[bid] = s_error[0];
}
Expand Down Expand Up @@ -456,20 +449,13 @@ gpu_get_energy_shift(int* g_Na, int* g_Na_sum, float* g_pe, float* g_pe_ref, flo
}
__syncthreads();

for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_pe[tid] += s_pe[tid + offset];
}
__syncthreads();
}

for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
s_pe[tid] += s_pe[tid + offset];
}
__syncwarp();
}

if (tid == 0) {
float diff = s_pe[0] / Na - g_pe_ref[bid];
g_energy_shift[bid] = diff;
Expand All @@ -492,20 +478,13 @@ static __global__ void gpu_sum_pe_error(
}
__syncthreads();

for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_pe[tid] += s_pe[tid + offset];
}
__syncthreads();
}

for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
s_pe[tid] += s_pe[tid + offset];
}
__syncwarp();
}

if (tid == 0) {
float diff = s_pe[0] / Na - g_pe_ref[bid] - energy_shift;
error_gpu[bid] = diff * diff;
Expand Down Expand Up @@ -589,7 +568,7 @@ static __global__ void gpu_sum_virial_error(
}
__syncthreads();

for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
for (int d = 0; d < 6; ++d) {
s_virial[d * blockDim.x + tid] += s_virial[d * blockDim.x + tid + offset];
Expand All @@ -598,15 +577,6 @@ static __global__ void gpu_sum_virial_error(
__syncthreads();
}

for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
for (int d = 0; d < 6; ++d) {
s_virial[d * blockDim.x + tid] += s_virial[d * blockDim.x + tid + offset];
}
}
__syncwarp();
}

if (tid == 0) {
float error_sum = 0.0f;
for (int d = 0; d < 6; ++d) {
Expand Down
20 changes: 2 additions & 18 deletions src/main_nep/snes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -330,22 +330,14 @@ static __global__ void gpu_find_L1_L2_NEP4(
}
__syncthreads();

for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_cost_L1reg[tid] += s_cost_L1reg[tid + offset];
s_cost_L2reg[tid] += s_cost_L2reg[tid + offset];
}
__syncthreads();
}

for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
s_cost_L1reg[tid] += s_cost_L1reg[tid + offset];
s_cost_L2reg[tid] += s_cost_L2reg[tid + offset];
}
__syncwarp();
}

if (tid == 0) {
gpu_cost_L1reg[bid] = s_cost_L1reg[0];
gpu_cost_L2reg[bid] = s_cost_L2reg[0];
Expand Down Expand Up @@ -406,22 +398,14 @@ static __global__ void gpu_find_L1_L2(
}
__syncthreads();

for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_cost_L1reg[tid] += s_cost_L1reg[tid + offset];
s_cost_L2reg[tid] += s_cost_L2reg[tid + offset];
}
__syncthreads();
}

for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
s_cost_L1reg[tid] += s_cost_L1reg[tid + offset];
s_cost_L2reg[tid] += s_cost_L2reg[tid + offset];
}
__syncwarp();
}

if (tid == 0) {
gpu_cost_L1reg[bid] = s_cost_L1reg[0];
gpu_cost_L2reg[bid] = s_cost_L2reg[0];
Expand Down
8 changes: 1 addition & 7 deletions src/measure/dump_dipole.cu
Original file line number Diff line number Diff line change
Expand Up @@ -52,18 +52,12 @@ static __global__ void sum_dipole(

// aggregate the patches in parallel
#pragma unroll
for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_d[tid] += s_d[tid + offset];
}
__syncthreads();
}
for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
s_d[tid] += s_d[tid + offset];
}
__syncwarp();
}

// save the final value
if (tid == 0) {
Expand Down
8 changes: 1 addition & 7 deletions src/measure/dump_polarizability.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,18 +56,12 @@ static __global__ void sum_polarizability(

// aggregate the patches in parallel
#pragma unroll
for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) {
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_p[tid] += s_p[tid + offset];
}
__syncthreads();
}
for (int offset = 32; offset > 0; offset >>= 1) {
if (tid < offset) {
s_p[tid] += s_p[tid + offset];
}
__syncwarp();
}

// save the final value
if (tid == 0) {
Expand Down
20 changes: 10 additions & 10 deletions tests/gpumd/carbon/thermo1.out
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
3.0070239595e+02 2.4876054593e+03 -1.6374740626e+05 -4.9616458347e-01 -1.6912919950e-01 3.5673276051e-03 2.4386945299e-03 -2.6912030640e-01 1.0069715166e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0028361711e+02 2.4841410488e+03 -1.6374398830e+05 -5.5494401174e-01 -8.7591048145e-02 -5.0432348237e-03 -2.3679409769e-02 -2.4354585861e-01 1.9885808946e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0090760001e+02 2.4893030406e+03 -1.6374911269e+05 -4.8069691755e-01 -9.9976336074e-02 6.1171437172e-03 5.0698276414e-02 -2.6632055902e-01 1.7746335386e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
2.9928310328e+02 2.4758641489e+03 -1.6373553821e+05 -5.3300657578e-01 -8.9159184288e-02 1.7190277772e-02 -1.7661720635e-02 -2.8009973914e-01 1.3277642913e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0056896760e+02 2.4865016534e+03 -1.6374628179e+05 -4.6676187670e-01 -1.9398202792e-01 6.0657253169e-02 2.4843680762e-02 -2.6412937631e-01 1.2556339662e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0177745121e+02 2.4964990145e+03 -1.6375641970e+05 -5.7855570392e-01 -1.3026857442e-01 -2.4558633045e-02 3.1861800608e-02 -3.1819298120e-01 1.3576836171e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
2.9932973117e+02 2.4762498851e+03 -1.6373603799e+05 -5.4164328240e-01 -1.2870624839e-01 7.3976663441e-02 -3.7364902470e-03 -2.4645646405e-01 1.1570002894e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0019241526e+02 2.4833865679e+03 -1.6374314826e+05 -5.7517205747e-01 -1.9081967489e-01 3.0024095278e-02 3.6496184974e-03 -2.9928554061e-01 1.2112666925e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0102978417e+02 2.4903138273e+03 -1.6375013168e+05 -5.4688206625e-01 -1.4423854074e-01 -2.9289295474e-02 -6.9361765237e-02 -2.9886508948e-01 1.1227931829e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0053797977e+02 2.4862453020e+03 -1.6374602890e+05 -4.7482065484e-01 -7.5619589181e-02 2.2007463507e-02 -5.4715753719e-02 -3.1145417781e-01 1.2338681735e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0070239696e+02 2.4876054677e+03 -1.6374740681e+05 -4.9616458721e-01 -1.6912905599e-01 3.5674571088e-03 2.4386255848e-03 -2.6912030874e-01 1.0069714895e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0028361454e+02 2.4841410276e+03 -1.6374398954e+05 -5.5494394538e-01 -8.7591274243e-02 -5.0432026470e-03 -2.3679457451e-02 -2.4354571568e-01 1.9885801505e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0090759673e+02 2.4893030134e+03 -1.6374911288e+05 -4.8069681517e-01 -9.9975851141e-02 6.1175898070e-03 5.0698137114e-02 -2.6632069655e-01 1.7746338551e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
2.9928310219e+02 2.4758641398e+03 -1.6373553882e+05 -5.3300646757e-01 -8.9159458064e-02 1.7190636806e-02 -1.7661811588e-02 -2.8009998150e-01 1.3277622094e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0056897132e+02 2.4865016842e+03 -1.6374628142e+05 -4.6676182293e-01 -1.9398195233e-01 6.0657145328e-02 2.4843735679e-02 -2.6412948829e-01 1.2556328772e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0177745152e+02 2.4964990170e+03 -1.6375641977e+05 -5.7855531452e-01 -1.3026849337e-01 -2.4558613392e-02 3.1861445090e-02 -3.1819302526e-01 1.3576828877e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
2.9932973734e+02 2.4762499361e+03 -1.6373603818e+05 -5.4164297247e-01 -1.2870619654e-01 7.3976383615e-02 -3.7365440902e-03 -2.4645624599e-01 1.1570049569e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0019240941e+02 2.4833865196e+03 -1.6374314724e+05 -5.7517215884e-01 -1.9081958194e-01 3.0024138347e-02 3.6494197887e-03 -2.9928541140e-01 1.2112686234e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0102978268e+02 2.4903138149e+03 -1.6375013226e+05 -5.4688259664e-01 -1.4423825322e-01 -2.9289226679e-02 -6.9361762204e-02 -2.9886511551e-01 1.1227907599e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01
3.0053797919e+02 2.4862452971e+03 -1.6374602908e+05 -4.7481977198e-01 -7.5619344922e-02 2.2007601622e-02 -5.4715715225e-02 -3.1145404686e-01 1.2338681910e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01

0 comments on commit dafd341

Please sign in to comment.