Add Nano's changes to replace deprecated texture syntax.
The code does not compile with CUDA 12 yet.
More changes are needed before LIO builds with GPU support again.
charlyqchm committed Apr 12, 2024
1 parent 5e20365 commit 35fef86
Showing 5 changed files with 87 additions and 68 deletions.
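For context: the retired syntax is CUDA's legacy texture-reference API, i.e. file-scope texture<> objects bound with cudaMallocArray / cudaMemcpyToArray / cudaBindTextureToArray and read in kernels with tex2D. That API was deprecated for several releases and removed outright in CUDA 12, so the commit replaces every texture fetch with a plain global-memory load. Below is a minimal, self-contained sketch of the two styles; the identifiers (read_buf, row_stride, d_rmm) are illustrative, not LIO's, and the stride plays the role LIO's COALESCED_DIMENSION(m) presumably plays as a padded row width.

#include <cuda_runtime.h>
#include <cstdio>

// Legacy style, removed in CUDA 12 -- shown only as comments:
//   texture<float, 2, cudaReadModeElementType> rmm_tex;   // file-scope reference
//   value = tex2D(rmm_tex, (float)col, (float)row);       // in-kernel fetch
//   host side: cudaMallocArray / cudaMemcpyToArray /
//              cudaBindTextureToArray / cudaUnbindTexture / cudaFreeArray

// Replacement style: an ordinary device buffer, row-major with a padded stride.
__global__ void read_buf(const float* rmm, int row_stride, int col, int row,
                         float* out) {
  *out = rmm[col + row_stride * row];  // the element tex2D(rmm_tex, col, row) returned
}

int main() {
  const int m = 3, stride = 4;          // stride >= m, padded row width
  float host[stride * m] = {0.0f};
  host[1 + stride * 2] = 7.0f;          // element (col = 1, row = 2)

  float *d_rmm, *d_out;
  cudaMalloc(&d_rmm, sizeof(host));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_rmm, host, sizeof(host), cudaMemcpyHostToDevice);

  read_buf<<<1, 1>>>(d_rmm, stride, 1, 2, d_out);

  float out = 0.0f;
  cudaMemcpy(&out, d_out, sizeof(out), cudaMemcpyDeviceToHost);
  printf("%f\n", out);                  // prints 7.000000
  cudaFree(d_rmm);
  cudaFree(d_out);
  return 0;
}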
59 changes: 36 additions & 23 deletions g2g/cuda/iteration.cu
@@ -26,13 +26,13 @@
#endif

namespace G2G {
#if FULL_DOUBLE
/*#if FULL_DOUBLE
texture<int2, 2, cudaReadModeElementType> rmm_input_gpu_tex;
texture<int2, 2, cudaReadModeElementType> rmm_input_gpu_tex2;
#else
texture<float, 2, cudaReadModeElementType> rmm_input_gpu_tex;
texture<float, 2, cudaReadModeElementType> rmm_input_gpu_tex2;
#endif
#endif*/
/** KERNELS **/
#include "gpu_variables.h"
#include "kernels/accumulate_point.h"
@@ -210,19 +210,22 @@ void PointGroupGPU<scalar_type>::solve_closed(
}
}

CudaMatrix<scalar_type> rmm_input_gpu;

rmm_input_gpu=rmm_input_cpu;
/*
**********************************************************************
* Passing the RDM (rmm) to textures
**********************************************************************
*/

/*
cudaArray* cuArray;
cudaMallocArray(&cuArray, &rmm_input_gpu_tex.channelDesc, rmm_input_cpu.width, rmm_input_cpu.height);
cudaMemcpyToArray(cuArray, 0, 0, rmm_input_cpu.data, sizeof(scalar_type)*rmm_input_cpu.width*rmm_input_cpu.height, cudaMemcpyHostToDevice);
cudaBindTextureToArray(rmm_input_gpu_tex, cuArray);
rmm_input_gpu_tex.normalized = false;

*/
#if USE_LIBXC
fortran_vars.fexc = fortran_vars.func_coef[0];
#define libxc_init_param \
Expand Down Expand Up @@ -256,7 +259,7 @@ void PointGroupGPU<scalar_type>::solve_closed(
#define compute_parameters \
energy_gpu.data, factors_gpu.data, point_weights_gpu.data, this->number_of_points, function_values_transposed.data, \
gradient_values_transposed.data, hessian_values_transposed.data, group_m, partial_densities_gpu.data, dxyz_gpu.data, \
dd1_gpu.data,dd2_gpu.data
dd1_gpu.data,dd2_gpu.data, rmm_input_gpu.data

#define accumulate_parameters \
energy_gpu.data, factors_gpu.data, point_weights_gpu.data, this->number_of_points, block_height, \
@@ -363,7 +366,7 @@ void PointGroupGPU<scalar_type>::solve_closed(
#undef accumulate_parameters

#define compute_parameters \
NULL,factors_gpu.data,point_weights_gpu.data,this->number_of_points,function_values_transposed.data,gradient_values_transposed.data,hessian_values_transposed.data,group_m,partial_densities_gpu.data,dxyz_gpu.data,dd1_gpu.data,dd2_gpu.data
NULL,factors_gpu.data,point_weights_gpu.data,this->number_of_points,function_values_transposed.data,gradient_values_transposed.data,hessian_values_transposed.data,group_m,partial_densities_gpu.data,dxyz_gpu.data,dd1_gpu.data,dd2_gpu.data,rmm_input_gpu.data
#define accumulate_parameters \
NULL,factors_gpu.data,point_weights_gpu.data,this->number_of_points,block_height,partial_densities_gpu.data,dxyz_gpu.data,dd1_gpu.data,dd2_gpu.data, fortran_vars.fexc
if (lda)
@@ -428,9 +431,12 @@ void PointGroupGPU<scalar_type>::solve_closed(
}
}

CudaMatrix<scalar_type> rmm_input_gpu_sinceros;

rmm_input_gpu_sinceros=rmm_input_cpu; // the GPU version already exists; there may be problems with that
timers.density_derivs.start_and_sync();
cudaMemcpyToArray(cuArray, 0, 0,rmm_input_cpu.data,
sizeof(scalar_type)*rmm_input_cpu.width*rmm_input_cpu.height, cudaMemcpyHostToDevice);
// cudaMemcpyToArray(cuArray, 0, 0,rmm_input_cpu.data,
// sizeof(scalar_type)*rmm_input_cpu.width*rmm_input_cpu.height, cudaMemcpyHostToDevice);

timers.density_derivs.start_and_sync();
dim3 threads = dim3(this->number_of_points);
@@ -441,7 +447,7 @@ void PointGroupGPU<scalar_type>::solve_closed(
CudaMatrixUInt nuc_gpu(this->func2local_nuc); // TODO: this could actually be stored just once, at construction time

gpu_compute_density_derivs<<<threadGrid, threadBlock>>>(
function_values.data, gradient_values.data, nuc_gpu.data, dd_gpu.data, this->number_of_points, group_m, this->total_nucleii());
function_values.data, gradient_values.data, nuc_gpu.data, dd_gpu.data, this->number_of_points, group_m, this->total_nucleii(),rmm_input_gpu_sinceros.data);
cudaAssertNoError("density_derivs");
timers.density_derivs.pause_and_sync();

@@ -524,8 +530,8 @@ void PointGroupGPU<scalar_type>::solve_closed(
hessian_values_transposed.deallocate();
}
// Undo the rmm texture bind
cudaUnbindTexture(rmm_input_gpu_tex); // Swapped the Unbind with the Free; it seems more logical that way. Nano
cudaFreeArray(cuArray);
// cudaUnbindTexture(rmm_input_gpu_tex); // Swapped the Unbind with the Free; it seems more logical that way. Nano
// cudaFreeArray(cuArray);
}

//======================
@@ -637,13 +643,18 @@ void PointGroupGPU<scalar_type>::solve_opened(
}
}
}
CudaMatrix<scalar_type> rmm_input_gpu1;
CudaMatrix<scalar_type> rmm_input_gpu2;

rmm_input_gpu1=rmm_input_a_cpu;
rmm_input_gpu2=rmm_input_b_cpu;

/*
**********************************************************************
* Passing the RDM (rmm) to textures
**********************************************************************
*/

/*
cudaArray* cuArray1;
cudaArray* cuArray2;
cudaMallocArray(&cuArray1, &rmm_input_gpu_tex.channelDesc, rmm_input_a_cpu.width,rmm_input_a_cpu.height);
@@ -655,7 +666,7 @@ void PointGroupGPU<scalar_type>::solve_opened(
rmm_input_gpu_tex.normalized = false;
rmm_input_gpu_tex2.normalized = false;

*/
// For CDFT and becke partitioning.
CudaMatrix<scalar_type> becke_w_gpu;
CudaMatrix<scalar_type> cdft_factors_gpu;
@@ -681,7 +692,7 @@ void PointGroupGPU<scalar_type>::solve_opened(
point_weights_gpu.data,this->number_of_points, function_values_transposed.data,
gradient_values_transposed.data,hessian_values_transposed.data, group_m,
partial_densities_a_gpu.data, dxyz_a_gpu.data, dd1_a_gpu.data, dd2_a_gpu.data,
partial_densities_b_gpu.data, dxyz_b_gpu.data, dd1_b_gpu.data, dd2_b_gpu.data);
partial_densities_b_gpu.data, dxyz_b_gpu.data, dd1_b_gpu.data, dd2_b_gpu.data, rmm_input_gpu1.data, rmm_input_gpu2.data);
gpu_accumulate_point_open<scalar_type, true, true, false><<<threadGrid_accumulate, threadBlock_accumulate>>> (
energy_gpu.data,
factors_a_gpu.data, factors_b_gpu.data, point_weights_gpu.data,this->number_of_points,block_height,
@@ -705,7 +716,7 @@ void PointGroupGPU<scalar_type>::solve_opened(
point_weights_gpu.data,this->number_of_points, function_values_transposed.data,
gradient_values_transposed.data,hessian_values_transposed.data, group_m,
partial_densities_a_gpu.data, dxyz_a_gpu.data, dd1_a_gpu.data, dd2_a_gpu.data,
partial_densities_b_gpu.data, dxyz_b_gpu.data, dd1_b_gpu.data, dd2_b_gpu.data);
partial_densities_b_gpu.data, dxyz_b_gpu.data, dd1_b_gpu.data, dd2_b_gpu.data, rmm_input_gpu1.data, rmm_input_gpu2.data);
gpu_accumulate_point_open<scalar_type, true, false, false><<<threadGrid_accumulate, threadBlock_accumulate>>> (
energy_gpu.data, factors_a_gpu.data, factors_b_gpu.data, point_weights_gpu.data,
this->number_of_points, block_height,
@@ -745,7 +756,7 @@ void PointGroupGPU<scalar_type>::solve_opened(
point_weights_gpu.data, this->number_of_points, function_values_transposed.data,
gradient_values_transposed.data,hessian_values_transposed.data, group_m,
partial_densities_a_gpu.data, dxyz_a_gpu.data, dd1_a_gpu.data, dd2_a_gpu.data,
partial_densities_b_gpu.data, dxyz_b_gpu.data, dd1_b_gpu.data, dd2_b_gpu.data);
partial_densities_b_gpu.data, dxyz_b_gpu.data, dd1_b_gpu.data, dd2_b_gpu.data, rmm_input_gpu1.data, rmm_input_gpu2.data);
gpu_accumulate_point_open<scalar_type, false, true, false><<<threadGrid_accumulate, threadBlock_accumulate>>> (
NULL,
factors_a_gpu.data, factors_b_gpu.data, point_weights_gpu.data,this->number_of_points,block_height,
@@ -786,8 +797,10 @@ void PointGroupGPU<scalar_type>::solve_opened(
}
}

cudaMemcpyToArray(cuArray1, 0, 0,rmm_input_a_cpu.data,sizeof(scalar_type)*rmm_input_a_cpu.width*rmm_input_a_cpu.height, cudaMemcpyHostToDevice);
cudaMemcpyToArray(cuArray2, 0, 0,rmm_input_b_cpu.data,sizeof(scalar_type)*rmm_input_b_cpu.width*rmm_input_b_cpu.height, cudaMemcpyHostToDevice);
// cudaMemcpyToArray(cuArray1, 0, 0,rmm_input_a_cpu.data,sizeof(scalar_type)*rmm_input_a_cpu.width*rmm_input_a_cpu.height, cudaMemcpyHostToDevice);
// cudaMemcpyToArray(cuArray2, 0, 0,rmm_input_b_cpu.data,sizeof(scalar_type)*rmm_input_b_cpu.width*rmm_input_b_cpu.height, cudaMemcpyHostToDevice);
rmm_input_gpu1=rmm_input_a_cpu;
rmm_input_gpu2=rmm_input_b_cpu;


dim3 threads;
@@ -803,7 +816,7 @@ void PointGroupGPU<scalar_type>::solve_opened(
CudaMatrixUInt nuc_gpu(this->func2local_nuc);

// Kernel
gpu_compute_density_derivs_open<<<threadGrid, threadBlock>>>(function_values.data, gradient_values.data, nuc_gpu.data, dd_gpu_a.data, dd_gpu_b.data, this->number_of_points, group_m, this->total_nucleii());
gpu_compute_density_derivs_open<<<threadGrid, threadBlock>>>(function_values.data, gradient_values.data, nuc_gpu.data, dd_gpu_a.data, dd_gpu_b.data, this->number_of_points, group_m, this->total_nucleii(),rmm_input_gpu1.data,rmm_input_gpu2.data);

cudaAssertNoError("density_derivs");
timers.density_derivs.pause_and_sync();
@@ -928,10 +941,10 @@ void PointGroupGPU<scalar_type>::solve_opened(
}

// Undo the rmm texture bind
cudaUnbindTexture(rmm_input_gpu_tex); // Swapped the Unbind with the Free; it seems more logical that way. Nano
cudaUnbindTexture(rmm_input_gpu_tex2); // Swapped the Unbind with the Free; it seems more logical that way. Nano
cudaFreeArray(cuArray1);
cudaFreeArray(cuArray2);
// cudaUnbindTexture(rmm_input_gpu_tex); // Swapped the Unbind with the Free; it seems more logical that way. Nano
// cudaUnbindTexture(rmm_input_gpu_tex2); // Swapped the Unbind with the Free; it seems more logical that way. Nano
// cudaFreeArray(cuArray1);
// cudaFreeArray(cuArray2);

//uint free_memory, total_memory;
//cudaGetMemoryInfo(free_memory, total_memory);
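Host side, the whole cudaArray sequence above collapses into a single assignment, rmm_input_gpu = rmm_input_cpu;. Below is a hypothetical sketch of what such a CudaMatrix assignment presumably wraps; the actual class in g2g may well differ.

#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical helper: what an assignment like rmm_input_gpu = rmm_input_cpu
// presumably boils down to -- one flat allocation plus one flat copy,
// with no cudaArray, no binding, and no unbinding involved.
template <class T>
T* upload_matrix(const T* host_data, size_t width, size_t height) {
  T* device_data = nullptr;
  cudaMalloc(&device_data, sizeof(T) * width * height);
  cudaMemcpy(device_data, host_data, sizeof(T) * width * height,
             cudaMemcpyHostToDevice);
  return device_data;  // caller releases with cudaFree
}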
40 changes: 20 additions & 20 deletions g2g/cuda/kernels/energy.h
@@ -1,14 +1,14 @@
#if FULL_DOUBLE
static __inline__ __device__ double fetch_double(texture<int2, 2> t, float x,
float y) {
int2 v = tex2D(t, x, y);
return __hiloint2double(v.y, v.x);
}
#define fetch(t, x, y) fetch_double(t, x, y)
#else
#define fetch(t, x, y) tex2D(t, x, y)
#endif

/*#if FULL_DOUBLE
//static __inline__ __device__ double fetch_double(texture<int2, 2> t, float x,
// float y) {
// int2 v = tex2D(t, x, y);
// return __hiloint2double(v.y, v.x);
//}
//#define fetch(t, x, y) fetch_double(t, x, y)
//#else
//#define fetch(t, x, y) tex2D(t, x, y)
//#endif
*/
template <class scalar_type, bool compute_energy, bool compute_factor, bool lda>
__global__ void gpu_compute_density(
scalar_type* const energy, scalar_type* const factor,
@@ -17,15 +17,15 @@ __global__ void gpu_compute_density(
const vec_type<scalar_type, 4>* gradient_values,
const vec_type<scalar_type, 4>* hessian_values, uint m,
scalar_type* out_partial_density, vec_type<scalar_type, 4>* out_dxyz,
vec_type<scalar_type, 4>* out_dd1, vec_type<scalar_type, 4>* out_dd2) {
vec_type<scalar_type, 4>* out_dd1, vec_type<scalar_type, 4>* out_dd2, const scalar_type* rmm_input_gpu) {
uint point = blockIdx.x;

uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE;

uint i2 = i + DENSITY_BLOCK_SIZE;

uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE;

uint mc=COALESCED_DIMENSION(m);
bool valid_thread = (i < m);
bool valid_thread2 = (i2 < m);

@@ -84,8 +84,8 @@ __global__ void gpu_compute_density(
// fetch is a macro for tex2D

if ((bj + j) <= i) {
scalar_type rdm_this_thread =
fetch(rmm_input_gpu_tex, (float)(bj + j), (float)i);
scalar_type rdm_this_thread = rmm_input_gpu[(bj + j)+mc*i];
// fetch(rmm_input_gpu_tex, (float)(bj + j), (float)i);
w += rdm_this_thread * fjreg;

if (!lda) {
Expand All @@ -96,8 +96,8 @@ __global__ void gpu_compute_density(
}

if (valid_thread2 && ((bj + j) <= i2)) {
scalar_type rdm_this_thread2 =
fetch(rmm_input_gpu_tex, (float)(bj + j), (float)i2);
scalar_type rdm_this_thread2 = rmm_input_gpu[(bj + j)+mc*i2];
// fetch(rmm_input_gpu_tex, (float)(bj + j), (float)i2);
w2 += rdm_this_thread2 * fjreg;

if (!lda) {
@@ -180,14 +180,14 @@ __global__ void gpu_compute_density(
for (int j = 2; j <= DENSITY_BLOCK_SIZE; j = j * 2) {
int index = position + DENSITY_BLOCK_SIZE / j;
if (position < DENSITY_BLOCK_SIZE / j) {
fj_sh[position] += fj_sh[index];
fgj_sh[position] += fgj_sh[index];
fj_sh[position] += fj_sh[index];
fgj_sh[position] += fgj_sh[index];
fh1j_sh[position] += fh1j_sh[index];
fh2j_sh[position] += fh2j_sh[index];
}
__syncthreads();
}

__syncthreads();
if (threadIdx.x == 0) {
const int myPoint = blockIdx.y * points + blockIdx.x;
out_partial_density[myPoint] = fj_sh[position];
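Kernel side, each fetch(rmm_input_gpu_tex, x, y) becomes the plain load rmm_input_gpu[x + mc * y], where mc = COALESCED_DIMENSION(m) is the padded row stride. The change also retires the FULL_DOUBLE workaround at the top of this file: a 2D texture cannot return a double, so the old code fetched an int2 and reassembled it with __hiloint2double, whereas an ordinary load reads doubles directly. A stripped-down sketch with illustrative names, not LIO code:

#include <cuda_runtime.h>

// Old double-precision trick, needed only because textures cannot return double:
__device__ double fetch_double_legacy(int2 v) {
  return __hiloint2double(v.y, v.x);  // (hi word, lo word) -> double
}

// New path: one templated load works for float and double alike.
template <class scalar_type>
__global__ void read_rmm(const scalar_type* rmm, unsigned mc, unsigned col,
                         unsigned row, scalar_type* out) {
  *out = rmm[col + mc * row];  // mc is the padded width, i.e. COALESCED_DIMENSION(m)
}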
23 changes: 14 additions & 9 deletions g2g/cuda/kernels/energy_derivs.h
@@ -5,10 +5,10 @@ template <class scalar_type>
__global__ void gpu_compute_density_derivs(
scalar_type* function_values, vec_type<scalar_type, 4>* gradient_values,
uint* nuc, vec_type<scalar_type, 4>* density_deriv, uint points, uint m,
uint nuc_count) {
uint nuc_count, const scalar_type * rmm_input_gpu) {
uint point = index_x(blockDim, blockIdx, threadIdx);
bool valid_thread = (point < points);

uint mc=COALESCED_DIMENSION(m);
__shared__ scalar_type rdm_sh[DENSITY_DERIV_BATCH_SIZE];
__shared__ uint nuc_sh[DENSITY_DERIV_BATCH_SIZE2];

@@ -27,8 +27,9 @@ __global__ void gpu_compute_density_derivs(
if (threadIdx.x < DENSITY_DERIV_BATCH_SIZE) {
// fetch is a macro for tex2D defined in energy.h
if (bj + threadIdx.x < m)
rdm_sh[threadIdx.x] = fetch(rmm_input_gpu_tex, (float)(bi + i),
(float)(bj + threadIdx.x));
rdm_sh[threadIdx.x] = rmm_input_gpu[(bi+i)*mc+bj+threadIdx.x];
//fetch(rmm_input_gpu_tex, (float)(bi + i),
// (float)(bj + threadIdx.x));
else
rdm_sh[threadIdx.x] = 0.0f;
}
@@ -61,9 +62,10 @@ __global__ void gpu_compute_density_derivs_open(
scalar_type* function_values, vec_type<scalar_type, 4>* gradient_values,
uint* nuc, vec_type<scalar_type, 4>* density_deriv_a,
vec_type<scalar_type, 4>* density_deriv_b, uint points, uint m,
uint nuc_count) {
uint nuc_count, const scalar_type * rmm_input_gpu, const scalar_type * rmm_input_gpu2) {
uint point = index_x(blockDim, blockIdx, threadIdx);
bool valid_thread = (point < points);
uint mc=COALESCED_DIMENSION(m);

__shared__ scalar_type rdm_a_sh[DENSITY_DERIV_BATCH_SIZE];
__shared__ scalar_type rdm_b_sh[DENSITY_DERIV_BATCH_SIZE];
@@ -90,10 +92,13 @@ __global__ void gpu_compute_density_derivs_open(
// scalar_type rmd_local = fetch(rmm_input_gpu_tex, (float)(bi+i),
// (float)(bj+threadIdx.x));
if (bj + threadIdx.x < m) {
rdm_a_sh[threadIdx.x] = fetch(rmm_input_gpu_tex, (float)(bi + i),
(float)(bj + threadIdx.x));
rdm_b_sh[threadIdx.x] = fetch(rmm_input_gpu_tex2, (float)(bi + i),
(float)(bj + threadIdx.x));
rdm_a_sh[threadIdx.x] = rmm_input_gpu[(bi+i)*mc+bj+threadIdx.x];

// fetch(rmm_input_gpu_tex, (float)(bi + i),
// (float)(bj + threadIdx.x));
rdm_b_sh[threadIdx.x] = rmm_input_gpu2[(bi+i)*mc+bj+threadIdx.x];
// rdm_b_sh[threadIdx.x] = fetch(rmm_input_gpu_tex2, (float)(bi + i),
// (float)(bj + threadIdx.x));
} // rdm[COALESCED_DIMENSION(m) * (bi + i) + (bj + threadIdx.x)];
else {
rdm_a_sh[threadIdx.x] = 0.0f;
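Note that these derivative kernels read the matrix transposed relative to energy.h, as rmm_input_gpu[(bi + i) * mc + bj + threadIdx.x], so consecutive threads load consecutive addresses, and each batch is staged in shared memory before the whole block reuses it. A stripped-down sketch of that staging pattern; BATCH, the omitted bounds handling, and the final reduction are illustrative stand-ins:

#include <cuda_runtime.h>

#define BATCH 32  // stand-in for DENSITY_DERIV_BATCH_SIZE

// One coalesced batch of an rmm row is loaded into shared memory,
// then every thread in the block reuses it (bounds checks omitted).
template <class scalar_type>
__global__ void stage_row_batch(const scalar_type* rmm, unsigned mc,
                                unsigned row, unsigned bj, scalar_type* out) {
  __shared__ scalar_type rdm_sh[BATCH];
  if (threadIdx.x < BATCH)
    rdm_sh[threadIdx.x] = rmm[row * mc + bj + threadIdx.x];  // coalesced load
  __syncthreads();  // make the batch visible to the whole block

  scalar_type acc = 0;
  for (int j = 0; j < BATCH; ++j)
    acc += rdm_sh[j];  // placeholder for the real per-function accumulation
  out[blockIdx.x * blockDim.x + threadIdx.x] = acc;
}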
29 changes: 15 additions & 14 deletions g2g/cuda/kernels/energy_open.h
@@ -1,15 +1,15 @@
#if FULL_DOUBLE
//#if FULL_DOUBLE
/*
static __inline__ __device__ double fetch_double(texture<int2, 2> t, float x,
float y)
{
int2 v = tex2D(t,x,y);
return __hiloint2double(v.y, v.x);
}*/
#define fetch(t, x, y) fetch_double(t, x, y)
#else
#define fetch(t, x, y) tex2D(t, x, y)
#endif
//#define fetch(t, x, y) fetch_double(t, x, y)
//#else
//#define fetch(t, x, y) tex2D(t, x, y)
//#endif

template <class scalar_type, bool compute_energy, bool compute_factor, bool lda>
__global__ void gpu_compute_density_opened(
@@ -20,11 +20,12 @@ __global__ void gpu_compute_density_opened(
scalar_type* out_partial_density_a, vec_type<scalar_type, 4>* out_dxyz_a,
vec_type<scalar_type, 4>* out_dd1_a, vec_type<scalar_type, 4>* out_dd2_a,
scalar_type* out_partial_density_b, vec_type<scalar_type, 4>* out_dxyz_b,
vec_type<scalar_type, 4>* out_dd1_b, vec_type<scalar_type, 4>* out_dd2_b) {
vec_type<scalar_type, 4>* out_dd1_b, vec_type<scalar_type, 4>* out_dd2_b, const scalar_type * rmm_input_gpu, const scalar_type * rmm_input_gpu2) {
uint point = blockIdx.x;
uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE;
uint i2 = i + DENSITY_BLOCK_SIZE;
uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE;
uint mc=COALESCED_DIMENSION(m);

scalar_type partial_density_a(0.0f);
scalar_type partial_density_b(0.0f);
@@ -126,10 +127,10 @@ __global__ void gpu_compute_density_opened(

if ((bj + j) <= i) {
// Fetch is a macro for tex2D
scalar_type rdm_this_thread_a =
fetch(rmm_input_gpu_tex, (float)(bj + j), (float)i);
scalar_type rdm_this_thread_b =
fetch(rmm_input_gpu_tex2, (float)(bj + j), (float)i);
scalar_type rdm_this_thread_a = rmm_input_gpu[(bj + j)+mc*i];
// fetch(rmm_input_gpu_tex, (float)(bj + j), (float)i);
scalar_type rdm_this_thread_b = rmm_input_gpu2[(bj + j)+mc*i];
//fetch(rmm_input_gpu_tex2, (float)(bj + j), (float)i);

w_a += rdm_this_thread_a * fjreg;
w_b += rdm_this_thread_b * fjreg;
@@ -146,10 +147,10 @@ __global__ void gpu_compute_density_opened(
}

if (valid_thread2 && ((bj + j) <= i2)) {
scalar_type rdm_this_thread2_a =
fetch(rmm_input_gpu_tex, (float)(bj + j), (float)i2);
scalar_type rdm_this_thread2_b =
fetch(rmm_input_gpu_tex2, (float)(bj + j), (float)i2);
scalar_type rdm_this_thread2_a = rmm_input_gpu[(bj + j)+mc*i2];
// fetch(rmm_input_gpu_tex, (float)(bj + j), (float)i2);
scalar_type rdm_this_thread2_b =rmm_input_gpu2[(bj + j)+mc*i2];
//fetch(rmm_input_gpu_tex2, (float)(bj + j), (float)i2);
w2_a += rdm_this_thread2_a * fjreg;
w2_b += rdm_this_thread2_b * fjreg;
if (!lda) {
