Skip to content

Commit

Permalink
Change texture in excited states
Browse files Browse the repository at this point in the history
  • Loading branch information
gonzalodm committed Apr 17, 2024
1 parent 35fef86 commit e776251
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 111 deletions.
19 changes: 5 additions & 14 deletions g2g/excited/cuda/ES_compute_3rd_partial.h
Original file line number Diff line number Diff line change
@@ -1,25 +1,16 @@
#if FULL_DOUBLE
static __inline__ __device__ double fetch_double(texture<int2, 2> t, float x,
float y) {
int2 v = tex2D(t, x, y);
return __hiloint2double(v.y, v.x);
}
#define fetch(t, x, y) fetch_double(t, x, y)
#else
#define fetch(t, x, y) tex2D(t, x, y)
#endif

template <class scalar_type, bool compute_energy, bool compute_factor, bool lda>
__global__ void ES_compute_3rd_partial(uint points,
const scalar_type* function_values, uint m,
const vec_type<scalar_type, 4>* gradient_values,
scalar_type* out_partial_tred, vec_type<scalar_type, 4>* out_tredxyz)
scalar_type* out_partial_tred, vec_type<scalar_type, 4>* out_tredxyz,
const scalar_type* tred_gpu_3rd)
{

uint point = blockIdx.x;
uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE;
uint i2 = i + DENSITY_BLOCK_SIZE;
uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE;
uint mc=COALESCED_DIMENSION(m);
bool valid_thread = (i < m);
bool valid_thread2 = (i2 < m);

Expand Down Expand Up @@ -60,7 +51,7 @@ __global__ void ES_compute_3rd_partial(uint points,
scalar_type rdm_this_thread;

// Transition density
rdm_this_thread = fetch(tred_gpu_3rd_tex, (float)(bj + j), (float)i);
rdm_this_thread = tred_gpu_3rd[(bj + j) + mc*i];
z += rdm_this_thread * fjreg;
z3 += fgjreg * rdm_this_thread;
}
Expand All @@ -69,7 +60,7 @@ __global__ void ES_compute_3rd_partial(uint points,
scalar_type rdm_this_thread2;

// Transition density
rdm_this_thread2 = fetch(tred_gpu_3rd_tex, (float)(bj + j), (float)i2);
rdm_this_thread = tred_gpu_3rd[(bj + j) + mc*i2];
z2 += rdm_this_thread2 * fjreg;
z32 += fgjreg * rdm_this_thread2;
}
Expand Down
23 changes: 7 additions & 16 deletions g2g/excited/cuda/ES_compute_for_partial.h
Original file line number Diff line number Diff line change
@@ -1,26 +1,17 @@
#if FULL_DOUBLE
static __inline__ __device__ double fetch_double(texture<int2, 2> t, float x,
float y) {
int2 v = tex2D(t, x, y);
return __hiloint2double(v.y, v.x);
}
#define fetch(t, x, y) fetch_double(t, x, y)
#else
#define fetch(t, x, y) tex2D(t, x, y)
#endif

template <class scalar_type, bool compute_energy, bool compute_factor, bool lda>
__global__ void ES_compute_for_partial(const scalar_type* const point_weights,uint points,
const scalar_type* function_values, uint m,
const vec_type<scalar_type, 4>* gradient_values,
scalar_type* out_partial_tred, vec_type<scalar_type, 4>* out_tredxyz,
scalar_type* out_partial_diff, vec_type<scalar_type, 4>* out_diffxyz)
scalar_type* out_partial_diff, vec_type<scalar_type, 4>* out_diffxyz,
const scalar_type* tred_gpu_for, const scalar_type* diff_gpu_for)
{

uint point = blockIdx.x;
uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE;
uint i2 = i + DENSITY_BLOCK_SIZE;
uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE;
uint mc=COALESCED_DIMENSION(m);
bool valid_thread = (i < m);
bool valid_thread2 = (i2 < m);

Expand Down Expand Up @@ -68,12 +59,12 @@ __global__ void ES_compute_for_partial(const scalar_type* const point_weights,ui
scalar_type rdm_this_thread;

// Transition density
rdm_this_thread = fetch(tred_gpu_for_tex, (float)(bj + j), (float)i);
rdm_this_thread = tred_gpu_for[(bj + j) + mc*i];
z += rdm_this_thread * fjreg;
z3 += fgjreg * rdm_this_thread;

// Difference density
rdm_this_thread = fetch(diff_gpu_for_tex, (float)(bj + j), (float)i);
rdm_this_thread = diff_gpu_for[(bj + j) + mc*i];
x += rdm_this_thread * fjreg;
x3 += fgjreg * rdm_this_thread;
}
Expand All @@ -82,12 +73,12 @@ __global__ void ES_compute_for_partial(const scalar_type* const point_weights,ui
scalar_type rdm_this_thread2;

// Transition density
rdm_this_thread2 = fetch(tred_gpu_for_tex, (float)(bj + j), (float)i2);
rdm_this_thread = tred_gpu_for[(bj + j) + mc*i2];
z2 += rdm_this_thread2 * fjreg;
z32 += fgjreg * rdm_this_thread2;

// Difference density
rdm_this_thread2 = fetch(diff_gpu_for_tex, (float)(bj + j), (float)i2);
rdm_this_thread = diff_gpu_for[(bj + j) + mc*i2];
x2 += rdm_this_thread2 * fjreg;
x32 += fgjreg * rdm_this_thread2;
}
Expand Down
8 changes: 5 additions & 3 deletions g2g/excited/cuda/ES_compute_partial.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@ template <class scalar_type, bool compute_energy, bool compute_factor, bool lda>
__global__ void ES_compute_partial(uint points,
const scalar_type* function_values, uint m,
const vec_type<scalar_type, 4>* gradient_values,
scalar_type* out_partial_tred, vec_type<scalar_type, 4>* out_tredxyz)
scalar_type* out_partial_tred, vec_type<scalar_type, 4>* out_tredxyz,
const scalar_type* tred_gpu)
{

uint point = blockIdx.x;
uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE;
uint i2 = i + DENSITY_BLOCK_SIZE;
uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE;
uint mc=COALESCED_DIMENSION(m);
bool valid_thread = (i < m);
bool valid_thread2 = (i2 < m);

Expand Down Expand Up @@ -49,7 +51,7 @@ __global__ void ES_compute_partial(uint points,
scalar_type rdm_this_thread;

// Transition density
rdm_this_thread = fetch(tred_gpu_tex, (float)(bj + j), (float)i);
rdm_this_thread = tred_gpu[(bj + j) + mc*i];
z += rdm_this_thread * fjreg;
z3 += fgjreg * rdm_this_thread;
}
Expand All @@ -58,7 +60,7 @@ __global__ void ES_compute_partial(uint points,
scalar_type rdm_this_thread2;

// Transition density
rdm_this_thread2 = fetch(tred_gpu_tex, (float)(bj + j), (float)i2);
rdm_this_thread = tred_gpu[(bj + j) + mc*i2];
z2 += rdm_this_thread2 * fjreg;
z32 += fgjreg * rdm_this_thread2;
}
Expand Down
19 changes: 5 additions & 14 deletions g2g/excited/cuda/GS_compute_partial.h
Original file line number Diff line number Diff line change
@@ -1,25 +1,16 @@
#if FULL_DOUBLE
static __inline__ __device__ double fetch_double(texture<int2, 2> t, float x,
float y) {
int2 v = tex2D(t, x, y);
return __hiloint2double(v.y, v.x);
}
#define fetch(t, x, y) fetch_double(t, x, y)
#else
#define fetch(t, x, y) tex2D(t, x, y)
#endif

template <class scalar_type, bool compute_energy, bool compute_factor, bool lda>
__global__ void GS_compute_partial(uint points,
const scalar_type* function_values, uint m,
const vec_type<scalar_type, 4>* gradient_values,
scalar_type* out_partial_density, vec_type<scalar_type, 4>* out_dxyz)
scalar_type* out_partial_density, vec_type<scalar_type, 4>* out_dxyz,
const scalar_type* rmm_gpu)
{

uint point = blockIdx.x;
uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE;
uint i2 = i + DENSITY_BLOCK_SIZE;
uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE;
uint mc=COALESCED_DIMENSION(m);
bool valid_thread = (i < m);
bool valid_thread2 = (i2 < m);

Expand Down Expand Up @@ -57,7 +48,7 @@ __global__ void GS_compute_partial(uint points,
scalar_type rdm_this_thread;

// GS density
rdm_this_thread = fetch(rmm_gpu_tex, (float)(bj + j), (float)i);
rdm_this_thread = rmm_gpu[(bj + j) + mc*i];
w += rdm_this_thread * fjreg;
w3 += fgjreg * rdm_this_thread;
}
Expand All @@ -66,7 +57,7 @@ __global__ void GS_compute_partial(uint points,
scalar_type rdm_this_thread2;

// GS density
rdm_this_thread2 = fetch(rmm_gpu_tex, (float)(bj + j), (float)i2);
rdm_this_thread = rmm_gpu[(bj + j) + mc*i2];
w2 += rdm_this_thread2 * fjreg;
w32 += fgjreg * rdm_this_thread2;
}
Expand Down
31 changes: 7 additions & 24 deletions g2g/excited/cuda/g2g_calcgradXC.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,6 @@
using namespace std;

namespace G2G {
#if FULL_DOUBLE
texture<int2, 2, cudaReadModeElementType> tred_gpu_for_tex;
texture<int2, 2, cudaReadModeElementType> diff_gpu_for_tex;
#else
texture<float, 2, cudaReadModeElementType> tred_gpu_for_tex;
texture<float, 2, cudaReadModeElementType> diff_gpu_for_tex;
#endif

#include "../../cuda/kernels/transpose.h"
#include "ES_compute_for_partial.h"

Expand Down Expand Up @@ -142,23 +134,18 @@ template<class scalar_type> void PointGroupGPU<scalar_type>::
}
}

// Form Bind Texture
cudaArray* cuArraytred;
cudaMallocArray(&cuArraytred, &tred_gpu_for_tex.channelDesc, tred_cpu.width, tred_cpu.height);
cudaMemcpyToArray(cuArraytred,0,0,tred_cpu.data,sizeof(scalar_type)*tred_cpu.width*tred_cpu.height,cudaMemcpyHostToDevice);
cudaBindTextureToArray(tred_gpu_for_tex, cuArraytred);
cudaArray* cuArraydiff;
cudaMallocArray(&cuArraydiff, &diff_gpu_for_tex.channelDesc, diff_cpu.width, diff_cpu.height);
cudaMemcpyToArray(cuArraydiff,0,0,diff_cpu.data,sizeof(scalar_type)*diff_cpu.width*diff_cpu.height,cudaMemcpyHostToDevice);
cudaBindTextureToArray(diff_gpu_for_tex, cuArraydiff);

// Transition and Difference densities on GPU
CudaMatrix<scalar_type> tred_gpu_for;
tred_gpu_for=tred_cpu;
CudaMatrix<scalar_type> diff_gpu_for;
diff_gpu_for=diff_cpu;
tred_cpu.deallocate(); diff_cpu.deallocate();

// CALCULATE PARTIAL DENSITIES
#define compden_parameter \
point_weights_gpu.data,this->number_of_points,function_values_transposed.data,\
group_m,gradient_values_transposed.data, partial_tred_gpu.data,tredxyz_gpu.data, \
partial_diff_gpu.data, diffxyz_gpu.data
partial_diff_gpu.data, diffxyz_gpu.data, tred_gpu_for.data, diff_gpu_for.data
ES_compute_for_partial<scalar_type,true,true,false><<<threadGrid, threadBlock>>>(compden_parameter);

// ACCUMULATE DENSITIES
Expand Down Expand Up @@ -258,11 +245,7 @@ template<class scalar_type> void PointGroupGPU<scalar_type>::
gdens.deallocate(); tdens.deallocate(); ddens.deallocate();
gdens_xyz.deallocate(); tdens_xyz.deallocate(); ddens_xyz.deallocate();

// Free Texture and Memory
cudaUnbindTexture(tred_gpu_for_tex);
cudaUnbindTexture(diff_gpu_for_tex);
cudaFreeArray(cuArraytred);
cudaFreeArray(cuArraydiff);
// Free Memory
mat_dens_gpu.deallocate(); mat_diff_gpu.deallocate();
mat_tred_gpu.deallocate(); diff_accum_gpu.deallocate();
tred_accum_gpu.deallocate(); diffxyz_accum_gpu.deallocate();
Expand Down
19 changes: 4 additions & 15 deletions g2g/excited/cuda/g2g_calculateG.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,6 @@
using namespace std;

namespace G2G {
#if FULL_DOUBLE
texture<int2, 2, cudaReadModeElementType> tred_gpu_3rd_tex;
#else
texture<float, 2, cudaReadModeElementType> tred_gpu_3rd_tex;
#endif

#include "../../cuda/kernels/transpose.h"
#include "obtain_fock_cuda.h"
#include "obtain_terms.h"
Expand Down Expand Up @@ -122,17 +116,14 @@ void PointGroupGPU<scalar_type>::solve_3rd_der(double* T, HostMatrix<double>& Fo
}
}

// Form Bind Textures
cudaArray* cuArraytred;
cudaMallocArray(&cuArraytred, &tred_gpu_3rd_tex.channelDesc, tred_cpu.width, tred_cpu.height);
cudaMemcpyToArray(cuArraytred,0,0,tred_cpu.data,sizeof(scalar_type)*tred_cpu.width*tred_cpu.height,cudaMemcpyHostToDevice);
cudaBindTextureToArray(tred_gpu_3rd_tex, cuArraytred);
tred_cpu.deallocate();
// Transition density on GPU
CudaMatrix<scalar_type> tred_gpu_3rd;
tred_gpu_3rd=tred_cpu;

// CALCULATE PARTIAL DENSITIES
#define compden_parameter \
this->number_of_points,function_values_transposed.data,group_m,gradient_values_transposed.data,\
partial_tred_gpu.data,tredxyz_gpu.data
partial_tred_gpu.data,tredxyz_gpu.data,tred_gpu_3rd.data
ES_compute_3rd_partial<scalar_type,true,true,false><<<threadGrid, threadBlock>>>(compden_parameter);

// ACCUMULATE DENSITIES
Expand Down Expand Up @@ -212,8 +203,6 @@ void PointGroupGPU<scalar_type>::solve_3rd_der(double* T, HostMatrix<double>& Fo

// Free Memory
smallFock.deallocate();
cudaUnbindTexture(tred_gpu_3rd_tex);
cudaFreeArray(cuArraytred);
Txyz.deallocate();
Dxyz.deallocate();
partial_tred_gpu.deallocate();
Expand Down
32 changes: 7 additions & 25 deletions g2g/excited/cuda/g2g_calculateXC.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,6 @@
using namespace std;

namespace G2G {
#if FULL_DOUBLE
texture<int2, 2, cudaReadModeElementType> rmm_gpu_tex;
texture<int2, 2, cudaReadModeElementType> tred_gpu_tex;
#else
texture<float, 2, cudaReadModeElementType> rmm_gpu_tex;
texture<float, 2, cudaReadModeElementType> tred_gpu_tex;
#endif

#include "../../cuda/kernels/transpose.h"
#include "obtain_fock_cuda.h"
#include "obtain_terms.h"
Expand Down Expand Up @@ -129,12 +121,9 @@ void PointGroupGPU<scalar_type>::solve_closed_lr(double* T, HostMatrix<double>&
}
}

// Form Bind Textures
cudaArray* cuArraytred;
cudaMallocArray(&cuArraytred, &tred_gpu_tex.channelDesc, tred_cpu.width, tred_cpu.height);
cudaMemcpyToArray(cuArraytred,0,0,tred_cpu.data,sizeof(scalar_type)*tred_cpu.width*tred_cpu.height,cudaMemcpyHostToDevice);
cudaBindTextureToArray(tred_gpu_tex, cuArraytred);
tred_cpu.deallocate();
// Transition density on GPU
CudaMatrix<scalar_type> tred_gpu;
tred_gpu=tred_cpu;

// CALCULATE PARTIAL DENSITIES
#define compden_parameter \
Expand Down Expand Up @@ -211,8 +200,6 @@ void PointGroupGPU<scalar_type>::solve_closed_lr(double* T, HostMatrix<double>&

// Free Memory
smallFock.deallocate();
cudaUnbindTexture(tred_gpu_tex);
cudaFreeArray(cuArraytred);
Txyz.deallocate();
Dxyz.deallocate();
partial_tred_gpu.deallocate();
Expand Down Expand Up @@ -302,17 +289,14 @@ template<class scalar_type> void PointGroupGPU<scalar_type>::
}
}

// Form Bind Textures
cudaArray* cuArrayrmm;
cudaMallocArray(&cuArrayrmm, &rmm_gpu_tex.channelDesc, rmm_cpu.width, rmm_cpu.height);
cudaMemcpyToArray(cuArrayrmm,0,0,rmm_cpu.data,sizeof(scalar_type)*rmm_cpu.width*rmm_cpu.height,cudaMemcpyHostToDevice);
cudaBindTextureToArray(rmm_gpu_tex, cuArrayrmm);
rmm_cpu.deallocate();
// GS Density on GPU
CudaMatrix<scalar_type> rmm_gpu;
rmm_gpu=rmm_cpu;

// CALCULATE PARTIAL DENSITIES
#define compden_parameter \
this->number_of_points,function_values_transposed.data,group_m,gradient_values_transposed.data, \
partial_densities_gpu.data,dxyz_gpu.data
partial_densities_gpu.data,dxyz_gpu.data,rmm_gpu.data
GS_compute_partial<scalar_type,true,true,false><<<threadGrid, threadBlock>>>(compden_parameter);

// ACCUMULATE DENSITIES
Expand All @@ -325,8 +309,6 @@ template<class scalar_type> void PointGroupGPU<scalar_type>::
#undef accumulate_parameters

// FREE MEMORY
cudaUnbindTexture(rmm_gpu_tex);
cudaFreeArray(cuArrayrmm);
partial_densities_gpu.deallocate();
dxyz_gpu.deallocate();
function_values_transposed.deallocate();
Expand Down

0 comments on commit e776251

Please sign in to comment.