diff --git a/g2g/excited/cuda/ES_compute_3rd_partial.h b/g2g/excited/cuda/ES_compute_3rd_partial.h index c772e0de0..048b46c88 100644 --- a/g2g/excited/cuda/ES_compute_3rd_partial.h +++ b/g2g/excited/cuda/ES_compute_3rd_partial.h @@ -1,25 +1,16 @@ -#if FULL_DOUBLE -static __inline__ __device__ double fetch_double(texture t, float x, - float y) { - int2 v = tex2D(t, x, y); - return __hiloint2double(v.y, v.x); -} -#define fetch(t, x, y) fetch_double(t, x, y) -#else -#define fetch(t, x, y) tex2D(t, x, y) -#endif - template __global__ void ES_compute_3rd_partial(uint points, const scalar_type* function_values, uint m, const vec_type* gradient_values, - scalar_type* out_partial_tred, vec_type* out_tredxyz) + scalar_type* out_partial_tred, vec_type* out_tredxyz, + const scalar_type* tred_gpu_3rd) { uint point = blockIdx.x; uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE; uint i2 = i + DENSITY_BLOCK_SIZE; uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE; + uint mc=COALESCED_DIMENSION(m); bool valid_thread = (i < m); bool valid_thread2 = (i2 < m); @@ -60,7 +51,7 @@ __global__ void ES_compute_3rd_partial(uint points, scalar_type rdm_this_thread; // Transition density - rdm_this_thread = fetch(tred_gpu_3rd_tex, (float)(bj + j), (float)i); + rdm_this_thread = tred_gpu_3rd[(bj + j) + mc*i]; z += rdm_this_thread * fjreg; z3 += fgjreg * rdm_this_thread; } @@ -69,7 +60,7 @@ __global__ void ES_compute_3rd_partial(uint points, scalar_type rdm_this_thread2; // Transition density - rdm_this_thread2 = fetch(tred_gpu_3rd_tex, (float)(bj + j), (float)i2); + rdm_this_thread = tred_gpu_3rd[(bj + j) + mc*i2]; z2 += rdm_this_thread2 * fjreg; z32 += fgjreg * rdm_this_thread2; } diff --git a/g2g/excited/cuda/ES_compute_for_partial.h b/g2g/excited/cuda/ES_compute_for_partial.h index f0679233f..bac302879 100644 --- a/g2g/excited/cuda/ES_compute_for_partial.h +++ b/g2g/excited/cuda/ES_compute_for_partial.h @@ -1,26 +1,17 @@ -#if FULL_DOUBLE -static __inline__ __device__ double fetch_double(texture t, float x, - float y) { - int2 v = tex2D(t, x, y); - return __hiloint2double(v.y, v.x); -} -#define fetch(t, x, y) fetch_double(t, x, y) -#else -#define fetch(t, x, y) tex2D(t, x, y) -#endif - template __global__ void ES_compute_for_partial(const scalar_type* const point_weights,uint points, const scalar_type* function_values, uint m, const vec_type* gradient_values, scalar_type* out_partial_tred, vec_type* out_tredxyz, - scalar_type* out_partial_diff, vec_type* out_diffxyz) + scalar_type* out_partial_diff, vec_type* out_diffxyz, + const scalar_type* tred_gpu_for, const scalar_type* diff_gpu_for) { uint point = blockIdx.x; uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE; uint i2 = i + DENSITY_BLOCK_SIZE; uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE; + uint mc=COALESCED_DIMENSION(m); bool valid_thread = (i < m); bool valid_thread2 = (i2 < m); @@ -68,12 +59,12 @@ __global__ void ES_compute_for_partial(const scalar_type* const point_weights,ui scalar_type rdm_this_thread; // Transition density - rdm_this_thread = fetch(tred_gpu_for_tex, (float)(bj + j), (float)i); + rdm_this_thread = tred_gpu_for[(bj + j) + mc*i]; z += rdm_this_thread * fjreg; z3 += fgjreg * rdm_this_thread; // Difference density - rdm_this_thread = fetch(diff_gpu_for_tex, (float)(bj + j), (float)i); + rdm_this_thread = diff_gpu_for[(bj + j) + mc*i]; x += rdm_this_thread * fjreg; x3 += fgjreg * rdm_this_thread; } @@ -82,12 +73,12 @@ __global__ void ES_compute_for_partial(const scalar_type* const point_weights,ui scalar_type rdm_this_thread2; // Transition density - rdm_this_thread2 = fetch(tred_gpu_for_tex, (float)(bj + j), (float)i2); + rdm_this_thread = tred_gpu_for[(bj + j) + mc*i2]; z2 += rdm_this_thread2 * fjreg; z32 += fgjreg * rdm_this_thread2; // Difference density - rdm_this_thread2 = fetch(diff_gpu_for_tex, (float)(bj + j), (float)i2); + rdm_this_thread = diff_gpu_for[(bj + j) + mc*i2]; x2 += rdm_this_thread2 * fjreg; x32 += fgjreg * rdm_this_thread2; } diff --git a/g2g/excited/cuda/ES_compute_partial.h b/g2g/excited/cuda/ES_compute_partial.h index cb0332b80..b2e6fc833 100644 --- a/g2g/excited/cuda/ES_compute_partial.h +++ b/g2g/excited/cuda/ES_compute_partial.h @@ -2,13 +2,15 @@ template __global__ void ES_compute_partial(uint points, const scalar_type* function_values, uint m, const vec_type* gradient_values, - scalar_type* out_partial_tred, vec_type* out_tredxyz) + scalar_type* out_partial_tred, vec_type* out_tredxyz, + const scalar_type* tred_gpu) { uint point = blockIdx.x; uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE; uint i2 = i + DENSITY_BLOCK_SIZE; uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE; + uint mc=COALESCED_DIMENSION(m); bool valid_thread = (i < m); bool valid_thread2 = (i2 < m); @@ -49,7 +51,7 @@ __global__ void ES_compute_partial(uint points, scalar_type rdm_this_thread; // Transition density - rdm_this_thread = fetch(tred_gpu_tex, (float)(bj + j), (float)i); + rdm_this_thread = tred_gpu[(bj + j) + mc*i]; z += rdm_this_thread * fjreg; z3 += fgjreg * rdm_this_thread; } @@ -58,7 +60,7 @@ __global__ void ES_compute_partial(uint points, scalar_type rdm_this_thread2; // Transition density - rdm_this_thread2 = fetch(tred_gpu_tex, (float)(bj + j), (float)i2); + rdm_this_thread = tred_gpu[(bj + j) + mc*i2]; z2 += rdm_this_thread2 * fjreg; z32 += fgjreg * rdm_this_thread2; } diff --git a/g2g/excited/cuda/GS_compute_partial.h b/g2g/excited/cuda/GS_compute_partial.h index 90717a994..476d454cc 100644 --- a/g2g/excited/cuda/GS_compute_partial.h +++ b/g2g/excited/cuda/GS_compute_partial.h @@ -1,25 +1,16 @@ -#if FULL_DOUBLE -static __inline__ __device__ double fetch_double(texture t, float x, - float y) { - int2 v = tex2D(t, x, y); - return __hiloint2double(v.y, v.x); -} -#define fetch(t, x, y) fetch_double(t, x, y) -#else -#define fetch(t, x, y) tex2D(t, x, y) -#endif - template __global__ void GS_compute_partial(uint points, const scalar_type* function_values, uint m, const vec_type* gradient_values, - scalar_type* out_partial_density, vec_type* out_dxyz) + scalar_type* out_partial_density, vec_type* out_dxyz, + const scalar_type* rmm_gpu) { uint point = blockIdx.x; uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE; uint i2 = i + DENSITY_BLOCK_SIZE; uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE; + uint mc=COALESCED_DIMENSION(m); bool valid_thread = (i < m); bool valid_thread2 = (i2 < m); @@ -57,7 +48,7 @@ __global__ void GS_compute_partial(uint points, scalar_type rdm_this_thread; // GS density - rdm_this_thread = fetch(rmm_gpu_tex, (float)(bj + j), (float)i); + rdm_this_thread = rmm_gpu[(bj + j) + mc*i]; w += rdm_this_thread * fjreg; w3 += fgjreg * rdm_this_thread; } @@ -66,7 +57,7 @@ __global__ void GS_compute_partial(uint points, scalar_type rdm_this_thread2; // GS density - rdm_this_thread2 = fetch(rmm_gpu_tex, (float)(bj + j), (float)i2); + rdm_this_thread = rmm_gpu[(bj + j) + mc*i2]; w2 += rdm_this_thread2 * fjreg; w32 += fgjreg * rdm_this_thread2; } diff --git a/g2g/excited/cuda/g2g_calcgradXC.cu b/g2g/excited/cuda/g2g_calcgradXC.cu index 7b83fd4c9..642980c6a 100644 --- a/g2g/excited/cuda/g2g_calcgradXC.cu +++ b/g2g/excited/cuda/g2g_calcgradXC.cu @@ -26,14 +26,6 @@ using namespace std; namespace G2G { -#if FULL_DOUBLE -texture tred_gpu_for_tex; -texture diff_gpu_for_tex; -#else -texture tred_gpu_for_tex; -texture diff_gpu_for_tex; -#endif - #include "../../cuda/kernels/transpose.h" #include "ES_compute_for_partial.h" @@ -142,23 +134,18 @@ template void PointGroupGPU:: } } -// Form Bind Texture - cudaArray* cuArraytred; - cudaMallocArray(&cuArraytred, &tred_gpu_for_tex.channelDesc, tred_cpu.width, tred_cpu.height); - cudaMemcpyToArray(cuArraytred,0,0,tred_cpu.data,sizeof(scalar_type)*tred_cpu.width*tred_cpu.height,cudaMemcpyHostToDevice); - cudaBindTextureToArray(tred_gpu_for_tex, cuArraytred); - cudaArray* cuArraydiff; - cudaMallocArray(&cuArraydiff, &diff_gpu_for_tex.channelDesc, diff_cpu.width, diff_cpu.height); - cudaMemcpyToArray(cuArraydiff,0,0,diff_cpu.data,sizeof(scalar_type)*diff_cpu.width*diff_cpu.height,cudaMemcpyHostToDevice); - cudaBindTextureToArray(diff_gpu_for_tex, cuArraydiff); - +// Transition and Difference densities on GPU + CudaMatrix tred_gpu_for; + tred_gpu_for=tred_cpu; + CudaMatrix diff_gpu_for; + diff_gpu_for=diff_cpu; tred_cpu.deallocate(); diff_cpu.deallocate(); // CALCULATE PARTIAL DENSITIES #define compden_parameter \ point_weights_gpu.data,this->number_of_points,function_values_transposed.data,\ group_m,gradient_values_transposed.data, partial_tred_gpu.data,tredxyz_gpu.data, \ - partial_diff_gpu.data, diffxyz_gpu.data + partial_diff_gpu.data, diffxyz_gpu.data, tred_gpu_for.data, diff_gpu_for.data ES_compute_for_partial<<>>(compden_parameter); // ACCUMULATE DENSITIES @@ -258,11 +245,7 @@ template void PointGroupGPU:: gdens.deallocate(); tdens.deallocate(); ddens.deallocate(); gdens_xyz.deallocate(); tdens_xyz.deallocate(); ddens_xyz.deallocate(); -// Free Texture and Memory - cudaUnbindTexture(tred_gpu_for_tex); - cudaUnbindTexture(diff_gpu_for_tex); - cudaFreeArray(cuArraytred); - cudaFreeArray(cuArraydiff); +// Free Memory mat_dens_gpu.deallocate(); mat_diff_gpu.deallocate(); mat_tred_gpu.deallocate(); diff_accum_gpu.deallocate(); tred_accum_gpu.deallocate(); diffxyz_accum_gpu.deallocate(); diff --git a/g2g/excited/cuda/g2g_calculateG.cu b/g2g/excited/cuda/g2g_calculateG.cu index 577a19402..24c469e85 100644 --- a/g2g/excited/cuda/g2g_calculateG.cu +++ b/g2g/excited/cuda/g2g_calculateG.cu @@ -22,12 +22,6 @@ using namespace std; namespace G2G { -#if FULL_DOUBLE -texture tred_gpu_3rd_tex; -#else -texture tred_gpu_3rd_tex; -#endif - #include "../../cuda/kernels/transpose.h" #include "obtain_fock_cuda.h" #include "obtain_terms.h" @@ -122,17 +116,14 @@ void PointGroupGPU::solve_3rd_der(double* T, HostMatrix& Fo } } -// Form Bind Textures - cudaArray* cuArraytred; - cudaMallocArray(&cuArraytred, &tred_gpu_3rd_tex.channelDesc, tred_cpu.width, tred_cpu.height); - cudaMemcpyToArray(cuArraytred,0,0,tred_cpu.data,sizeof(scalar_type)*tred_cpu.width*tred_cpu.height,cudaMemcpyHostToDevice); - cudaBindTextureToArray(tred_gpu_3rd_tex, cuArraytred); - tred_cpu.deallocate(); +// Transition density on GPU + CudaMatrix tred_gpu_3rd; + tred_gpu_3rd=tred_cpu; // CALCULATE PARTIAL DENSITIES #define compden_parameter \ this->number_of_points,function_values_transposed.data,group_m,gradient_values_transposed.data,\ - partial_tred_gpu.data,tredxyz_gpu.data + partial_tred_gpu.data,tredxyz_gpu.data,tred_gpu_3rd.data ES_compute_3rd_partial<<>>(compden_parameter); // ACCUMULATE DENSITIES @@ -212,8 +203,6 @@ void PointGroupGPU::solve_3rd_der(double* T, HostMatrix& Fo // Free Memory smallFock.deallocate(); - cudaUnbindTexture(tred_gpu_3rd_tex); - cudaFreeArray(cuArraytred); Txyz.deallocate(); Dxyz.deallocate(); partial_tred_gpu.deallocate(); diff --git a/g2g/excited/cuda/g2g_calculateXC.cu b/g2g/excited/cuda/g2g_calculateXC.cu index fda983903..a3366213d 100644 --- a/g2g/excited/cuda/g2g_calculateXC.cu +++ b/g2g/excited/cuda/g2g_calculateXC.cu @@ -22,14 +22,6 @@ using namespace std; namespace G2G { -#if FULL_DOUBLE -texture rmm_gpu_tex; -texture tred_gpu_tex; -#else -texture rmm_gpu_tex; -texture tred_gpu_tex; -#endif - #include "../../cuda/kernels/transpose.h" #include "obtain_fock_cuda.h" #include "obtain_terms.h" @@ -129,12 +121,9 @@ void PointGroupGPU::solve_closed_lr(double* T, HostMatrix& } } -// Form Bind Textures - cudaArray* cuArraytred; - cudaMallocArray(&cuArraytred, &tred_gpu_tex.channelDesc, tred_cpu.width, tred_cpu.height); - cudaMemcpyToArray(cuArraytred,0,0,tred_cpu.data,sizeof(scalar_type)*tred_cpu.width*tred_cpu.height,cudaMemcpyHostToDevice); - cudaBindTextureToArray(tred_gpu_tex, cuArraytred); - tred_cpu.deallocate(); +// Transition density on GPU + CudaMatrix tred_gpu; + tred_gpu=tred_cpu; // CALCULATE PARTIAL DENSITIES #define compden_parameter \ @@ -211,8 +200,6 @@ void PointGroupGPU::solve_closed_lr(double* T, HostMatrix& // Free Memory smallFock.deallocate(); - cudaUnbindTexture(tred_gpu_tex); - cudaFreeArray(cuArraytred); Txyz.deallocate(); Dxyz.deallocate(); partial_tred_gpu.deallocate(); @@ -302,17 +289,14 @@ template void PointGroupGPU:: } } -// Form Bind Textures - cudaArray* cuArrayrmm; - cudaMallocArray(&cuArrayrmm, &rmm_gpu_tex.channelDesc, rmm_cpu.width, rmm_cpu.height); - cudaMemcpyToArray(cuArrayrmm,0,0,rmm_cpu.data,sizeof(scalar_type)*rmm_cpu.width*rmm_cpu.height,cudaMemcpyHostToDevice); - cudaBindTextureToArray(rmm_gpu_tex, cuArrayrmm); - rmm_cpu.deallocate(); +// GS Density on GPU + CudaMatrix rmm_gpu; + rmm_gpu=rmm_cpu; // CALCULATE PARTIAL DENSITIES #define compden_parameter \ this->number_of_points,function_values_transposed.data,group_m,gradient_values_transposed.data, \ - partial_densities_gpu.data,dxyz_gpu.data + partial_densities_gpu.data,dxyz_gpu.data,rmm_gpu.data GS_compute_partial<<>>(compden_parameter); // ACCUMULATE DENSITIES @@ -325,8 +309,6 @@ template void PointGroupGPU:: #undef accumulate_parameters // FREE MEMORY - cudaUnbindTexture(rmm_gpu_tex); - cudaFreeArray(cuArrayrmm); partial_densities_gpu.deallocate(); dxyz_gpu.deallocate(); function_values_transposed.deallocate();