diff --git a/g2g/excited/cuda/ES_compute_3rd_partial.h b/g2g/excited/cuda/ES_compute_3rd_partial.h
index c772e0de0..048b46c88 100644
--- a/g2g/excited/cuda/ES_compute_3rd_partial.h
+++ b/g2g/excited/cuda/ES_compute_3rd_partial.h
@@ -1,25 +1,16 @@
-#if FULL_DOUBLE
-static __inline__ __device__ double fetch_double(texture<int2, 2> t, float x,
-                                                 float y) {
-  int2 v = tex2D(t, x, y);
-  return __hiloint2double(v.y, v.x);
-}
-#define fetch(t, x, y) fetch_double(t, x, y)
-#else
-#define fetch(t, x, y) tex2D(t, x, y)
-#endif
-
 template <class scalar_type, bool compute_energy, bool compute_factor, bool lda>
 __global__ void ES_compute_3rd_partial(uint points,
                                 const scalar_type* function_values, uint m,
                                 const vec_type<scalar_type, 4>* gradient_values,
-                                scalar_type* out_partial_tred, vec_type<scalar_type, 4>* out_tredxyz)
+                                scalar_type* out_partial_tred, vec_type<scalar_type, 4>* out_tredxyz,
+				const scalar_type* tred_gpu_3rd)
 {
 
   uint point = blockIdx.x;
   uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE;
   uint i2 = i + DENSITY_BLOCK_SIZE;
   uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE;
+  uint mc=COALESCED_DIMENSION(m);
   bool valid_thread = (i < m);
   bool valid_thread2 = (i2 < m);
 
@@ -60,7 +51,7 @@ __global__ void ES_compute_3rd_partial(uint points,
           scalar_type rdm_this_thread;
 
           // Transition density
-          rdm_this_thread = fetch(tred_gpu_3rd_tex, (float)(bj + j), (float)i);
+          rdm_this_thread = tred_gpu_3rd[(bj + j) + mc*i];
           z  += rdm_this_thread * fjreg;
           z3 += fgjreg * rdm_this_thread;
         }
@@ -69,7 +60,7 @@ __global__ void ES_compute_3rd_partial(uint points,
           scalar_type rdm_this_thread2;
 
           // Transition density
-          rdm_this_thread2 = fetch(tred_gpu_3rd_tex, (float)(bj + j), (float)i2);
+          rdm_this_thread = tred_gpu_3rd[(bj + j) + mc*i2];
           z2  += rdm_this_thread2 * fjreg;
           z32 += fgjreg * rdm_this_thread2;
         }
diff --git a/g2g/excited/cuda/ES_compute_for_partial.h b/g2g/excited/cuda/ES_compute_for_partial.h
index f0679233f..bac302879 100644
--- a/g2g/excited/cuda/ES_compute_for_partial.h
+++ b/g2g/excited/cuda/ES_compute_for_partial.h
@@ -1,26 +1,17 @@
-#if FULL_DOUBLE
-static __inline__ __device__ double fetch_double(texture<int2, 2> t, float x,
-                                                 float y) {
-  int2 v = tex2D(t, x, y);
-  return __hiloint2double(v.y, v.x);
-}
-#define fetch(t, x, y) fetch_double(t, x, y)
-#else
-#define fetch(t, x, y) tex2D(t, x, y)
-#endif
-
 template <class scalar_type, bool compute_energy, bool compute_factor, bool lda>
 __global__ void ES_compute_for_partial(const scalar_type* const point_weights,uint points,
                                 const scalar_type* function_values, uint m,
                                 const vec_type<scalar_type, 4>* gradient_values,
                                 scalar_type* out_partial_tred, vec_type<scalar_type, 4>* out_tredxyz,
-                                scalar_type* out_partial_diff, vec_type<scalar_type, 4>* out_diffxyz)
+                                scalar_type* out_partial_diff, vec_type<scalar_type, 4>* out_diffxyz,
+				const scalar_type* tred_gpu_for, const scalar_type* diff_gpu_for)
 {
 
   uint point = blockIdx.x;
   uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE;
   uint i2 = i + DENSITY_BLOCK_SIZE;
   uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE;
+  uint mc=COALESCED_DIMENSION(m);
   bool valid_thread = (i < m);
   bool valid_thread2 = (i2 < m);
 
@@ -68,12 +59,12 @@ __global__ void ES_compute_for_partial(const scalar_type* const point_weights,ui
           scalar_type rdm_this_thread;
 
           // Transition density
-          rdm_this_thread = fetch(tred_gpu_for_tex, (float)(bj + j), (float)i);
+          rdm_this_thread = tred_gpu_for[(bj + j) + mc*i];
           z  += rdm_this_thread * fjreg;
           z3 += fgjreg * rdm_this_thread;
 
           // Difference density
-          rdm_this_thread = fetch(diff_gpu_for_tex, (float)(bj + j), (float)i);
+          rdm_this_thread = diff_gpu_for[(bj + j) + mc*i];
           x  += rdm_this_thread * fjreg;
           x3 += fgjreg * rdm_this_thread;
         }
@@ -82,12 +73,12 @@ __global__ void ES_compute_for_partial(const scalar_type* const point_weights,ui
           scalar_type rdm_this_thread2;
 
           // Transition density
-          rdm_this_thread2 = fetch(tred_gpu_for_tex, (float)(bj + j), (float)i2);
+          rdm_this_thread = tred_gpu_for[(bj + j) + mc*i2];
           z2  += rdm_this_thread2 * fjreg;
           z32 += fgjreg * rdm_this_thread2;
 
           // Difference density
-          rdm_this_thread2 = fetch(diff_gpu_for_tex, (float)(bj + j), (float)i2);
+          rdm_this_thread = diff_gpu_for[(bj + j) + mc*i2];
           x2  += rdm_this_thread2 * fjreg;
           x32 += fgjreg * rdm_this_thread2;
         }
diff --git a/g2g/excited/cuda/ES_compute_partial.h b/g2g/excited/cuda/ES_compute_partial.h
index cb0332b80..b2e6fc833 100644
--- a/g2g/excited/cuda/ES_compute_partial.h
+++ b/g2g/excited/cuda/ES_compute_partial.h
@@ -2,13 +2,15 @@ template <class scalar_type, bool compute_energy, bool compute_factor, bool lda>
 __global__ void ES_compute_partial(uint points,
                                 const scalar_type* function_values, uint m,
                                 const vec_type<scalar_type, 4>* gradient_values,
-                                scalar_type* out_partial_tred, vec_type<scalar_type, 4>* out_tredxyz)
+                                scalar_type* out_partial_tred, vec_type<scalar_type, 4>* out_tredxyz,
+				const scalar_type* tred_gpu)
 {
 
   uint point = blockIdx.x;
   uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE;
   uint i2 = i + DENSITY_BLOCK_SIZE;
   uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE;
+  uint mc=COALESCED_DIMENSION(m);
   bool valid_thread = (i < m);
   bool valid_thread2 = (i2 < m);
 
@@ -49,7 +51,7 @@ __global__ void ES_compute_partial(uint points,
           scalar_type rdm_this_thread;
 
           // Transition density
-          rdm_this_thread = fetch(tred_gpu_tex, (float)(bj + j), (float)i);
+          rdm_this_thread = tred_gpu[(bj + j) + mc*i];
           z  += rdm_this_thread * fjreg;
           z3 += fgjreg * rdm_this_thread;
         }
@@ -58,7 +60,7 @@ __global__ void ES_compute_partial(uint points,
           scalar_type rdm_this_thread2;
 
           // Transition density
-          rdm_this_thread2 = fetch(tred_gpu_tex, (float)(bj + j), (float)i2);
+          rdm_this_thread = tred_gpu[(bj + j) + mc*i2];
           z2  += rdm_this_thread2 * fjreg;
           z32 += fgjreg * rdm_this_thread2;
         }
diff --git a/g2g/excited/cuda/GS_compute_partial.h b/g2g/excited/cuda/GS_compute_partial.h
index 90717a994..476d454cc 100644
--- a/g2g/excited/cuda/GS_compute_partial.h
+++ b/g2g/excited/cuda/GS_compute_partial.h
@@ -1,25 +1,16 @@
-#if FULL_DOUBLE
-static __inline__ __device__ double fetch_double(texture<int2, 2> t, float x,
-                                                 float y) {
-  int2 v = tex2D(t, x, y);
-  return __hiloint2double(v.y, v.x);
-}
-#define fetch(t, x, y) fetch_double(t, x, y)
-#else
-#define fetch(t, x, y) tex2D(t, x, y)
-#endif
-
 template <class scalar_type, bool compute_energy, bool compute_factor, bool lda>
 __global__ void GS_compute_partial(uint points,
                                 const scalar_type* function_values, uint m,
                                 const vec_type<scalar_type, 4>* gradient_values,
-                                scalar_type* out_partial_density, vec_type<scalar_type, 4>* out_dxyz)
+                                scalar_type* out_partial_density, vec_type<scalar_type, 4>* out_dxyz,
+				const scalar_type* rmm_gpu)
 {
 
   uint point = blockIdx.x;
   uint i = threadIdx.x + blockIdx.y * 2 * DENSITY_BLOCK_SIZE;
   uint i2 = i + DENSITY_BLOCK_SIZE;
   uint min_i = blockIdx.y * 2 * DENSITY_BLOCK_SIZE + DENSITY_BLOCK_SIZE;
+  uint mc=COALESCED_DIMENSION(m);
   bool valid_thread = (i < m);
   bool valid_thread2 = (i2 < m);
 
@@ -57,7 +48,7 @@ __global__ void GS_compute_partial(uint points,
           scalar_type rdm_this_thread;
 
           // GS density
-          rdm_this_thread = fetch(rmm_gpu_tex, (float)(bj + j), (float)i);
+	  rdm_this_thread = rmm_gpu[(bj + j) + mc*i];
           w  += rdm_this_thread * fjreg;
           w3 += fgjreg * rdm_this_thread;
         }
@@ -66,7 +57,7 @@ __global__ void GS_compute_partial(uint points,
           scalar_type rdm_this_thread2;
 
           // GS density
-          rdm_this_thread2 = fetch(rmm_gpu_tex, (float)(bj + j), (float)i2);
+	  rdm_this_thread = rmm_gpu[(bj + j) + mc*i2];
           w2  += rdm_this_thread2 * fjreg;
           w32 += fgjreg * rdm_this_thread2;
         }
diff --git a/g2g/excited/cuda/g2g_calcgradXC.cu b/g2g/excited/cuda/g2g_calcgradXC.cu
index 7b83fd4c9..642980c6a 100644
--- a/g2g/excited/cuda/g2g_calcgradXC.cu
+++ b/g2g/excited/cuda/g2g_calcgradXC.cu
@@ -26,14 +26,6 @@
 using namespace std;
 
 namespace G2G {
-#if FULL_DOUBLE
-texture<int2, 2, cudaReadModeElementType> tred_gpu_for_tex;
-texture<int2, 2, cudaReadModeElementType> diff_gpu_for_tex;
-#else
-texture<float, 2, cudaReadModeElementType> tred_gpu_for_tex;
-texture<float, 2, cudaReadModeElementType> diff_gpu_for_tex;
-#endif
-
 #include "../../cuda/kernels/transpose.h"
 #include "ES_compute_for_partial.h"
 
@@ -142,23 +134,18 @@ template<class scalar_type> void PointGroupGPU<scalar_type>::
      }
    }
 
-// Form Bind Texture
-   cudaArray* cuArraytred;
-   cudaMallocArray(&cuArraytred, &tred_gpu_for_tex.channelDesc, tred_cpu.width, tred_cpu.height);
-   cudaMemcpyToArray(cuArraytred,0,0,tred_cpu.data,sizeof(scalar_type)*tred_cpu.width*tred_cpu.height,cudaMemcpyHostToDevice);
-   cudaBindTextureToArray(tred_gpu_for_tex, cuArraytred);
-   cudaArray* cuArraydiff;
-   cudaMallocArray(&cuArraydiff, &diff_gpu_for_tex.channelDesc, diff_cpu.width, diff_cpu.height);
-   cudaMemcpyToArray(cuArraydiff,0,0,diff_cpu.data,sizeof(scalar_type)*diff_cpu.width*diff_cpu.height,cudaMemcpyHostToDevice);
-   cudaBindTextureToArray(diff_gpu_for_tex, cuArraydiff);
-
+// Transition and Difference densities on GPU
+   CudaMatrix<scalar_type> tred_gpu_for;
+   tred_gpu_for=tred_cpu;
+   CudaMatrix<scalar_type> diff_gpu_for;
+   diff_gpu_for=diff_cpu;
    tred_cpu.deallocate(); diff_cpu.deallocate();
 
 // CALCULATE PARTIAL DENSITIES
 #define compden_parameter \
    point_weights_gpu.data,this->number_of_points,function_values_transposed.data,\
    group_m,gradient_values_transposed.data, partial_tred_gpu.data,tredxyz_gpu.data, \
-   partial_diff_gpu.data, diffxyz_gpu.data
+   partial_diff_gpu.data, diffxyz_gpu.data, tred_gpu_for.data, diff_gpu_for.data
    ES_compute_for_partial<scalar_type,true,true,false><<<threadGrid, threadBlock>>>(compden_parameter);
 
 // ACCUMULATE DENSITIES
@@ -258,11 +245,7 @@ template<class scalar_type> void PointGroupGPU<scalar_type>::
    gdens.deallocate(); tdens.deallocate(); ddens.deallocate();
    gdens_xyz.deallocate(); tdens_xyz.deallocate(); ddens_xyz.deallocate();
 
-// Free Texture and Memory
-   cudaUnbindTexture(tred_gpu_for_tex);
-   cudaUnbindTexture(diff_gpu_for_tex);
-   cudaFreeArray(cuArraytred);
-   cudaFreeArray(cuArraydiff);
+// Free Memory
    mat_dens_gpu.deallocate(); mat_diff_gpu.deallocate(); 
    mat_tred_gpu.deallocate(); diff_accum_gpu.deallocate(); 
    tred_accum_gpu.deallocate(); diffxyz_accum_gpu.deallocate(); 
diff --git a/g2g/excited/cuda/g2g_calculateG.cu b/g2g/excited/cuda/g2g_calculateG.cu
index 577a19402..24c469e85 100644
--- a/g2g/excited/cuda/g2g_calculateG.cu
+++ b/g2g/excited/cuda/g2g_calculateG.cu
@@ -22,12 +22,6 @@
 using namespace std;
 
 namespace G2G {
-#if FULL_DOUBLE
-texture<int2, 2, cudaReadModeElementType> tred_gpu_3rd_tex;
-#else
-texture<float, 2, cudaReadModeElementType> tred_gpu_3rd_tex;
-#endif
-
 #include "../../cuda/kernels/transpose.h"
 #include "obtain_fock_cuda.h"
 #include "obtain_terms.h"
@@ -122,17 +116,14 @@ void PointGroupGPU<scalar_type>::solve_3rd_der(double* T, HostMatrix<double>& Fo
      }
    }
 
-// Form Bind Textures
-   cudaArray* cuArraytred;
-   cudaMallocArray(&cuArraytred, &tred_gpu_3rd_tex.channelDesc, tred_cpu.width, tred_cpu.height);
-   cudaMemcpyToArray(cuArraytred,0,0,tred_cpu.data,sizeof(scalar_type)*tred_cpu.width*tred_cpu.height,cudaMemcpyHostToDevice);
-   cudaBindTextureToArray(tred_gpu_3rd_tex, cuArraytred);
-   tred_cpu.deallocate();
+// Transition density on GPU
+   CudaMatrix<scalar_type> tred_gpu_3rd;
+   tred_gpu_3rd=tred_cpu;
 
 // CALCULATE PARTIAL DENSITIES
 #define compden_parameter \
    this->number_of_points,function_values_transposed.data,group_m,gradient_values_transposed.data,\
-   partial_tred_gpu.data,tredxyz_gpu.data
+   partial_tred_gpu.data,tredxyz_gpu.data,tred_gpu_3rd.data
    ES_compute_3rd_partial<scalar_type,true,true,false><<<threadGrid, threadBlock>>>(compden_parameter);
 
 // ACCUMULATE DENSITIES
@@ -212,8 +203,6 @@ void PointGroupGPU<scalar_type>::solve_3rd_der(double* T, HostMatrix<double>& Fo
 
 // Free Memory
    smallFock.deallocate();
-   cudaUnbindTexture(tred_gpu_3rd_tex);
-   cudaFreeArray(cuArraytred);
    Txyz.deallocate();
    Dxyz.deallocate();
    partial_tred_gpu.deallocate();
diff --git a/g2g/excited/cuda/g2g_calculateXC.cu b/g2g/excited/cuda/g2g_calculateXC.cu
index fda983903..a3366213d 100644
--- a/g2g/excited/cuda/g2g_calculateXC.cu
+++ b/g2g/excited/cuda/g2g_calculateXC.cu
@@ -22,14 +22,6 @@
 using namespace std;
 
 namespace G2G {
-#if FULL_DOUBLE
-texture<int2, 2, cudaReadModeElementType> rmm_gpu_tex;
-texture<int2, 2, cudaReadModeElementType> tred_gpu_tex;
-#else
-texture<float, 2, cudaReadModeElementType> rmm_gpu_tex;
-texture<float, 2, cudaReadModeElementType> tred_gpu_tex;
-#endif
-
 #include "../../cuda/kernels/transpose.h"
 #include "obtain_fock_cuda.h"
 #include "obtain_terms.h"
@@ -129,12 +121,9 @@ void PointGroupGPU<scalar_type>::solve_closed_lr(double* T, HostMatrix<double>&
      }
    }
 
-// Form Bind Textures
-   cudaArray* cuArraytred;
-   cudaMallocArray(&cuArraytred, &tred_gpu_tex.channelDesc, tred_cpu.width, tred_cpu.height);
-   cudaMemcpyToArray(cuArraytred,0,0,tred_cpu.data,sizeof(scalar_type)*tred_cpu.width*tred_cpu.height,cudaMemcpyHostToDevice);
-   cudaBindTextureToArray(tred_gpu_tex, cuArraytred);
-   tred_cpu.deallocate();
+// Transition density on GPU
+   CudaMatrix<scalar_type> tred_gpu;
+   tred_gpu=tred_cpu;
 
 // CALCULATE PARTIAL DENSITIES
 #define compden_parameter \
@@ -211,8 +200,6 @@ void PointGroupGPU<scalar_type>::solve_closed_lr(double* T, HostMatrix<double>&
 
 // Free Memory
    smallFock.deallocate();
-   cudaUnbindTexture(tred_gpu_tex);
-   cudaFreeArray(cuArraytred);
    Txyz.deallocate();
    Dxyz.deallocate();
    partial_tred_gpu.deallocate();
@@ -302,17 +289,14 @@ template<class scalar_type> void PointGroupGPU<scalar_type>::
      }
    }
 
-// Form Bind Textures
-   cudaArray* cuArrayrmm;
-   cudaMallocArray(&cuArrayrmm, &rmm_gpu_tex.channelDesc, rmm_cpu.width, rmm_cpu.height);
-   cudaMemcpyToArray(cuArrayrmm,0,0,rmm_cpu.data,sizeof(scalar_type)*rmm_cpu.width*rmm_cpu.height,cudaMemcpyHostToDevice);
-   cudaBindTextureToArray(rmm_gpu_tex, cuArrayrmm);
-   rmm_cpu.deallocate();
+// GS Density on GPU
+  CudaMatrix<scalar_type> rmm_gpu;
+  rmm_gpu=rmm_cpu;
 
 // CALCULATE PARTIAL DENSITIES
 #define compden_parameter \
    this->number_of_points,function_values_transposed.data,group_m,gradient_values_transposed.data, \
-   partial_densities_gpu.data,dxyz_gpu.data
+   partial_densities_gpu.data,dxyz_gpu.data,rmm_gpu.data
    GS_compute_partial<scalar_type,true,true,false><<<threadGrid, threadBlock>>>(compden_parameter);
 
 // ACCUMULATE DENSITIES
@@ -325,8 +309,6 @@ template<class scalar_type> void PointGroupGPU<scalar_type>::
 #undef accumulate_parameters
 
 // FREE MEMORY
-   cudaUnbindTexture(rmm_gpu_tex);
-   cudaFreeArray(cuArrayrmm);
    partial_densities_gpu.deallocate();
    dxyz_gpu.deallocate();
    function_values_transposed.deallocate();