diff --git a/src/dplasmaaux_cuda.c b/src/dplasmaaux_cuda.c
index ff410102..e90cf82a 100644
--- a/src/dplasmaaux_cuda.c
+++ b/src/dplasmaaux_cuda.c
@@ -5,11 +5,12 @@
  * $COPYRIGHT
  *
  */
-#include <cublas_v2.h>
-#include <cusolverDn.h>
-#include "potrf_cublas_utils.h"
+#include "dplasma/config.h"
 #include "parsec/utils/zone_malloc.h"
+#include "parsec/utils/show_help.h"
+#include <cublas_v2.h>
 #include "dplasmaaux_cuda.h"
+#include "potrf_cublas_utils.h"
 
 /* 
  * Global info ID's for cublas handles and workspaces
@@ -95,4 +96,4 @@ void *dplasma_create_cuda_handles(void *obj, void *_n)
     new->cusolverDn_handle = cusolver_handle;
 
     return new;
-}
\ No newline at end of file
+}
diff --git a/src/dplasmaaux_cuda.h b/src/dplasmaaux_cuda.h
index f0620f06..f7e57ed2 100644
--- a/src/dplasmaaux_cuda.h
+++ b/src/dplasmaaux_cuda.h
@@ -11,6 +11,52 @@
 
 #include "parsec/mca/device/cuda/device_cuda.h"
 
+/**
+ * DPLASMA currently supports a mix of cublas v1 and v2, but not in the same source file. Thus,
+ * the simplest way to provide common headers is to require the developer to manually specify
+ * when cublas_v2 is needed by including the header before dplasmaaux.h. Otherwise, we will include
+ * cublas.h (v1) automatically if CUDA is enabled.
+ */
+#if !defined(CUBLAS_V2_H_)
+#include <cublas.h>
+#endif  /* !defined(CUBLAS_V2_H_) */
+
+#define dplasma_cublas_side(side)                                         \
+    assert( (side == dplasmaRight) || (side == dplasmaLeft) );            \
+    side = (side == dplasmaRight) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
+
+
+#define dplasma_cublas_diag(diag)                                              \
+    assert( (diag == dplasmaNonUnit) || (diag == dplasmaUnit) );               \
+    diag = (diag == dplasmaNonUnit) ? CUBLAS_DIAG_NON_UNIT : CUBLAS_DIAG_UNIT;
+
+#define dplasma_cublas_fill(fill)                                                    \
+    assert( (fill == dplasmaLower) || (fill == dplasmaUpper) );                      \
+    fill = (fill == dplasmaLower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
+
+#if defined(PRECISION_z) || defined(PRECISION_c)
+#define dplasma_cublas_op(trans)                 \
+    assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) || (trans == dplasmaConjTrans) ); \
+    switch(trans){                               \
+        case dplasmaNoTrans:                     \
+            trans = CUBLAS_OP_N;                 \
+            break;                               \
+        case dplasmaTrans:                       \
+            trans = CUBLAS_OP_T;                 \
+            break;                               \
+        case dplasmaConjTrans:                   \
+            trans = CUBLAS_OP_C;                 \
+            break;                               \
+        default:                                 \
+            trans = CUBLAS_OP_N;                 \
+            break;                               \
+    }
+#else
+#define dplasma_cublas_op(trans)                                    \
+    assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) ); \
+    trans = (trans == dplasmaNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+#endif /* PRECISION_z || PRECISION_c */
+
 extern parsec_info_id_t CuHI;
 extern parsec_info_id_t WoSI;
 
@@ -31,6 +77,12 @@ void *dplasma_create_cuda_handles(void *obj, void *user);
         }                                                                    \
     } while(0)
 
+#if defined(CUBLAS_V2_H_)
+/* Support for cusolve requires cublas_v2 */
+#include <cusolverDn.h>
+
+char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status);
+
 #define DPLASMA_CUSOLVER_CHECK_STATUS( STR, STATUS, CODE )                                \
     do {                                                                                  \
         cusolverStatus_t __cusolver_status = (cusolverStatus_t) (STATUS);                 \
@@ -40,5 +92,6 @@ void *dplasma_create_cuda_handles(void *obj, void *user);
             CODE;                                                                         \
         }                                                                                 \
     } while(0)
+#endif  /* defined(CUBLAS_V2_H_) */
 
-#endif /* __DPLAMAAUX_CUDA_H__ */
\ No newline at end of file
+#endif /* __DPLAMAAUX_CUDA_H__ */
diff --git a/src/dplasmajdf.h b/src/dplasmajdf.h
index 8006c23f..4d2acbae 100644
--- a/src/dplasmajdf.h
+++ b/src/dplasmajdf.h
@@ -37,9 +37,5 @@
 #undef TEMP_TYPE
 #endif  /* PARSEC_HAVE_MPI */
 
-#if defined(DPLASMA_HAVE_CUDA)
-#include <cublas.h>
-#endif  /* defined(DPLASMA_HAVE_CUDA) */
-
 #endif /* _DPLASMAJDF_H_ */
 
diff --git a/src/dtd_wrappers/dplasma_z_dtd.h b/src/dtd_wrappers/dplasma_z_dtd.h
index 0a476aae..97c906da 100644
--- a/src/dtd_wrappers/dplasma_z_dtd.h
+++ b/src/dtd_wrappers/dplasma_z_dtd.h
@@ -20,7 +20,6 @@
 #include "parsec/utils/zone_malloc.h"
 #include "dplasmaaux.h"
 #include "potrf_cublas_utils.h"
-#include <cublas_v2.h>
 
 /* probably need to add this to substitions */
 #if defined(PRECISION_s)
@@ -52,4 +51,4 @@ parsec_task_class_t * parsec_dtd_create_ztrsm_task_class(parsec_taskpool_t * dtd
 parsec_task_class_t * parsec_dtd_create_zherk_task_class(parsec_taskpool_t * dtd_tp, int tile_full, int devices);
 parsec_task_class_t * parsec_dtd_create_zgemm_task_class(parsec_taskpool_t * dtd_tp, int tile_full, int devices);
 
-#endif /* __DTD_WRAPPERS_Z_H__ */
\ No newline at end of file
+#endif /* __DTD_WRAPPERS_Z_H__ */
diff --git a/src/dtd_wrappers/zgemm.c b/src/dtd_wrappers/zgemm.c
index 2496a1e5..80361892 100644
--- a/src/dtd_wrappers/zgemm.c
+++ b/src/dtd_wrappers/zgemm.c
@@ -6,6 +6,11 @@
  * @precisions normal z -> s d c
  *
  */
+#include "dplasma/config.h"
+
+#if defined(DPLASMA_HAVE_CUDA)
+#include <cublas_v2.h>
+#endif  /* defined(DPLASMA_HAVE_CUDA) */
 
 #include "dplasma_z_dtd.h"
 
@@ -77,6 +82,8 @@ parsec_core_zgemm_cuda(parsec_device_gpu_module_t* gpu_device,
 
     handles = parsec_info_get(&gpu_stream->infos, CuHI);
 
+    parsec_cuda_exec_stream_t* cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream;
+    cublasSetStream( handles->cublas_handle, cuda_stream->cuda_stream );
     status = cublasZgemm(handles->cublas_handle, transA, transB,
                           n, m, k,
                           &alphag, (cuDoubleComplex*)Ag, lda,
diff --git a/src/dtd_wrappers/zherk.c b/src/dtd_wrappers/zherk.c
index fb848b68..f0aed388 100644
--- a/src/dtd_wrappers/zherk.c
+++ b/src/dtd_wrappers/zherk.c
@@ -6,6 +6,11 @@
  * @precisions normal z -> s d c
  *
  */
+#include "dplasma/config.h"
+
+#if defined(DPLASMA_HAVE_CUDA)
+#include <cublas_v2.h>
+#endif  /* defined(DPLASMA_HAVE_CUDA) */
 
 #include "dplasma_z_dtd.h"
 
@@ -66,6 +71,8 @@ parsec_core_zherk_cuda(parsec_device_gpu_module_t* gpu_device,
     }
 #endif /* defined(PARSEC_DEBUG_NOISIER) */
 
+    parsec_cuda_exec_stream_t* cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream;
+    cublasSetStream( handles->cublas_handle, cuda_stream->cuda_stream );
     status = cublasZherk(handles->cublas_handle, uplo, trans,
                           m, n,
                           &alpha, (cuDoubleComplex*)Ag, lda,
@@ -105,4 +112,4 @@ parsec_dtd_create_zherk_task_class(parsec_taskpool_t* dtd_tp, int tile_full, int
         parsec_dtd_task_class_add_chore(dtd_tp, zherk_tc, PARSEC_DEV_CPU, parsec_core_zherk);
 
     return zherk_tc;
-}
\ No newline at end of file
+}
diff --git a/src/dtd_wrappers/zpotrf.c b/src/dtd_wrappers/zpotrf.c
index 1fe09c36..0b979be4 100644
--- a/src/dtd_wrappers/zpotrf.c
+++ b/src/dtd_wrappers/zpotrf.c
@@ -6,6 +6,11 @@
  * @precisions normal z -> s d c
  *
  */
+#include "dplasma/config.h"
+
+#if defined(DPLASMA_HAVE_CUDA)
+#include <cublas_v2.h>
+#endif  /* defined(DPLASMA_HAVE_CUDA) */
 
 #include "dplasma_z_dtd.h"
 
diff --git a/src/dtd_wrappers/ztrsm.c b/src/dtd_wrappers/ztrsm.c
index 6a415a87..612ec99e 100644
--- a/src/dtd_wrappers/ztrsm.c
+++ b/src/dtd_wrappers/ztrsm.c
@@ -6,6 +6,11 @@
  * @precisions normal z -> s d c
  *
  */
+#include "dplasma/config.h"
+
+#if defined(DPLASMA_HAVE_CUDA)
+#include <cublas_v2.h>
+#endif  /* defined(DPLASMA_HAVE_CUDA) */
 
 #include "dplasma_z_dtd.h"
 
@@ -74,6 +79,8 @@ parsec_core_ztrsm_cuda(parsec_device_gpu_module_t* gpu_device,
     }
 #endif /* defined(PARSEC_DEBUG_NOISIER) */
 
+    parsec_cuda_exec_stream_t* cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream;
+    cublasSetStream( handles->cublas_handle, cuda_stream->cuda_stream );
     status = cublasZtrsm(handles->cublas_handle,
                           side, uplo, trans, diag,
                           m, n, &alphag,
diff --git a/src/include/dplasma/constants.h b/src/include/dplasma/constants.h
index 79da941b..3d701a59 100644
--- a/src/include/dplasma/constants.h
+++ b/src/include/dplasma/constants.h
@@ -209,45 +209,4 @@ enum dplasma_matrix_type_e {
 extern char *dplasma_lapack_const_strings[];
 #define dplasma_lapack_const(plasma_const) (dplasma_lapack_const_strings[plasma_const][0])
 
-#if defined(DPLASMA_HAVE_CUDA)
-#include <cublas.h>
-
-#define dplasma_cublas_side(side)                                         \
-    assert( (side == dplasmaRight) || (side == dplasmaLeft) );            \
-    side = (side == dplasmaRight) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
-
-
-#define dplasma_cublas_diag(diag)                                              \
-    assert( (diag == dplasmaNonUnit) || (diag == dplasmaUnit) );               \
-    diag = (diag == dplasmaNonUnit) ? CUBLAS_DIAG_NON_UNIT : CUBLAS_DIAG_UNIT;
-
-#define dplasma_cublas_fill(fill)                                                    \
-    assert( (fill == dplasmaLower) || (fill == dplasmaUpper) );                      \
-    fill = (fill == dplasmaLower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
-
-#if defined(PRECISION_z) || defined(PRECISION_c)
-#define dplasma_cublas_op(trans)                 \
-    assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) || (trans == dplasmaConjTrans) ); \
-    switch(trans){                               \
-        case dplasmaNoTrans:                     \
-            trans = CUBLAS_OP_N;                 \
-            break;                               \
-        case dplasmaTrans:                       \
-            trans = CUBLAS_OP_T;                 \
-            break;                               \
-        case dplasmaConjTrans:                   \
-            trans = CUBLAS_OP_C;                 \
-            break;                               \
-        default:                                 \
-            trans = CUBLAS_OP_N;                 \
-            break;                               \
-    }
-#else
-#define dplasma_cublas_op(trans)                                    \
-    assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) ); \
-    trans = (trans == dplasmaNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-#endif /* PRECISION_z || PRECISION_c */
-
-#endif /* DPLASMA_HAVE_CUDA */
-
 #endif /* _DPLASMA_CONSTANTS_H_ */
diff --git a/src/include/dplasma/types_lapack.h b/src/include/dplasma/types_lapack.h
index 9c57b846..c5022c60 100644
--- a/src/include/dplasma/types_lapack.h
+++ b/src/include/dplasma/types_lapack.h
@@ -13,7 +13,7 @@
 
 /* Support for TILED/LAPACK matrix with non homogeneous datatypes across tiles.
  * NOTE: we are operating with the following condition:
- * For a given datacollection, if we reuse the datatype for one shape on different
+ * For a given data collection, if we reuse the datatype for one shape on different
  * locations, then other shapes will also be reusing a datatype for those locations.
  * The same happens between layouts.
  *
diff --git a/src/potrf_cublas_utils.h b/src/potrf_cublas_utils.h
index e3be832d..8ff02a9b 100644
--- a/src/potrf_cublas_utils.h
+++ b/src/potrf_cublas_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 The University of Tennessee and The University
+ * Copyright (c) 2020-2023 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  *
@@ -8,8 +8,6 @@
 #define DPLASMA_POTRF_CUBLAS_UTILS_H
 
 #if defined(DPLASMA_HAVE_CUDA)
-#include <cublas.h>
-#include <cusolverDn.h>
 
 typedef struct {
   char         *tmpmem;
@@ -20,28 +18,6 @@ typedef struct {
   void*         host_buffer;
 } dplasma_potrf_workspace_t;
 
-typedef cusolverStatus_t (*cublas_spotrf_v2_t) (
-        cusolverDnHandle_t handle, cublasFillMode_t uplo,
-        int n, float *A, int lda,
-        float *Workspace, int Lwork, int *devInfo );
-
-typedef cusolverStatus_t  (*cublas_dpotrf_v2_t) (
-        cusolverDnHandle_t handle, cublasFillMode_t uplo,
-        int n, double *A, int lda,
-        double *Workspace, int Lwork, int *devInfo );
-
-typedef cusolverStatus_t (*cublas_cpotrf_v2_t) (
-        cusolverDnHandle_t handle, cublasFillMode_t uplo,
-        int n, cuComplex *A, int lda,
-        cuComplex *Workspace, int Lwork, int *devInfo );
-
-typedef cusolverStatus_t (*cublas_zpotrf_v2_t) (
-        cusolverDnHandle_t handle, cublasFillMode_t uplo,
-        int n, cuDoubleComplex *A, int lda,
-        cuDoubleComplex *Workspace, int Lwork, int *devInfo );
-
-char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status);
-
 #endif
 
 #endif //DPLASMA_POTRF_CUBLAS_UTILS_H
diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf
index 58f47e33..2749324e 100644
--- a/src/zpotrf_L.jdf
+++ b/src/zpotrf_L.jdf
@@ -9,6 +9,12 @@ extern "C" %{
  * @precisions normal z -> s d c
  *
  */
+#include "dplasma/config.h"
+#if defined(DPLASMA_HAVE_CUDA)
+#include <cublas_v2.h>
+#include "potrf_cublas_utils.h"
+#endif  /* defined(DPLASMA_HAVE_CUDA) */
+
 #include "dplasmajdf.h"
 #include "parsec/data_dist/matrix/matrix.h"
 
@@ -16,11 +22,6 @@ extern "C" %{
 #include "parsec/recursive.h"
 static void zpotrf_L_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_callback_t* data);
 
-#if defined(DPLASMA_HAVE_CUDA)
-#include <cublas_v2.h>
-#include "potrf_cublas_utils.h"
-#endif  /* defined(DPLASMA_HAVE_CUDA) */
-
 /* Define the different shapes this JDF is using */
 #define DEFAULT 0
 
@@ -163,7 +164,7 @@ BODY [type=CUDA
 
     status = cusolverDnZpotrf( handles->cusolverDn_handle, cublas_uplo, tempkm, T, ldak, workspace, wp->lwork, d_iinfo);
     PARSEC_CUDA_CHECK_ERROR( "cublasZpotrf_v2 ", status,
-                            {return -1;} );
+                            {return PARSEC_HOOK_RETURN_ERROR;} );
 }
 END
 
@@ -228,13 +229,14 @@ BODY [type=CUDA]
     cublasStatus_t status;
     handles = parsec_info_get(&gpu_stream->infos, CuHandlesID);
     assert(NULL != handles);
+    cublasSetStream( handles->cublas_handle, parsec_body.stream );
     status = cublasZtrsm_v2(handles->cublas_handle,
                             CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER,
                             CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT,
                             tempmm, descA->nb,
                             &zone, T, ldak, C, ldam);
     PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status,
-                            {return -1;} );
+                            {return PARSEC_HOOK_RETURN_ERROR;} );
 }
 END
 
@@ -335,13 +337,14 @@ BODY [type=CUDA]
 
     handles = parsec_info_get(&gpu_stream->infos, CuHandlesID);
     assert(NULL != handles);
+    cublasSetStream( handles->cublas_handle, parsec_body.stream );
     status = cublasZherk_v2( handles->cublas_handle,
                              CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
                              tempmm, descA->mb,
                              &mzone, A, ldam_A,
                              &zone, T, ldam_T);
     PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status,
-                            {return -1;} );
+                            {return PARSEC_HOOK_RETURN_ERROR;} );
 }
 END
 
@@ -446,8 +449,9 @@ BODY [type=CUDA
     int ldan_B = LDA(ddescA, B);
     int ldam_C = LDA(ddescA, C);
 
-    dplasma_cuda_handles_t *handles;
     cublasStatus_t status;
+    dplasma_cuda_handles_t *handles;
+    
     assert( ldam_A <= descA->mb );
     assert( ldan_B <= descA->mb );
     assert( ldam_C <= descA->mb );
@@ -455,6 +459,7 @@ BODY [type=CUDA
     handles = parsec_info_get(&gpu_stream->infos, CuHandlesID);
     assert(NULL != handles);
 
+    cublasSetStream( handles->cublas_handle, parsec_body.stream );
     status = cublasZgemm_v2( handles->cublas_handle,
                              CUBLAS_OP_N, CUBLAS_OP_C,
                              tempmm, descA->mb, descA->mb,
@@ -462,7 +467,7 @@ BODY [type=CUDA
                                      (cuDoubleComplex*)B, ldan_B,
                              &zone,  (cuDoubleComplex*)C, ldam_C );
     PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status,
-                            {return -1;} );
+                            {return PARSEC_HOOK_RETURN_ERROR;} );
 }
 END
 
diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf
index d64194e7..e9e1afc3 100644
--- a/src/zpotrf_U.jdf
+++ b/src/zpotrf_U.jdf
@@ -8,6 +8,12 @@ extern "C" %{
  * @precisions normal z -> s d c
  *
  */
+#include "dplasma/config.h"
+#if defined(DPLASMA_HAVE_CUDA)
+#include <cublas_v2.h>
+#include "potrf_cublas_utils.h"
+#endif  /* defined(DPLASMA_HAVE_CUDA) */
+
 #include "dplasmajdf.h"
 #include "parsec/data_dist/matrix/matrix.h"
 
@@ -15,11 +21,6 @@ extern "C" %{
 #include "parsec/recursive.h"
 static void zpotrf_U_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_callback_t* data);
 
-#if defined(DPLASMA_HAVE_CUDA)
-#include <cublas_v2.h>
-#include "potrf_cublas_utils.h"
-#endif  /* defined(DPLASMA_HAVE_CUDA) */
-
 /* Define the different shapes this JDF is using */
 #define DEFAULT 0
 
@@ -162,7 +163,7 @@ BODY [type=CUDA
 
     status = cusolverDnZpotrf( handles->cusolverDn_handle, cublas_uplo, tempkn, T, ldak, workspace, wp->lwork, d_iinfo);
     PARSEC_CUDA_CHECK_ERROR( "cublasZpotrf_v2 ", status,
-                            {return -1;} );
+                            {return PARSEC_HOOK_RETURN_ERROR;} );
 }
 END
 
@@ -230,13 +231,14 @@ BODY [type=CUDA]
     cublasStatus_t status;
     handles = parsec_info_get(&gpu_stream->infos, CuHandlesID);
     assert(NULL != handles);
+    cublasSetStream( handles->cublas_handle, parsec_body.stream );
     status = cublasZtrsm_v2(handles->cublas_handle,
                             CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
                             CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT,
                             descA->mb, tempnn,
                             &zone, T, ldak_T, C, ldak_C);
     PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status,
-                            {return -1;} );
+                            {return PARSEC_HOOK_RETURN_ERROR;} );
 }
 END
 
@@ -337,6 +339,7 @@ BODY [type=CUDA]
 
     handles = parsec_info_get(&gpu_stream->infos, CuHandlesID);
     assert(NULL != handles);
+    cublasSetStream( handles->cublas_handle, parsec_body.stream );
     status = cublasZherk_v2( handles->cublas_handle,
                              CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_C,
                              tempnn, descA->mb,
@@ -344,7 +347,7 @@ BODY [type=CUDA]
                              &zone, T, ldan);
 
     PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status,
-                            {return -1;} );
+                            {return PARSEC_HOOK_RETURN_ERROR;} );
 }
 END
 
@@ -460,7 +463,7 @@ BODY [type=CUDA
     handles = parsec_info_get(&gpu_stream->infos, CuHandlesID);
     assert(NULL != handles);
 
-    cublasSetKernelStream( parsec_body.stream );
+    cublasSetStream( handles->cublas_handle, parsec_body.stream );
     status = cublasZgemm_v2( handles->cublas_handle,
                              CUBLAS_OP_C, CUBLAS_OP_N,
                              descA->mb, tempnn, descA->nb,
@@ -468,7 +471,7 @@ BODY [type=CUDA
                                      (cuDoubleComplex*)B, ldak_B,
                              &zone,  (cuDoubleComplex*)C, ldam_C);
     PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status,
-                            {return -1;} );
+                            {return PARSEC_HOOK_RETURN_ERROR;} );
 
     printlogcuda("CUDA_zgemm( %d, %d, %d )\n\t( %s, %s, %d, %d, %d, %f, A(%d,%d)[%p], %d, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n",
                  m, n, k,
diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c
index 71da5948..8699ee0d 100644
--- a/src/zpotrf_wrapper.c
+++ b/src/zpotrf_wrapper.c
@@ -12,9 +12,12 @@
 #include "dplasma.h"
 #include "dplasma/types.h"
 #include "dplasma/types_lapack.h"
-#include "dplasmaaux.h"
+#if defined(DPLASMA_HAVE_CUDA)
+#include <cublas_v2.h>
 #include "potrf_cublas_utils.h"
 #include "parsec/utils/zone_malloc.h"
+#endif  /* defined(DPLASMA_HAVE_CUDA) */
+#include "dplasmaaux.h"
 
 #include "zpotrf_U.h"
 #include "zpotrf_L.h"
diff --git a/tests/common.c b/tests/common.c
index c4212198..78a3d865 100644
--- a/tests/common.c
+++ b/tests/common.c
@@ -27,8 +27,8 @@
 #include <mpi.h>
 #endif
 #if defined(DPLASMA_HAVE_CUDA)
+#include <cublas_v2.h>
 #include "dplasmaaux.h"
-#include <cublas.h>
 #include <cusolverDn.h>
 #endif
 
@@ -732,7 +732,6 @@ void cleanup_parsec(parsec_context_t* parsec, int *iparam)
 #if defined(DPLASMA_HAVE_CUDA)
     parsec_info_id_t CuHI = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL);
     parsec_info_unregister(&parsec_per_stream_infos, CuHI, NULL);
-    cublasShutdown();
 #endif
 
     parsec_fini(&parsec);
diff --git a/tools/gemmpeak/cu-gemmpeak.cpp b/tools/gemmpeak/cu-gemmpeak.cpp
index 0aacc069..7f91d5e1 100644
--- a/tools/gemmpeak/cu-gemmpeak.cpp
+++ b/tools/gemmpeak/cu-gemmpeak.cpp
@@ -1,8 +1,8 @@
 #include <stdio.h>
 #include <stdlib.h>
-#include "cuda.h"
-#include "cuda_runtime_api.h"
-#include "cublas.h"
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cublas.h>
 
 inline float mmax( float a, float b ) { return a > b ? a : b; }