diff --git a/src/dplasmaaux_cuda.c b/src/dplasmaaux_cuda.c index ff410102..e90cf82a 100644 --- a/src/dplasmaaux_cuda.c +++ b/src/dplasmaaux_cuda.c @@ -5,11 +5,12 @@ * $COPYRIGHT * */ -#include -#include -#include "potrf_cublas_utils.h" +#include "dplasma/config.h" #include "parsec/utils/zone_malloc.h" +#include "parsec/utils/show_help.h" +#include #include "dplasmaaux_cuda.h" +#include "potrf_cublas_utils.h" /* * Global info ID's for cublas handles and workspaces @@ -95,4 +96,4 @@ void *dplasma_create_cuda_handles(void *obj, void *_n) new->cusolverDn_handle = cusolver_handle; return new; -} \ No newline at end of file +} diff --git a/src/dplasmaaux_cuda.h b/src/dplasmaaux_cuda.h index f0620f06..f7e57ed2 100644 --- a/src/dplasmaaux_cuda.h +++ b/src/dplasmaaux_cuda.h @@ -11,6 +11,52 @@ #include "parsec/mca/device/cuda/device_cuda.h" +/** + * DPLASMA currently supports a mix of cublas v1 and v2, but not in the same source file. Thus, + * the simplest way to provide common headers is to require the developer to manually specify + * when cublas_v2 is needed by including the header before dplasmaaux.h. Otherwise, we will include + * cublas.h (v1) automatically if CUDA is enabled. + */ +#if !defined(CUBLAS_V2_H_) +#include +#endif /* !defined(CUBLAS_V2_H_) */ + +#define dplasma_cublas_side(side) \ + assert( (side == dplasmaRight) || (side == dplasmaLeft) ); \ + side = (side == dplasmaRight) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; + + +#define dplasma_cublas_diag(diag) \ + assert( (diag == dplasmaNonUnit) || (diag == dplasmaUnit) ); \ + diag = (diag == dplasmaNonUnit) ? CUBLAS_DIAG_NON_UNIT : CUBLAS_DIAG_UNIT; + +#define dplasma_cublas_fill(fill) \ + assert( (fill == dplasmaLower) || (fill == dplasmaUpper) ); \ + fill = (fill == dplasmaLower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; + +#if defined(PRECISION_z) || defined(PRECISION_c) +#define dplasma_cublas_op(trans) \ + assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) || (trans == dplasmaConjTrans) ); \ + switch(trans){ \ + case dplasmaNoTrans: \ + trans = CUBLAS_OP_N; \ + break; \ + case dplasmaTrans: \ + trans = CUBLAS_OP_T; \ + break; \ + case dplasmaConjTrans: \ + trans = CUBLAS_OP_C; \ + break; \ + default: \ + trans = CUBLAS_OP_N; \ + break; \ + } +#else +#define dplasma_cublas_op(trans) \ + assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) ); \ + trans = (trans == dplasmaNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; +#endif /* PRECISION_z || PRECISION_c */ + extern parsec_info_id_t CuHI; extern parsec_info_id_t WoSI; @@ -31,6 +77,12 @@ void *dplasma_create_cuda_handles(void *obj, void *user); } \ } while(0) +#if defined(CUBLAS_V2_H_) +/* Support for cusolve requires cublas_v2 */ +#include + +char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status); + #define DPLASMA_CUSOLVER_CHECK_STATUS( STR, STATUS, CODE ) \ do { \ cusolverStatus_t __cusolver_status = (cusolverStatus_t) (STATUS); \ @@ -40,5 +92,6 @@ void *dplasma_create_cuda_handles(void *obj, void *user); CODE; \ } \ } while(0) +#endif /* defined(CUBLAS_V2_H_) */ -#endif /* __DPLAMAAUX_CUDA_H__ */ \ No newline at end of file +#endif /* __DPLAMAAUX_CUDA_H__ */ diff --git a/src/dplasmajdf.h b/src/dplasmajdf.h index 8006c23f..4d2acbae 100644 --- a/src/dplasmajdf.h +++ b/src/dplasmajdf.h @@ -37,9 +37,5 @@ #undef TEMP_TYPE #endif /* PARSEC_HAVE_MPI */ -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - #endif /* _DPLASMAJDF_H_ */ diff --git a/src/dtd_wrappers/dplasma_z_dtd.h b/src/dtd_wrappers/dplasma_z_dtd.h index 0a476aae..97c906da 100644 --- a/src/dtd_wrappers/dplasma_z_dtd.h +++ b/src/dtd_wrappers/dplasma_z_dtd.h @@ -20,7 +20,6 @@ #include "parsec/utils/zone_malloc.h" #include "dplasmaaux.h" #include "potrf_cublas_utils.h" -#include /* probably need to add this to substitions */ #if defined(PRECISION_s) @@ -52,4 +51,4 @@ parsec_task_class_t * parsec_dtd_create_ztrsm_task_class(parsec_taskpool_t * dtd parsec_task_class_t * parsec_dtd_create_zherk_task_class(parsec_taskpool_t * dtd_tp, int tile_full, int devices); parsec_task_class_t * parsec_dtd_create_zgemm_task_class(parsec_taskpool_t * dtd_tp, int tile_full, int devices); -#endif /* __DTD_WRAPPERS_Z_H__ */ \ No newline at end of file +#endif /* __DTD_WRAPPERS_Z_H__ */ diff --git a/src/dtd_wrappers/zgemm.c b/src/dtd_wrappers/zgemm.c index 2496a1e5..80361892 100644 --- a/src/dtd_wrappers/zgemm.c +++ b/src/dtd_wrappers/zgemm.c @@ -6,6 +6,11 @@ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" + +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasma_z_dtd.h" @@ -77,6 +82,8 @@ parsec_core_zgemm_cuda(parsec_device_gpu_module_t* gpu_device, handles = parsec_info_get(&gpu_stream->infos, CuHI); + parsec_cuda_exec_stream_t* cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; + cublasSetStream( handles->cublas_handle, cuda_stream->cuda_stream ); status = cublasZgemm(handles->cublas_handle, transA, transB, n, m, k, &alphag, (cuDoubleComplex*)Ag, lda, diff --git a/src/dtd_wrappers/zherk.c b/src/dtd_wrappers/zherk.c index fb848b68..f0aed388 100644 --- a/src/dtd_wrappers/zherk.c +++ b/src/dtd_wrappers/zherk.c @@ -6,6 +6,11 @@ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" + +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasma_z_dtd.h" @@ -66,6 +71,8 @@ parsec_core_zherk_cuda(parsec_device_gpu_module_t* gpu_device, } #endif /* defined(PARSEC_DEBUG_NOISIER) */ + parsec_cuda_exec_stream_t* cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; + cublasSetStream( handles->cublas_handle, cuda_stream->cuda_stream ); status = cublasZherk(handles->cublas_handle, uplo, trans, m, n, &alpha, (cuDoubleComplex*)Ag, lda, @@ -105,4 +112,4 @@ parsec_dtd_create_zherk_task_class(parsec_taskpool_t* dtd_tp, int tile_full, int parsec_dtd_task_class_add_chore(dtd_tp, zherk_tc, PARSEC_DEV_CPU, parsec_core_zherk); return zherk_tc; -} \ No newline at end of file +} diff --git a/src/dtd_wrappers/zpotrf.c b/src/dtd_wrappers/zpotrf.c index 1fe09c36..0b979be4 100644 --- a/src/dtd_wrappers/zpotrf.c +++ b/src/dtd_wrappers/zpotrf.c @@ -6,6 +6,11 @@ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" + +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasma_z_dtd.h" diff --git a/src/dtd_wrappers/ztrsm.c b/src/dtd_wrappers/ztrsm.c index 6a415a87..612ec99e 100644 --- a/src/dtd_wrappers/ztrsm.c +++ b/src/dtd_wrappers/ztrsm.c @@ -6,6 +6,11 @@ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" + +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasma_z_dtd.h" @@ -74,6 +79,8 @@ parsec_core_ztrsm_cuda(parsec_device_gpu_module_t* gpu_device, } #endif /* defined(PARSEC_DEBUG_NOISIER) */ + parsec_cuda_exec_stream_t* cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; + cublasSetStream( handles->cublas_handle, cuda_stream->cuda_stream ); status = cublasZtrsm(handles->cublas_handle, side, uplo, trans, diag, m, n, &alphag, diff --git a/src/include/dplasma/constants.h b/src/include/dplasma/constants.h index 79da941b..3d701a59 100644 --- a/src/include/dplasma/constants.h +++ b/src/include/dplasma/constants.h @@ -209,45 +209,4 @@ enum dplasma_matrix_type_e { extern char *dplasma_lapack_const_strings[]; #define dplasma_lapack_const(plasma_const) (dplasma_lapack_const_strings[plasma_const][0]) -#if defined(DPLASMA_HAVE_CUDA) -#include - -#define dplasma_cublas_side(side) \ - assert( (side == dplasmaRight) || (side == dplasmaLeft) ); \ - side = (side == dplasmaRight) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; - - -#define dplasma_cublas_diag(diag) \ - assert( (diag == dplasmaNonUnit) || (diag == dplasmaUnit) ); \ - diag = (diag == dplasmaNonUnit) ? CUBLAS_DIAG_NON_UNIT : CUBLAS_DIAG_UNIT; - -#define dplasma_cublas_fill(fill) \ - assert( (fill == dplasmaLower) || (fill == dplasmaUpper) ); \ - fill = (fill == dplasmaLower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; - -#if defined(PRECISION_z) || defined(PRECISION_c) -#define dplasma_cublas_op(trans) \ - assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) || (trans == dplasmaConjTrans) ); \ - switch(trans){ \ - case dplasmaNoTrans: \ - trans = CUBLAS_OP_N; \ - break; \ - case dplasmaTrans: \ - trans = CUBLAS_OP_T; \ - break; \ - case dplasmaConjTrans: \ - trans = CUBLAS_OP_C; \ - break; \ - default: \ - trans = CUBLAS_OP_N; \ - break; \ - } -#else -#define dplasma_cublas_op(trans) \ - assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) ); \ - trans = (trans == dplasmaNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; -#endif /* PRECISION_z || PRECISION_c */ - -#endif /* DPLASMA_HAVE_CUDA */ - #endif /* _DPLASMA_CONSTANTS_H_ */ diff --git a/src/include/dplasma/types_lapack.h b/src/include/dplasma/types_lapack.h index 9c57b846..c5022c60 100644 --- a/src/include/dplasma/types_lapack.h +++ b/src/include/dplasma/types_lapack.h @@ -13,7 +13,7 @@ /* Support for TILED/LAPACK matrix with non homogeneous datatypes across tiles. * NOTE: we are operating with the following condition: - * For a given datacollection, if we reuse the datatype for one shape on different + * For a given data collection, if we reuse the datatype for one shape on different * locations, then other shapes will also be reusing a datatype for those locations. * The same happens between layouts. * diff --git a/src/potrf_cublas_utils.h b/src/potrf_cublas_utils.h index e3be832d..8ff02a9b 100644 --- a/src/potrf_cublas_utils.h +++ b/src/potrf_cublas_utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 The University of Tennessee and The University + * Copyright (c) 2020-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -8,8 +8,6 @@ #define DPLASMA_POTRF_CUBLAS_UTILS_H #if defined(DPLASMA_HAVE_CUDA) -#include -#include typedef struct { char *tmpmem; @@ -20,28 +18,6 @@ typedef struct { void* host_buffer; } dplasma_potrf_workspace_t; -typedef cusolverStatus_t (*cublas_spotrf_v2_t) ( - cusolverDnHandle_t handle, cublasFillMode_t uplo, - int n, float *A, int lda, - float *Workspace, int Lwork, int *devInfo ); - -typedef cusolverStatus_t (*cublas_dpotrf_v2_t) ( - cusolverDnHandle_t handle, cublasFillMode_t uplo, - int n, double *A, int lda, - double *Workspace, int Lwork, int *devInfo ); - -typedef cusolverStatus_t (*cublas_cpotrf_v2_t) ( - cusolverDnHandle_t handle, cublasFillMode_t uplo, - int n, cuComplex *A, int lda, - cuComplex *Workspace, int Lwork, int *devInfo ); - -typedef cusolverStatus_t (*cublas_zpotrf_v2_t) ( - cusolverDnHandle_t handle, cublasFillMode_t uplo, - int n, cuDoubleComplex *A, int lda, - cuDoubleComplex *Workspace, int Lwork, int *devInfo ); - -char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status); - #endif #endif //DPLASMA_POTRF_CUBLAS_UTILS_H diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index 58f47e33..2749324e 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -9,6 +9,12 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#include "potrf_cublas_utils.h" +#endif /* defined(DPLASMA_HAVE_CUDA) */ + #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -16,11 +22,6 @@ extern "C" %{ #include "parsec/recursive.h" static void zpotrf_L_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_callback_t* data); -#if defined(DPLASMA_HAVE_CUDA) -#include -#include "potrf_cublas_utils.h" -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define DEFAULT 0 @@ -163,7 +164,7 @@ BODY [type=CUDA status = cusolverDnZpotrf( handles->cusolverDn_handle, cublas_uplo, tempkm, T, ldak, workspace, wp->lwork, d_iinfo); PARSEC_CUDA_CHECK_ERROR( "cublasZpotrf_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -228,13 +229,14 @@ BODY [type=CUDA] cublasStatus_t status; handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); status = cublasZtrsm_v2(handles->cublas_handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, tempmm, descA->nb, &zone, T, ldak, C, ldam); PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -335,13 +337,14 @@ BODY [type=CUDA] handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); status = cublasZherk_v2( handles->cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, tempmm, descA->mb, &mzone, A, ldam_A, &zone, T, ldam_T); PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -446,8 +449,9 @@ BODY [type=CUDA int ldan_B = LDA(ddescA, B); int ldam_C = LDA(ddescA, C); - dplasma_cuda_handles_t *handles; cublasStatus_t status; + dplasma_cuda_handles_t *handles; + assert( ldam_A <= descA->mb ); assert( ldan_B <= descA->mb ); assert( ldam_C <= descA->mb ); @@ -455,6 +459,7 @@ BODY [type=CUDA handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_N, CUBLAS_OP_C, tempmm, descA->mb, descA->mb, @@ -462,7 +467,7 @@ BODY [type=CUDA (cuDoubleComplex*)B, ldan_B, &zone, (cuDoubleComplex*)C, ldam_C ); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index d64194e7..e9e1afc3 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -8,6 +8,12 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#include "potrf_cublas_utils.h" +#endif /* defined(DPLASMA_HAVE_CUDA) */ + #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -15,11 +21,6 @@ extern "C" %{ #include "parsec/recursive.h" static void zpotrf_U_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_callback_t* data); -#if defined(DPLASMA_HAVE_CUDA) -#include -#include "potrf_cublas_utils.h" -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define DEFAULT 0 @@ -162,7 +163,7 @@ BODY [type=CUDA status = cusolverDnZpotrf( handles->cusolverDn_handle, cublas_uplo, tempkn, T, ldak, workspace, wp->lwork, d_iinfo); PARSEC_CUDA_CHECK_ERROR( "cublasZpotrf_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -230,13 +231,14 @@ BODY [type=CUDA] cublasStatus_t status; handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); status = cublasZtrsm_v2(handles->cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, descA->mb, tempnn, &zone, T, ldak_T, C, ldak_C); PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -337,6 +339,7 @@ BODY [type=CUDA] handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); assert(NULL != handles); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); status = cublasZherk_v2( handles->cublas_handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_C, tempnn, descA->mb, @@ -344,7 +347,7 @@ BODY [type=CUDA] &zone, T, ldan); PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -460,7 +463,7 @@ BODY [type=CUDA handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); assert(NULL != handles); - cublasSetKernelStream( parsec_body.stream ); + cublasSetStream( handles->cublas_handle, parsec_body.stream ); status = cublasZgemm_v2( handles->cublas_handle, CUBLAS_OP_C, CUBLAS_OP_N, descA->mb, tempnn, descA->nb, @@ -468,7 +471,7 @@ BODY [type=CUDA (cuDoubleComplex*)B, ldak_B, &zone, (cuDoubleComplex*)C, ldam_C); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); printlogcuda("CUDA_zgemm( %d, %d, %d )\n\t( %s, %s, %d, %d, %d, %f, A(%d,%d)[%p], %d, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n", m, n, k, diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c index 71da5948..8699ee0d 100644 --- a/src/zpotrf_wrapper.c +++ b/src/zpotrf_wrapper.c @@ -12,9 +12,12 @@ #include "dplasma.h" #include "dplasma/types.h" #include "dplasma/types_lapack.h" -#include "dplasmaaux.h" +#if defined(DPLASMA_HAVE_CUDA) +#include #include "potrf_cublas_utils.h" #include "parsec/utils/zone_malloc.h" +#endif /* defined(DPLASMA_HAVE_CUDA) */ +#include "dplasmaaux.h" #include "zpotrf_U.h" #include "zpotrf_L.h" diff --git a/tests/common.c b/tests/common.c index c4212198..78a3d865 100644 --- a/tests/common.c +++ b/tests/common.c @@ -27,8 +27,8 @@ #include #endif #if defined(DPLASMA_HAVE_CUDA) +#include #include "dplasmaaux.h" -#include #include #endif @@ -732,7 +732,6 @@ void cleanup_parsec(parsec_context_t* parsec, int *iparam) #if defined(DPLASMA_HAVE_CUDA) parsec_info_id_t CuHI = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); parsec_info_unregister(&parsec_per_stream_infos, CuHI, NULL); - cublasShutdown(); #endif parsec_fini(&parsec); diff --git a/tools/gemmpeak/cu-gemmpeak.cpp b/tools/gemmpeak/cu-gemmpeak.cpp index 0aacc069..7f91d5e1 100644 --- a/tools/gemmpeak/cu-gemmpeak.cpp +++ b/tools/gemmpeak/cu-gemmpeak.cpp @@ -1,8 +1,8 @@ #include #include -#include "cuda.h" -#include "cuda_runtime_api.h" -#include "cublas.h" +#include +#include +#include inline float mmax( float a, float b ) { return a > b ? a : b; }