Skip to content

Commit

Permalink
Merge pull request #95 from bosilca/fix/cublas_v2_conflict
Browse files Browse the repository at this point in the history
Allow a mix of cublas v1 and v2 into DPLASMA.
  • Loading branch information
bosilca authored Aug 2, 2023
2 parents 45831f1 + cf33780 commit 9f7bffd
Show file tree
Hide file tree
Showing 16 changed files with 125 additions and 105 deletions.
9 changes: 5 additions & 4 deletions src/dplasmaaux_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
* $COPYRIGHT
*
*/
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "potrf_cublas_utils.h"
#include "dplasma/config.h"
#include "parsec/utils/zone_malloc.h"
#include "parsec/utils/show_help.h"
#include <cublas_v2.h>
#include "dplasmaaux_cuda.h"
#include "potrf_cublas_utils.h"

/*
* Global info ID's for cublas handles and workspaces
Expand Down Expand Up @@ -95,4 +96,4 @@ void *dplasma_create_cuda_handles(void *obj, void *_n)
new->cusolverDn_handle = cusolver_handle;

return new;
}
}
55 changes: 54 additions & 1 deletion src/dplasmaaux_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,52 @@

#include "parsec/mca/device/cuda/device_cuda.h"

/**
* DPLASMA currently supports a mix of cublas v1 and v2, but not in the same source file. Thus,
* the simplest way to provide common headers is to require the developer to manually specify
* when cublas_v2 is needed by including the header before dplasmaaux.h. Otherwise, we will include
* cublas.h (v1) automatically if CUDA is enabled.
*/
#if !defined(CUBLAS_V2_H_)
#include <cublas.h>
#endif /* !defined(CUBLAS_V2_H_) */

#define dplasma_cublas_side(side) \
assert( (side == dplasmaRight) || (side == dplasmaLeft) ); \
side = (side == dplasmaRight) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;


#define dplasma_cublas_diag(diag) \
assert( (diag == dplasmaNonUnit) || (diag == dplasmaUnit) ); \
diag = (diag == dplasmaNonUnit) ? CUBLAS_DIAG_NON_UNIT : CUBLAS_DIAG_UNIT;

#define dplasma_cublas_fill(fill) \
assert( (fill == dplasmaLower) || (fill == dplasmaUpper) ); \
fill = (fill == dplasmaLower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;

#if defined(PRECISION_z) || defined(PRECISION_c)
#define dplasma_cublas_op(trans) \
assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) || (trans == dplasmaConjTrans) ); \
switch(trans){ \
case dplasmaNoTrans: \
trans = CUBLAS_OP_N; \
break; \
case dplasmaTrans: \
trans = CUBLAS_OP_T; \
break; \
case dplasmaConjTrans: \
trans = CUBLAS_OP_C; \
break; \
default: \
trans = CUBLAS_OP_N; \
break; \
}
#else
#define dplasma_cublas_op(trans) \
assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) ); \
trans = (trans == dplasmaNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
#endif /* PRECISION_z || PRECISION_c */

extern parsec_info_id_t CuHI;
extern parsec_info_id_t WoSI;

Expand All @@ -31,6 +77,12 @@ void *dplasma_create_cuda_handles(void *obj, void *user);
} \
} while(0)

#if defined(CUBLAS_V2_H_)
/* Support for cusolve requires cublas_v2 */
#include <cusolverDn.h>

char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status);

#define DPLASMA_CUSOLVER_CHECK_STATUS( STR, STATUS, CODE ) \
do { \
cusolverStatus_t __cusolver_status = (cusolverStatus_t) (STATUS); \
Expand All @@ -40,5 +92,6 @@ void *dplasma_create_cuda_handles(void *obj, void *user);
CODE; \
} \
} while(0)
#endif /* defined(CUBLAS_V2_H_) */

#endif /* __DPLAMAAUX_CUDA_H__ */
#endif /* __DPLAMAAUX_CUDA_H__ */
4 changes: 0 additions & 4 deletions src/dplasmajdf.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,5 @@
#undef TEMP_TYPE
#endif /* PARSEC_HAVE_MPI */

#if defined(DPLASMA_HAVE_CUDA)
#include <cublas.h>
#endif /* defined(DPLASMA_HAVE_CUDA) */

#endif /* _DPLASMAJDF_H_ */

3 changes: 1 addition & 2 deletions src/dtd_wrappers/dplasma_z_dtd.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
#include "parsec/utils/zone_malloc.h"
#include "dplasmaaux.h"
#include "potrf_cublas_utils.h"
#include <cublas_v2.h>

/* probably need to add this to substitions */
#if defined(PRECISION_s)
Expand Down Expand Up @@ -52,4 +51,4 @@ parsec_task_class_t * parsec_dtd_create_ztrsm_task_class(parsec_taskpool_t * dtd
parsec_task_class_t * parsec_dtd_create_zherk_task_class(parsec_taskpool_t * dtd_tp, int tile_full, int devices);
parsec_task_class_t * parsec_dtd_create_zgemm_task_class(parsec_taskpool_t * dtd_tp, int tile_full, int devices);

#endif /* __DTD_WRAPPERS_Z_H__ */
#endif /* __DTD_WRAPPERS_Z_H__ */
7 changes: 7 additions & 0 deletions src/dtd_wrappers/zgemm.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
* @precisions normal z -> s d c
*
*/
#include "dplasma/config.h"

#if defined(DPLASMA_HAVE_CUDA)
#include <cublas_v2.h>
#endif /* defined(DPLASMA_HAVE_CUDA) */

#include "dplasma_z_dtd.h"

Expand Down Expand Up @@ -77,6 +82,8 @@ parsec_core_zgemm_cuda(parsec_device_gpu_module_t* gpu_device,

handles = parsec_info_get(&gpu_stream->infos, CuHI);

parsec_cuda_exec_stream_t* cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream;
cublasSetStream( handles->cublas_handle, cuda_stream->cuda_stream );
status = cublasZgemm(handles->cublas_handle, transA, transB,
n, m, k,
&alphag, (cuDoubleComplex*)Ag, lda,
Expand Down
9 changes: 8 additions & 1 deletion src/dtd_wrappers/zherk.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
* @precisions normal z -> s d c
*
*/
#include "dplasma/config.h"

#if defined(DPLASMA_HAVE_CUDA)
#include <cublas_v2.h>
#endif /* defined(DPLASMA_HAVE_CUDA) */

#include "dplasma_z_dtd.h"

Expand Down Expand Up @@ -66,6 +71,8 @@ parsec_core_zherk_cuda(parsec_device_gpu_module_t* gpu_device,
}
#endif /* defined(PARSEC_DEBUG_NOISIER) */

parsec_cuda_exec_stream_t* cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream;
cublasSetStream( handles->cublas_handle, cuda_stream->cuda_stream );
status = cublasZherk(handles->cublas_handle, uplo, trans,
m, n,
&alpha, (cuDoubleComplex*)Ag, lda,
Expand Down Expand Up @@ -105,4 +112,4 @@ parsec_dtd_create_zherk_task_class(parsec_taskpool_t* dtd_tp, int tile_full, int
parsec_dtd_task_class_add_chore(dtd_tp, zherk_tc, PARSEC_DEV_CPU, parsec_core_zherk);

return zherk_tc;
}
}
5 changes: 5 additions & 0 deletions src/dtd_wrappers/zpotrf.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
* @precisions normal z -> s d c
*
*/
#include "dplasma/config.h"

#if defined(DPLASMA_HAVE_CUDA)
#include <cublas_v2.h>
#endif /* defined(DPLASMA_HAVE_CUDA) */

#include "dplasma_z_dtd.h"

Expand Down
7 changes: 7 additions & 0 deletions src/dtd_wrappers/ztrsm.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
* @precisions normal z -> s d c
*
*/
#include "dplasma/config.h"

#if defined(DPLASMA_HAVE_CUDA)
#include <cublas_v2.h>
#endif /* defined(DPLASMA_HAVE_CUDA) */

#include "dplasma_z_dtd.h"

Expand Down Expand Up @@ -74,6 +79,8 @@ parsec_core_ztrsm_cuda(parsec_device_gpu_module_t* gpu_device,
}
#endif /* defined(PARSEC_DEBUG_NOISIER) */

parsec_cuda_exec_stream_t* cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream;
cublasSetStream( handles->cublas_handle, cuda_stream->cuda_stream );
status = cublasZtrsm(handles->cublas_handle,
side, uplo, trans, diag,
m, n, &alphag,
Expand Down
41 changes: 0 additions & 41 deletions src/include/dplasma/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,45 +209,4 @@ enum dplasma_matrix_type_e {
extern char *dplasma_lapack_const_strings[];
#define dplasma_lapack_const(plasma_const) (dplasma_lapack_const_strings[plasma_const][0])

#if defined(DPLASMA_HAVE_CUDA)
#include <cublas.h>

#define dplasma_cublas_side(side) \
assert( (side == dplasmaRight) || (side == dplasmaLeft) ); \
side = (side == dplasmaRight) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;


#define dplasma_cublas_diag(diag) \
assert( (diag == dplasmaNonUnit) || (diag == dplasmaUnit) ); \
diag = (diag == dplasmaNonUnit) ? CUBLAS_DIAG_NON_UNIT : CUBLAS_DIAG_UNIT;

#define dplasma_cublas_fill(fill) \
assert( (fill == dplasmaLower) || (fill == dplasmaUpper) ); \
fill = (fill == dplasmaLower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;

#if defined(PRECISION_z) || defined(PRECISION_c)
#define dplasma_cublas_op(trans) \
assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) || (trans == dplasmaConjTrans) ); \
switch(trans){ \
case dplasmaNoTrans: \
trans = CUBLAS_OP_N; \
break; \
case dplasmaTrans: \
trans = CUBLAS_OP_T; \
break; \
case dplasmaConjTrans: \
trans = CUBLAS_OP_C; \
break; \
default: \
trans = CUBLAS_OP_N; \
break; \
}
#else
#define dplasma_cublas_op(trans) \
assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) ); \
trans = (trans == dplasmaNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
#endif /* PRECISION_z || PRECISION_c */

#endif /* DPLASMA_HAVE_CUDA */

#endif /* _DPLASMA_CONSTANTS_H_ */
2 changes: 1 addition & 1 deletion src/include/dplasma/types_lapack.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

/* Support for TILED/LAPACK matrix with non homogeneous datatypes across tiles.
* NOTE: we are operating with the following condition:
* For a given datacollection, if we reuse the datatype for one shape on different
* For a given data collection, if we reuse the datatype for one shape on different
* locations, then other shapes will also be reusing a datatype for those locations.
* The same happens between layouts.
*
Expand Down
26 changes: 1 addition & 25 deletions src/potrf_cublas_utils.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021 The University of Tennessee and The University
* Copyright (c) 2020-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
Expand All @@ -8,8 +8,6 @@
#define DPLASMA_POTRF_CUBLAS_UTILS_H

#if defined(DPLASMA_HAVE_CUDA)
#include <cublas.h>
#include <cusolverDn.h>

typedef struct {
char *tmpmem;
Expand All @@ -20,28 +18,6 @@ typedef struct {
void* host_buffer;
} dplasma_potrf_workspace_t;

typedef cusolverStatus_t (*cublas_spotrf_v2_t) (
cusolverDnHandle_t handle, cublasFillMode_t uplo,
int n, float *A, int lda,
float *Workspace, int Lwork, int *devInfo );

typedef cusolverStatus_t (*cublas_dpotrf_v2_t) (
cusolverDnHandle_t handle, cublasFillMode_t uplo,
int n, double *A, int lda,
double *Workspace, int Lwork, int *devInfo );

typedef cusolverStatus_t (*cublas_cpotrf_v2_t) (
cusolverDnHandle_t handle, cublasFillMode_t uplo,
int n, cuComplex *A, int lda,
cuComplex *Workspace, int Lwork, int *devInfo );

typedef cusolverStatus_t (*cublas_zpotrf_v2_t) (
cusolverDnHandle_t handle, cublasFillMode_t uplo,
int n, cuDoubleComplex *A, int lda,
cuDoubleComplex *Workspace, int Lwork, int *devInfo );

char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status);

#endif

#endif //DPLASMA_POTRF_CUBLAS_UTILS_H
25 changes: 15 additions & 10 deletions src/zpotrf_L.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,19 @@ extern "C" %{
* @precisions normal z -> s d c
*
*/
#include "dplasma/config.h"
#if defined(DPLASMA_HAVE_CUDA)
#include <cublas_v2.h>
#include "potrf_cublas_utils.h"
#endif /* defined(DPLASMA_HAVE_CUDA) */

#include "dplasmajdf.h"
#include "parsec/data_dist/matrix/matrix.h"

#include "parsec/data_dist/matrix/subtile.h"
#include "parsec/recursive.h"
static void zpotrf_L_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_callback_t* data);

#if defined(DPLASMA_HAVE_CUDA)
#include <cublas_v2.h>
#include "potrf_cublas_utils.h"
#endif /* defined(DPLASMA_HAVE_CUDA) */

/* Define the different shapes this JDF is using */
#define DEFAULT 0

Expand Down Expand Up @@ -163,7 +164,7 @@ BODY [type=CUDA

status = cusolverDnZpotrf( handles->cusolverDn_handle, cublas_uplo, tempkm, T, ldak, workspace, wp->lwork, d_iinfo);
PARSEC_CUDA_CHECK_ERROR( "cublasZpotrf_v2 ", status,
{return -1;} );
{return PARSEC_HOOK_RETURN_ERROR;} );
}
END

Expand Down Expand Up @@ -228,13 +229,14 @@ BODY [type=CUDA]
cublasStatus_t status;
handles = parsec_info_get(&gpu_stream->infos, CuHandlesID);
assert(NULL != handles);
cublasSetStream( handles->cublas_handle, parsec_body.stream );
status = cublasZtrsm_v2(handles->cublas_handle,
CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER,
CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT,
tempmm, descA->nb,
&zone, T, ldak, C, ldam);
PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status,
{return -1;} );
{return PARSEC_HOOK_RETURN_ERROR;} );
}
END

Expand Down Expand Up @@ -335,13 +337,14 @@ BODY [type=CUDA]

handles = parsec_info_get(&gpu_stream->infos, CuHandlesID);
assert(NULL != handles);
cublasSetStream( handles->cublas_handle, parsec_body.stream );
status = cublasZherk_v2( handles->cublas_handle,
CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
tempmm, descA->mb,
&mzone, A, ldam_A,
&zone, T, ldam_T);
PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status,
{return -1;} );
{return PARSEC_HOOK_RETURN_ERROR;} );
}
END

Expand Down Expand Up @@ -446,23 +449,25 @@ BODY [type=CUDA
int ldan_B = LDA(ddescA, B);
int ldam_C = LDA(ddescA, C);

dplasma_cuda_handles_t *handles;
cublasStatus_t status;
dplasma_cuda_handles_t *handles;

assert( ldam_A <= descA->mb );
assert( ldan_B <= descA->mb );
assert( ldam_C <= descA->mb );

handles = parsec_info_get(&gpu_stream->infos, CuHandlesID);
assert(NULL != handles);

cublasSetStream( handles->cublas_handle, parsec_body.stream );
status = cublasZgemm_v2( handles->cublas_handle,
CUBLAS_OP_N, CUBLAS_OP_C,
tempmm, descA->mb, descA->mb,
&mzone, (cuDoubleComplex*)A, ldam_A,
(cuDoubleComplex*)B, ldan_B,
&zone, (cuDoubleComplex*)C, ldam_C );
PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status,
{return -1;} );
{return PARSEC_HOOK_RETURN_ERROR;} );
}
END

Expand Down
Loading

0 comments on commit 9f7bffd

Please sign in to comment.