Commit
zpotrf_dtd working for the GPU, although results still suspicious
code refactoring: removed useless includes, removed redundancy in the LAPACK-to-cuBLAS conversions

reverted previous refactoring, added GPU support for gemm_dtd

added DTD sources and switched to cusolverDnXpotrf

refactoring: cuSolver workspaces are now allocated per stream

fixed typo and changed comment

changed the task-class creators to take a devices argument
Brieuc Nicolas authored and 444nuits committed Aug 1, 2023
1 parent a0ea91c commit ffb656d
Showing 16 changed files with 1,102 additions and 550 deletions.
4 changes: 4 additions & 0 deletions src/CMakeLists.txt
@@ -27,6 +27,9 @@ set(EXTRA_SOURCES
if( NOT DPLASMA_HAVE_COMPLEX_H )
list(APPEND EXTRA_SOURCES complex.c)
endif()
if( DPLASMA_HAVE_CUDA )
list(APPEND EXTRA_SOURCES dplasmaaux_cuda.c)
endif()

### Generate .c files from .jdf for all required precisions
set(JDF
@@ -213,6 +216,7 @@ target_ptg_sources(dplasma PRIVATE
${generated_jdf})

Add_Subdirectory(cores)
Add_Subdirectory(dtd_wrappers)

target_include_directories(dplasma
INTERFACE
85 changes: 0 additions & 85 deletions src/dplasmaaux.c
@@ -109,88 +109,3 @@ dplasma_aux_getGEMMLookahead( parsec_tiled_matrix_t *A )
return dplasma_imax( ceil( alpha ), 2 );
}
}

#if defined(DPLASMA_HAVE_CUDA)
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "potrf_cublas_utils.h"
#include "parsec/utils/zone_malloc.h"

/* Unfortunately, CUBLAS does not provide an error-to-string function */
static char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status)
{
switch(cublas_status)
{
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
default: return "unknown CUBLAS error";
}
}

/* Unfortunately, cuSolver does not provide an error-to-string function */
static char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status)
{
switch(cusolver_status) {
case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
default: return "unknown cusolver error";
}
}

void *dplasma_create_cuda_handles(void *obj, void *_n)
{
parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)obj;
dplasma_cuda_handles_t *new;
cublasHandle_t cublas_handle;
cublasStatus_t cublas_status;

(void)_n;

/* No need to call cudaSetDevice, as this has been done by PaRSEC before calling the task body */
cublas_status = cublasCreate(&cublas_handle);
if(CUBLAS_STATUS_SUCCESS != cublas_status) {
if( CUBLAS_STATUS_ALLOC_FAILED == cublas_status ) {
parsec_show_help("help-dplasma.txt", "cu*_alloc_failed", 1, "CUBLAS");
}
parsec_fatal("Unable to create CUBLAS Handle: %s",
dplasma_cublas_error_to_string(cublas_status));
return NULL;
}
cublas_status = cublasSetStream(cublas_handle, cuda_stream->cuda_stream);
assert(CUBLAS_STATUS_SUCCESS == cublas_status);

cusolverDnHandle_t cusolver_handle;
cusolverStatus_t cusolver_status;
cusolver_status = cusolverDnCreate(&cusolver_handle);
if(CUSOLVER_STATUS_SUCCESS != cusolver_status) {
cublasDestroy(cublas_handle);
if( CUSOLVER_STATUS_ALLOC_FAILED == cusolver_status ) {
parsec_show_help("help-dplasma.txt", "cu*_alloc_failed", 1, "cusolver");
}
parsec_fatal("Unable to create a cuSolver handle: %s",
dplasma_cusolver_error_to_string(cusolver_status));
return NULL;
}
cusolver_status = cusolverDnSetStream(cusolver_handle, cuda_stream->cuda_stream);
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

new = malloc(sizeof(dplasma_cuda_handles_t));
new->cublas_handle = cublas_handle;
new->cusolverDn_handle = cusolver_handle;

return new;
}

#endif
9 changes: 1 addition & 8 deletions src/dplasmaaux.h
@@ -110,14 +110,7 @@ extern void *dplasma_pcomm;
#endif /* defined(DPLASMA_DEBUG) */

#if defined(DPLASMA_HAVE_CUDA)
#include <cublas.h>
#include "parsec/mca/device/cuda/device_cuda.h"
typedef struct {
cublasHandle_t cublas_handle;
void * cusolverDn_handle;
} dplasma_cuda_handles_t;
void *dplasma_create_cuda_handles(void *obj, void *user);

#include "dplasmaaux_cuda.h"
#endif

#endif /* _DPLASMAAUX_H_INCLUDED */
98 changes: 98 additions & 0 deletions src/dplasmaaux_cuda.c
@@ -0,0 +1,98 @@
/*
* Copyright (c) 2023- The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT
*
*/
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "potrf_cublas_utils.h"
#include "parsec/utils/zone_malloc.h"
#include "dplasmaaux_cuda.h"

/*
 * Global info IDs for the cuBLAS handles and workspaces.
 * They should be initialized in the tests with the return
 * value of parsec_info_register or parsec_info_lookup.
 */
parsec_info_id_t CuHI = -1;
parsec_info_id_t WoSI = -1;

/* Unfortunately, CUBLAS does not provide an error-to-string function */
static char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status)
{
switch(cublas_status)
{
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
default: return "unknown CUBLAS error";
}
}

/* Unfortunately, cuSolver does not provide an error-to-string function */
char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status)
{
switch(cusolver_status) {
case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
default: return "unknown cusolver error";
}
}

void *dplasma_create_cuda_handles(void *obj, void *_n)
{
parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)obj;
dplasma_cuda_handles_t *new;
cublasHandle_t cublas_handle;
cublasStatus_t cublas_status;

(void)_n;

/* No need to call cudaSetDevice, as this has been done by PaRSEC before calling the task body */
cublas_status = cublasCreate(&cublas_handle);
if(CUBLAS_STATUS_SUCCESS != cublas_status) {
if( CUBLAS_STATUS_ALLOC_FAILED == cublas_status ) {
parsec_show_help("help-dplasma.txt", "cu*_alloc_failed", 1, "CUBLAS");
}
parsec_fatal("Unable to create CUBLAS Handle: %s",
dplasma_cublas_error_to_string(cublas_status));
return NULL;
}
cublas_status = cublasSetStream(cublas_handle, cuda_stream->cuda_stream);
assert(CUBLAS_STATUS_SUCCESS == cublas_status);

cusolverDnHandle_t cusolver_handle;
cusolverStatus_t cusolver_status;
cusolver_status = cusolverDnCreate(&cusolver_handle);
if(CUSOLVER_STATUS_SUCCESS != cusolver_status) {
cublasDestroy(cublas_handle);
if( CUSOLVER_STATUS_ALLOC_FAILED == cusolver_status ) {
parsec_show_help("help-dplasma.txt", "cu*_alloc_failed", 1, "cusolver");
}
parsec_fatal("Unable to create a cuSolver handle: %s",
dplasma_cusolver_error_to_string(cusolver_status));
return NULL;
}
cusolver_status = cusolverDnSetStream(cusolver_handle, cuda_stream->cuda_stream);
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

new = malloc(sizeof(dplasma_cuda_handles_t));
new->cublas_handle = cublas_handle;
new->cusolverDn_handle = cusolver_handle;

return new;
}
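Note on wiring: CuHI and WoSI are only declared in this file, so the test or taskpool setup must register them before any GPU task runs. Below is a minimal sketch of that setup, assuming parsec_info_register has the (info, name, destructor, destructor_data, constructor, constructor_data, cb_data) signature and that parsec_per_stream_infos is PaRSEC's per-stream info object; destroy_cuda_handles is a hypothetical teardown not shown in this commit.

```c
#include <assert.h>
#include <stdlib.h>
#include "dplasmaaux.h"                    /* pulls in dplasmaaux_cuda.h when DPLASMA_HAVE_CUDA is set */
#include "parsec/mca/device/device_gpu.h"  /* assumption: declares parsec_per_stream_infos */

/* Hypothetical destructor paired with dplasma_create_cuda_handles */
static void destroy_cuda_handles(void *_h, void *_n)
{
    dplasma_cuda_handles_t *handles = (dplasma_cuda_handles_t *)_h;
    (void)_n;
    cusolverDnDestroy((cusolverDnHandle_t)handles->cusolverDn_handle);
    cublasDestroy(handles->cublas_handle);
    free(handles);
}

/* Called once from the test setup: PaRSEC then runs the constructor lazily,
 * once per CUDA execution stream, tying one cuBLAS/cuSolver handle pair
 * to each stream. */
static void dplasma_register_cuda_infos(void)
{
    CuHI = parsec_info_register(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES",
                                destroy_cuda_handles, NULL,
                                dplasma_create_cuda_handles, NULL,
                                NULL);
    assert(-1 != CuHI);
}
```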
44 changes: 44 additions & 0 deletions src/dplasmaaux_cuda.h
@@ -0,0 +1,44 @@
/*
* Copyright (c) 2023- The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT
*
*/

#ifndef _DPLASMAAUX_CUDA_H_
#define _DPLASMAAUX_CUDA_H_

/* cublas_v2.h and cusolverDn.h provide cublasHandle_t and cusolverStatus_t,
 * which the typedef and check macros below rely on */
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "parsec/mca/device/cuda/device_cuda.h"

extern parsec_info_id_t CuHI;
extern parsec_info_id_t WoSI;

typedef struct {
cublasHandle_t cublas_handle;
void * cusolverDn_handle;
} dplasma_cuda_handles_t;

void *dplasma_create_cuda_handles(void *obj, void *user);
/* defined in dplasmaaux_cuda.c; declared here for the check macro below */
char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status);

#define DPLASMA_CUBLAS_CHECK_STATUS( STR, STATUS, CODE ) \
do { \
cublasStatus_t __cublas_status = (cublasStatus_t) (STATUS); \
if( CUBLAS_STATUS_SUCCESS != __cublas_status ) { \
parsec_warning( "%s:%d %s%s", __FILE__, __LINE__, \
(STR), cublasGetStatusString(__cublas_status) ); \
CODE; \
} \
} while(0)

#define DPLASMA_CUSOLVER_CHECK_STATUS( STR, STATUS, CODE ) \
do { \
cusolverStatus_t __cusolver_status = (cusolverStatus_t) (STATUS); \
if( CUSOLVER_STATUS_SUCCESS != __cusolver_status ) { \
parsec_warning( "%s:%d %s%s", __FILE__, __LINE__, \
(STR), dplasma_cusolver_error_to_string(__cusolver_status) ); \
CODE; \
} \
} while(0)

#endif /* _DPLASMAAUX_CUDA_H_ */
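For illustration, a short sketch of how the two check macros are meant to be used inside a task body; set_stream_checked is a hypothetical helper, and only the macro and cublasSetStream come from real APIs.

```c
#include "dplasmaaux_cuda.h"

/* Illustrative task-body fragment: forward the CUDA stream to cuBLAS and
 * turn any failure into a file/line warning plus an early return via the
 * CODE argument of the macro. */
static int set_stream_checked(cublasHandle_t handle, cudaStream_t stream)
{
    DPLASMA_CUBLAS_CHECK_STATUS("cublasSetStream ",
                                cublasSetStream(handle, stream),
                                return -1);
    return 0;
}
```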
53 changes: 53 additions & 0 deletions src/dtd_wrappers/CMakeLists.txt
@@ -0,0 +1,53 @@
include(PrecisionGenerator)

set(DTD_HEADERS
dplasma_z_dtd.h
)
set(DTD_SOURCES
zgemm_dtd.c
ztrsm_dtd.c
zherk_dtd.c
zpotrf_dtd.c
)

set(generated_dtd_headers "")
precisions_rules_py(generated_dtd_headers
"${DTD_HEADERS}"
PRECISIONS "${DPLASMA_PRECISIONS}")

set(generated_dtd_cores "")
precisions_rules_py(generated_dtd_cores
"${DTD_SOURCES}"
PRECISIONS "${DPLASMA_PRECISIONS}")

add_custom_target(dplasma_dtd_includes ALL SOURCES
${generated_dtd_headers} )

### Publish the documented files
#add_documented_files(PROJECT DPLASMA DIR ${CMAKE_CURRENT_BINARY_DIR} FILES ${generated_dtd_files} ${generated_dtd_headers})

### Generate the dplasma_dtd object library
# We cannot do a simple target_sources because of a bug in CMake <3.18 where the
# GENERATED property has a directory visibility
if(NOT TARGET dplasma_dtd)
add_library(dplasma_dtd OBJECT ${generated_dtd_cores})
endif(NOT TARGET dplasma_dtd)

add_dependencies(dplasma_dtd dplasma_includes dplasma_cores_includes dplasma_dtd_includes)

set_target_properties(dplasma_dtd PROPERTIES ENABLE_EXPORTS True)
set_target_properties(dplasma_dtd PROPERTIES POSITION_INDEPENDENT_CODE ${BUILD_SHARED_LIBS})
target_include_directories(dplasma_dtd
PRIVATE
${CMAKE_CURRENT_BINARY_DIR}
$<$<NOT:${DPLASMA_BUILD_INPLACE}>:${CMAKE_CURRENT_SOURCE_DIR}>)
target_link_libraries(dplasma_dtd
PUBLIC
dplasma_cores
PaRSEC::parsec
LAPACKE::LAPACKE
CUDA::cublas
)

# Integrate the output into the main library
target_sources(dplasma PRIVATE $<TARGET_OBJECTS:dplasma_dtd>)
55 changes: 55 additions & 0 deletions src/dtd_wrappers/dplasma_z_dtd.h
@@ -0,0 +1,55 @@
/*
* Copyright (c) 2023- The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* @precisions normal z -> s d c
*
*/
#ifndef __DTD_WRAPPERS_Z_H__
#define __DTD_WRAPPERS_Z_H__

#include "dplasma/types.h"
#include "parsec/interfaces/dtd/insert_function.h"
#include "cores/core_blas.h"

#if defined(DPLASMA_HAVE_CUDA)
#include "parsec/execution_stream.h"
#include "parsec/parsec_internal.h"
#include "parsec/mca/device/cuda/device_cuda.h"
#include "parsec/utils/zone_malloc.h"
#include "dplasmaaux.h"
#include "potrf_cublas_utils.h"
#include <cublas_v2.h>

/* probably needs to be added to the precision substitutions */
#if defined(PRECISION_s)
#define CUSOLVER_COMPUTE_TYPE CUDA_R_32F
#elif defined(PRECISION_d)
#define CUSOLVER_COMPUTE_TYPE CUDA_R_64F
#elif defined(PRECISION_c)
#define CUSOLVER_COMPUTE_TYPE CUDA_C_32F
#elif defined(PRECISION_z)
#define CUSOLVER_COMPUTE_TYPE CUDA_C_64F
#endif

typedef struct zpotrf_dtd_workspace_info_s {
int mb;
int nb;
dplasma_enum_t uplo;
} zpotrf_dtd_workspace_info_t;

void* zpotrf_dtd_create_workspace(void *obj, void *user);
void zpotrf_dtd_destroy_workspace(void *_ws, void *_n);

void* zpotrf_dtd_create_params(void *obj, void *user);
void zpotrf_dtd_destroy_params(void *params, void *_n);

#endif /* DPLASMA_HAVE_CUDA */

parsec_task_class_t * parsec_dtd_create_zpotrf_task_class(parsec_taskpool_t * dtd_tp, int tile_full, int devices);
parsec_task_class_t * parsec_dtd_create_ztrsm_task_class(parsec_taskpool_t * dtd_tp, int tile_full, int devices);
parsec_task_class_t * parsec_dtd_create_zherk_task_class(parsec_taskpool_t * dtd_tp, int tile_full, int devices);
parsec_task_class_t * parsec_dtd_create_zgemm_task_class(parsec_taskpool_t * dtd_tp, int tile_full, int devices);

#endif /* __DTD_WRAPPERS_Z_H__ */
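The GPU body itself lives in zpotrf_dtd.c, which this page has not loaded; the sketch below only shows how CUSOLVER_COMPUTE_TYPE plugs into cuSOLVER's 64-bit generic API, assuming the handle, params object, and per-stream workspaces (sized with cusolverDnXpotrf_bufferSize for the same arguments) are already set up as the commit messages describe. All helper names are illustrative.

```c
#include <stddef.h>
#include <stdint.h>
#include <cusolverDn.h>

/* Sketch: factor one n x n tile in place on the GPU. dA, d_ws, and d_info
 * are device pointers; h_ws is a host scratch buffer. */
static cusolverStatus_t zpotrf_tile_sketch(cusolverDnHandle_t handle,
                                           cusolverDnParams_t params,
                                           cublasFillMode_t uplo, int64_t n,
                                           void *dA, int64_t lda,
                                           void *d_ws, size_t d_ws_bytes,
                                           void *h_ws, size_t h_ws_bytes,
                                           int *d_info)
{
    /* CUSOLVER_COMPUTE_TYPE is CUDA_C_64F in the z precision; the precision
     * generator rewrites it for s/d/c, so one body covers all four. */
    return cusolverDnXpotrf(handle, params, uplo, n,
                            CUSOLVER_COMPUTE_TYPE, dA, lda,
                            CUSOLVER_COMPUTE_TYPE,
                            d_ws, d_ws_bytes, h_ws, h_ws_bytes, d_info);
}
```

Caching the bufferSize query result once per stream, rather than querying and allocating inside every task, is what the "cusolver workspaces are allocated per stream" refactoring in this commit appears to provide.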