Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add parsec_advise_data_on_device for zpotrf_L #118

Merged
merged 3 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parsec
Submodule parsec updated 100 files
107 changes: 107 additions & 0 deletions src/dplasmaaux.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#include <string.h>
#include "dplasmaaux.h"
#include "parsec/utils/show_help.h"
#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"

#if defined(PARSEC_HAVE_MPI)
/*
Expand Down Expand Up @@ -110,3 +112,108 @@ dplasma_aux_getGEMMLookahead( parsec_tiled_matrix_t *A )
}
}

#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)

/** Find all GPUs
* Size of dev_index: at least parsec_nb_devices
*/
void dplasma_find_nb_gpus(int *dev_index, int *nb) {
*nb = 0;
for(int i = 0; i < (int)parsec_nb_devices; i++) {
parsec_device_module_t *device = parsec_mca_device_get(i);
if( PARSEC_DEV_CUDA & device->type || PARSEC_DEV_HIP & device->type ) {
dev_index[(*nb)++] = device->device_index;
}
}

#if defined(DPLASMA_DEBUG)
if((*nb) == 0) {
char hostname[256];
gethostname(hostname, 256);
parsec_warning(stderr, "No CUDA device found on rank %d on %s\n",
parsec->my_rank, hostname);
}
#endif
}

/** Get the most suitable process/gpu grid */
int dplasma_grid_calculation( int nb_process ) {
int P;
for( P = (int)(sqrt(nb_process + 1.0)); P > 0; P-- ) {
if( 0 == nb_process % P ) break;
}
return P;
}

/* Operator 2D */
int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es,
const parsec_tiled_matrix_t *A,
void *_A, parsec_matrix_uplo_t uplo,
int m, int n, void *op_args) {
dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)op_args;

if( args->nb_gpu_devices > 0 ) {
/* Nested 2D grid on GPU */
int g = (m / args->grid_rows % args->gpu_rows) * args->gpu_cols + n / args->grid_cols % args->gpu_cols;
parsec_advise_data_on_device(A->super.data_of((parsec_data_collection_t*)A, m, n),
args->gpu_device_index[g],
PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
}

(void)es; (void)uplo;
return 0;
}

/* Set advise data on device
*
* If op_args == NULL, use dplasma_advise_data_on_device_t by default
*/
int dplasma_advise_data_on_device(parsec_context_t *parsec,
parsec_matrix_uplo_t uplo,
parsec_tiled_matrix_t *A,
parsec_tiled_matrix_unary_op_t operation,
void *op_args) {

if(NULL != op_args) {
parsec_apply(parsec, uplo, A, operation, op_args);
} else {
/* Find the number of GPUs */
dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)malloc(sizeof(dplasma_advise_data_on_device_t));
args->gpu_device_index = (int *)malloc(parsec_nb_devices * sizeof(int));
dplasma_find_nb_gpus(args->gpu_device_index, &args->nb_gpu_devices);

/* Calculate the nested grid for the multiple GPUs on one process
* gpu_rows >= gpu_cols and as square as possible */
bosilca marked this conversation as resolved.
Show resolved Hide resolved
if(dplasmaUpper == uplo) {
args->gpu_rows = dplasma_grid_calculation(args->nb_gpu_devices);
args->gpu_cols = args->nb_gpu_devices/args->gpu_rows;
} else {
args->gpu_cols = dplasma_grid_calculation(args->nb_gpu_devices);
args->gpu_rows = args->nb_gpu_devices/args->gpu_cols;
}

if(dplasmaUpper == uplo || dplasmaLower == uplo) {
args->grid_rows = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.rows;
args->grid_cols = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.cols;
} else if(dplasmaUpperLower == uplo) {
args->grid_rows = ((parsec_matrix_block_cyclic_t *)A)->grid.rows;
args->grid_cols = ((parsec_matrix_block_cyclic_t *)A)->grid.cols;
} else {
dplasma_error("dplasma_advise_data_on_device", "illegal value of uplo");
}

#if defined(DPLASMA_DEBUG)
parsec_warning("nb_gpu_devices %d gpu_rows %d gpu_cols %d grid_rows %d grid_cols %d\n",
args->nb_gpu_devices, args->gpu_rows, args->gpu_cols, args->grid_rows, args->grid_cols);
#endif

parsec_apply(parsec, uplo, A, operation, (void *)args);

free(args->gpu_device_index);
free(args);
}

return 0;
}

#endif
37 changes: 37 additions & 0 deletions src/dplasmaaux.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,4 +115,41 @@ extern void *dplasma_pcomm;
#if defined(DPLASMA_HAVE_HIP)
#include "dplasmaaux_hip.h"
#endif

#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
/* Advise data on device arguments */
typedef struct dplasma_advise_data_on_device_s {
int nb_gpu_devices;
int *gpu_device_index;
int gpu_rows;
int gpu_cols;
int grid_rows;
int grid_cols;
} dplasma_advise_data_on_device_t;

/* Find all GPUs
* Size of dev_index: at least parsec_nb_devices
* */
void dplasma_find_nb_gpus(int *dev_index, int *nb);

/* Get the most suitable process/gpu grid */
int dplasma_grid_calculation( int nb_process );

/* Operator 2D */
int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es,
const parsec_tiled_matrix_t *descA,
void *_A, parsec_matrix_uplo_t uplo,
int m, int n, void *args);

/* Set advise data on device
*
* If op_args == NULL, use dplasma_advise_data_on_device_t by default
*/
int dplasma_advise_data_on_device( parsec_context_t *parsec,
parsec_matrix_uplo_t uplo,
parsec_tiled_matrix_t *A,
parsec_tiled_matrix_unary_op_t operation,
void *op_args );
#endif

#endif /* _DPLASMAAUX_H_INCLUDED */
4 changes: 4 additions & 0 deletions src/zlascal_wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "dplasma.h"
#include "dplasma/types.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/apply.h"
#include "cores/core_blas.h"


Expand Down Expand Up @@ -142,6 +143,9 @@ dplasma_zlascal_New( dplasma_enum_t uplo,
void
dplasma_zlascal_Destruct( parsec_taskpool_t *tp )
{
if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) {
free( ((parsec_apply_taskpool_t *)tp)->_g_op_args );
}
parsec_apply_Destruct(tp);
}

Expand Down
4 changes: 4 additions & 0 deletions src/zlaset_wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "dplasma.h"
#include "dplasma/types.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/apply.h"


static int
Expand Down Expand Up @@ -126,6 +127,9 @@ dplasma_zlaset_New( dplasma_enum_t uplo,
void
dplasma_zlaset_Destruct( parsec_taskpool_t *tp )
{
if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) {
free( ((parsec_apply_taskpool_t *)tp)->_g_op_args );
}
parsec_apply_Destruct(tp);
}

Expand Down
2 changes: 2 additions & 0 deletions src/zlatms_wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,10 @@ dplasma_zlatms( parsec_context_t *parsec,
parsec_context_start( parsec );
parsec_context_wait( parsec );
parsec_apply_Destruct(tp);
free(condptr);
}
else {
free(condptr);
return -1;
}
}
Expand Down
4 changes: 4 additions & 0 deletions src/zplghe_wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "dplasma.h"
#include "dplasma/types.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/apply.h"
#include "cores/core_blas.h"


Expand Down Expand Up @@ -127,6 +128,9 @@ dplasma_zplghe_New( double bump, dplasma_enum_t uplo,
void
dplasma_zplghe_Destruct( parsec_taskpool_t *tp )
{
if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) {
free( ((parsec_apply_taskpool_t *)tp)->_g_op_args );
}
parsec_apply_Destruct(tp);
}

Expand Down
5 changes: 4 additions & 1 deletion src/zplgsy_wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include "dplasma.h"
#include "dplasma/types.h"
#include "dplasmaaux.h"

#include "parsec/data_dist/matrix/apply.h"
#include "cores/core_blas.h"

struct zplgsy_args_s {
Expand Down Expand Up @@ -129,6 +129,9 @@ dplasma_zplgsy_New( dplasma_complex64_t bump, dplasma_enum_t uplo,
void
dplasma_zplgsy_Destruct( parsec_taskpool_t *tp )
{
if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) {
free( ((parsec_apply_taskpool_t *)tp)->_g_op_args );
}
parsec_apply_Destruct(tp);
}

Expand Down
5 changes: 4 additions & 1 deletion src/zplrnt_wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include "dplasma.h"
#include "dplasma/types.h"
#include "dplasmaaux.h"

#include "parsec/data_dist/matrix/apply.h"
#include "cores/core_blas.h"


Expand Down Expand Up @@ -143,6 +143,9 @@ dplasma_zplrnt_New( int diagdom,
void
dplasma_zplrnt_Destruct( parsec_taskpool_t *tp )
{
if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) {
free( ((parsec_apply_taskpool_t *)tp)->_g_op_args );
}
parsec_apply_Destruct(tp);
}

Expand Down
2 changes: 2 additions & 0 deletions src/zpltmg_wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,11 @@ dplasma_zpltmg_generic( parsec_context_t *parsec,
{
parsec_context_add_taskpool(parsec, (parsec_taskpool_t*)parsec_zpltmg);
dplasma_wait_until_completion(parsec);
free(params);
parsec_apply_Destruct( parsec_zpltmg );
return 0;
}
free(params);
return -101;
}

Expand Down
21 changes: 21 additions & 0 deletions tests/testing_zgemm.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
*/

#include "common.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"

static int check_solution( parsec_context_t *parsec, int loud,
Expand Down Expand Up @@ -76,6 +77,16 @@ int main(int argc, char ** argv)
dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcC, Cseed);
if(loud > 2) printf("Done\n");

/* Advice data on device */
#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
#endif

int t;
for(t = 0; t < nruns; t++) {
parsec_devices_release_memory();
Expand Down Expand Up @@ -142,6 +153,16 @@ int main(int argc, char ** argv)
parsec_devices_release_memory();
parsec_devices_reset_load(parsec);

/* Advice data on device */
#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
#endif

/* Create GEMM PaRSEC */
if(loud) printf("Compute ... ... ");
PASTE_CODE_ENQUEUE_PROGRESS_DESTRUCT_KERNEL(parsec, zgemm,
Expand Down
10 changes: 9 additions & 1 deletion tests/testing_zpotrf.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include "common.h"
#include "flops.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"

Expand All @@ -18,7 +19,7 @@ int main(int argc, char ** argv)
{
parsec_context_t* parsec;
int iparam[IPARAM_SIZEOF];
dplasma_enum_t uplo = dplasmaUpper;
bosilca marked this conversation as resolved.
Show resolved Hide resolved
dplasma_enum_t uplo = dplasmaLower;
int info = 0;
int ret = 0;

Expand All @@ -43,6 +44,13 @@ int main(int argc, char ** argv)
parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE,
rank, MB, NB, LDA, N, 0, 0,
N, N, P, nodes/P, uplo));

/* Advice data on device */
#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
dplasma_advise_data_on_device(parsec, uplo, (parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
#endif

int t;
for(t = 0; t < nruns; t++) {
/* matrix (re)generation */
Expand Down
Loading