Skip to content

Commit

Permalink
Merge pull request #87 from abouteiller/parsec-api/pr/517
Browse files Browse the repository at this point in the history
Remove weight properties and use time_estimate in LLT
  • Loading branch information
abouteiller authored Jun 27, 2023
2 parents 13f5c64 + 2e8e7b5 commit 966c04d
Show file tree
Hide file tree
Showing 23 changed files with 86 additions and 53 deletions.
2 changes: 1 addition & 1 deletion parsec
Submodule parsec updated 42 files
+1 −0 .github/CI/github_runner.yaml
+4 −5 parsec/interfaces/dtd/insert_function.c
+2 −1 parsec/interfaces/ptg/ptg-compiler/jdf.c
+38 −18 parsec/interfaces/ptg/ptg-compiler/jdf2c.c
+28 −21 parsec/mca/device/cuda/device_cuda_module.c
+89 −89 parsec/mca/device/device.c
+21 −10 parsec/mca/device/device.h
+2 −2 parsec/mca/device/device_gpu.h
+5 −5 parsec/mca/device/template/device_template_module.c
+2 −3 parsec/parsec_comm_engine.c
+7 −0 parsec/parsec_internal.h
+3 −3 parsec/parsec_mpi_funnelled.c
+11 −1 parsec/scheduling.c
+1 −1 tests/api/CMakeLists.txt
+9 −7 tests/api/Testings.cmake
+0 −0 tests/api/init_fini.c
+0 −7 tests/api/taskpool_wait/CMakeLists.txt
+1 −0 tests/apps/Testings.cmake
+0 −5 tests/apps/stencil/CMakeLists.txt
+7 −0 tests/apps/stencil/Testings.cmake
+2 −1 tests/dsl/dtd/CMakeLists.txt
+2 −3 tests/dsl/dtd/Testings.cmake
+31 −0 tests/dsl/dtd/dtd_test_new_tile.c
+0 −1 tests/dsl/ptg/CMakeLists.txt
+1 −0 tests/runtime/CMakeLists.txt
+1 −0 tests/runtime/Testings.cmake
+1 −3 tests/runtime/cuda/CMakeLists.txt
+13 −0 tests/runtime/cuda/Testings.cmake
+0 −0 tests/runtime/cuda/cuda_test_internal.h
+21 −11 tests/runtime/cuda/get_best_device_check.jdf
+18 −7 tests/runtime/cuda/nvlink.jdf
+1 −1 tests/runtime/cuda/nvlink_main.c
+0 −0 tests/runtime/cuda/nvlink_wrapper.c
+0 −0 tests/runtime/cuda/nvlink_wrapper.h
+14 −3 tests/runtime/cuda/stage_custom.jdf
+0 −0 tests/runtime/cuda/stage_main.c
+16 −4 tests/runtime/cuda/stress.jdf
+0 −0 tests/runtime/cuda/stress_main.c
+0 −0 tests/runtime/cuda/stress_wrapper.c
+0 −0 tests/runtime/cuda/stress_wrapper.h
+0 −0 tests/runtime/cuda/testing_get_best_device.c
+1 −1 tests/runtime/scheduling/Testings.cmake
3 changes: 1 addition & 2 deletions src/zgemm_NN.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,7 @@ RW C <- (k == 0) ? ddescC(m, n) [ type = %{ return
CTL ctla -> (k < (descA->nt-lookQ)) ? ctla READ_A(m, k+lookQ)
CTL ctlb -> (k < (descA->nt-lookP)) ? ctlb READ_B(k+lookP, n)

BODY [type=CUDA
weight=(descA->nt-k)]
BODY [type=CUDA]
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha));
Expand Down
3 changes: 1 addition & 2 deletions src/zgemm_NN_gpu.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -362,8 +362,7 @@ RW C <- k == 0 ? C READ_C(m, n)
CTL Z <- ( k > 0 ) & ((k % tD) == 0) ? Z LOCAL_BARRIER(x, y, z, u, v)
-> ((k == descB->mt-1) | (k == (z+1)*tD-1)) ? Z LOCAL_BARRIER(xn, yn, zn, u, v)

BODY [type=CUDA
weight=(descA->nt-k)]
BODY [type=CUDA]
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha));
Expand Down
1 change: 0 additions & 1 deletion src/zgemm_NN_summa.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,6 @@ CTL ctla -> (k < (descA->nt-lookQ)) ? ctla RING_A(m, k+lookQ, n%Q)
CTL ctlb -> (k < (descA->nt-lookP)) ? ctlb RING_B(k+lookP, n, m%P)

BODY [type=CUDA
weight=(descA->nt-k)
A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
Expand Down
1 change: 0 additions & 1 deletion src/zgemm_NT.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ CTL ctla -> (k < (descA->nt-lookQ)) ? ctla READ_A(m, k+lookQ)
CTL ctlb -> (k < (descA->nt-lookP)) ? ctlb READ_B(n, k+lookP)

BODY [type=CUDA
weight=(descA->nt-k)
A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
Expand Down
1 change: 0 additions & 1 deletion src/zgemm_NT_summa.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,6 @@ CTL ctla -> (k < (descA->nt-lookQ)) ? ctla RING_A(m, k+lookQ, n%Q)
CTL ctlb -> (k < (descA->nt-lookP)) ? ctlb RING_B(n, k+lookP, m%P)

BODY [type=CUDA
weight=(descA->nt-k)
A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
Expand Down
1 change: 0 additions & 1 deletion src/zgemm_TN.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ CTL ctla -> (k < (descA->mt-lookQ)) ? ctla READ_A(k+lookQ, m)
CTL ctlb -> (k < (descA->mt-lookP)) ? ctlb READ_B(k+lookP, n)

BODY [type=CUDA
weight=(descA->mt-k)
A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
Expand Down
1 change: 0 additions & 1 deletion src/zgemm_TN_summa.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,6 @@ CTL ctla -> (k < (descA->mt-lookQ)) ? ctla RING_A(k+lookQ, m, n%Q)
CTL ctlb -> (k < (descA->mt-lookP)) ? ctlb RING_B(k+lookP, n, m%P)

BODY [type=CUDA
weight=(descA->mt-k)
A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
Expand Down
1 change: 0 additions & 1 deletion src/zgemm_TT.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ CTL ctla -> (k < (descA->mt-lookQ)) ? ctla READ_A(k+lookQ, m)
CTL ctlb -> (k < (descA->mt-lookP)) ? ctlb READ_B(n, k+lookP)

BODY [type=CUDA
weight=(descA->mt-k)
A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
Expand Down
1 change: 0 additions & 1 deletion src/zgemm_TT_summa.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,6 @@ CTL ctla -> (k < (descA->mt-lookQ)) ? ctla RING_A(k+lookQ, m, n%Q)
CTL ctlb -> (k < (descA->mt-lookP)) ? ctlb RING_B(n, k+lookP, m%P)

BODY [type=CUDA
weight=(descA->mt-k)
A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
Expand Down
1 change: 0 additions & 1 deletion src/zgetrf_nopiv.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,6 @@ loc_C = %{ return LOC(descA, m, n); %}
-> ((m > (k+1)) && (n > (k+1))) ? C zgemm(k+1, m, n) /* dep OUT: rely on datacopy dtt for sending */

BODY [type=CUDA
weight="dplasma_imin( m-k, n-k )"
A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
Expand Down
44 changes: 35 additions & 9 deletions src/zpotrf_L.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ static void zpotrf_L_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_
* WARNING: If mt is greater than 1200, we might get integer overflow.
*/

static int64_t zpotrf_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
static int64_t ztrsm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
static int64_t zherk_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
static int64_t zgemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
%}

/* Globals
Expand All @@ -80,7 +84,9 @@ POWorkspaceID [type = "int" hidden = on default = -1 ]
/**************************************************
* potrf_zpotrf *
**************************************************/
potrf_zpotrf(k) [high_priority = on flops = inline_c %{ return FLOPS_ZPOTRF( CLEAN_MB(descA, k) ); %}]
potrf_zpotrf(k) [high_priority = on
flops = inline_c %{ return FLOPS_ZPOTRF( CLEAN_MB(descA, k) ); %}
time_estimate = zpotrf_time_estimate]

// Execution space
k = 0 .. descA->mt-1
Expand Down Expand Up @@ -181,7 +187,9 @@ END
/**************************************************
* potrf_ztrsm *
**************************************************/
potrf_ztrsm(m, k) [high_priority = on flops = inline_c %{ return FLOPS_ZTRSM(PlasmaRight, CLEAN_MB(descA, m), descA->nb); %}]
potrf_ztrsm(m, k) [high_priority = on
flops = inline_c %{ return FLOPS_ZTRSM(PlasmaRight, CLEAN_MB(descA, m), descA->nb); %}
time_estimate = ztrsm_time_estimate]

// Execution space
m = 1 .. descA->mt-1
Expand All @@ -206,8 +214,7 @@ RW C <- (k == 0) ? ddescA(m, k) [ type = %{ return ADTT_

; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * ((2 * descA->mt) - k - m - 1) * (m - k) : PRI_MAX

BODY [type=CUDA
weight=(m+k)]
BODY [type=CUDA]
{
int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;
int ldak = LDA(ddescA, T);
Expand Down Expand Up @@ -293,7 +300,9 @@ END
/**************************************************
* potrf_zherk *
**************************************************/
potrf_zherk(k, m) [high_priority = on flops = inline_c %{ return FLOPS_ZHERK(CLEAN_MB(descA, m), descA->mb); %}]
potrf_zherk(k, m) [high_priority = on
flops = inline_c %{ return FLOPS_ZHERK(CLEAN_MB(descA, m), descA->mb); %}
time_estimate = zherk_time_estimate]

// Execution space
k = 0 .. descA->mt-2
Expand All @@ -314,8 +323,7 @@ RW T <- (k == 0) ? ddescA(m, m) [ type = %{ return ADTT_REA

; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * (m - k) : PRI_MAX

BODY [type=CUDA
weight=(m+k)]
BODY [type=CUDA]
{
int tempmm = m == descA->mt-1 ? descA->m - m*descA->mb : descA->mb;
int ldam_A = LDA(ddescA, A);
Expand Down Expand Up @@ -393,7 +401,8 @@ END
* potrf_zgemm *
**************************************************/
// Name
potrf_zgemm(m, n, k) [flops = inline_c %{ return FLOPS_ZGEMM(CLEAN_MB(descA, m), descA->mb, descA->mb); %}]
potrf_zgemm(m, n, k) [flops = inline_c %{ return FLOPS_ZGEMM(CLEAN_MB(descA, m), descA->mb, descA->mb); %}
time_estimate = zgemm_time_estimate]

// Execution space
k = 0 .. descA->mt-3
Expand All @@ -418,7 +427,6 @@ RW C <- (k == 0) ? ddescA(m, n) [ type = %{ return ADTT_
; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * ((2 * descA->mt) - m - n - 3) * (m - n) + 6 * (m - k) : PRI_MAX

BODY [type=CUDA
weight=(n+1-k)
A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
Expand Down Expand Up @@ -524,6 +532,24 @@ END

extern "C" %{

/* Compute the time estimates based on device capabilities and flops for the task */
/*
 * Time estimate for a potrf_zpotrf task on device `dev`.
 *
 * The task is only used to reach its taskpool and read the tile size `mb`
 * of descA; the task's own indices are not consulted, so every task gets
 * the estimate of a full mb-sized tile.
 * Returns FLOPS_ZPOTRF(mb), converted to int64_t, divided by dev->gflops_fp64.
 * NOTE(review): assumes dev->gflops_fp64 is non-zero -- confirm against the
 * device module initialization.
 */
static int64_t zpotrf_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
    parsec_zpotrf_L_taskpool_t *tp = (parsec_zpotrf_L_taskpool_t*)task->taskpool;
    int tile_mb = tp->_g_descA->mb;
    int64_t flops = (int64_t)FLOPS_ZPOTRF(tile_mb);

    return flops / dev->gflops_fp64;
}
/*
 * Time estimate for a potrf_ztrsm task on device `dev`.
 *
 * Reads the tile size `mb` of descA through the task's taskpool and uses it
 * for both dimensions of the right-sided triangular solve; the task's own
 * indices are not consulted.
 * Returns FLOPS_ZTRSM(PlasmaRight, mb, mb), converted to int64_t, divided by
 * dev->gflops_fp64.
 * NOTE(review): assumes dev->gflops_fp64 is non-zero -- confirm.
 */
static int64_t ztrsm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
    parsec_zpotrf_L_taskpool_t *tp = (parsec_zpotrf_L_taskpool_t*)task->taskpool;
    int tile_mb = tp->_g_descA->mb;
    int64_t flops = (int64_t)FLOPS_ZTRSM(PlasmaRight, tile_mb, tile_mb);

    return flops / dev->gflops_fp64;
}
/*
 * Time estimate for a potrf_zherk task on device `dev`.
 *
 * Reads the tile size `mb` of descA through the task's taskpool and uses it
 * for both arguments of the flop count; the task's own indices are not
 * consulted.
 * Returns FLOPS_ZHERK(mb, mb), converted to int64_t, divided by
 * dev->gflops_fp64.
 * NOTE(review): assumes dev->gflops_fp64 is non-zero -- confirm.
 */
static int64_t zherk_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
    parsec_zpotrf_L_taskpool_t *tp = (parsec_zpotrf_L_taskpool_t*)task->taskpool;
    int tile_mb = tp->_g_descA->mb;
    int64_t flops = (int64_t)FLOPS_ZHERK(tile_mb, tile_mb);

    return flops / dev->gflops_fp64;
}
/*
 * Time estimate for a potrf_zgemm task on device `dev`.
 *
 * Reads the tile size `mb` of descA through the task's taskpool and uses it
 * for all three dimensions of the flop count; the task's own indices are not
 * consulted.
 * Returns FLOPS_ZGEMM(mb, mb, mb), converted to int64_t, divided by
 * dev->gflops_fp64.
 * NOTE(review): assumes dev->gflops_fp64 is non-zero -- confirm.
 */
static int64_t zgemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
    parsec_zpotrf_L_taskpool_t *tp = (parsec_zpotrf_L_taskpool_t*)task->taskpool;
    int tile_mb = tp->_g_descA->mb;
    int64_t flops = (int64_t)FLOPS_ZGEMM(tile_mb, tile_mb, tile_mb);

    return flops / dev->gflops_fp64;
}

/*
* A function to recursively update the value of the INFO argument for
* recursive calls. We need a special function because the recursive calls being asynchronous
Expand Down
44 changes: 35 additions & 9 deletions src/zpotrf_U.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ static void zpotrf_U_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_
* WARNING: If mt is greater than 1200, we might get integer overflow.
*/

static int64_t zpotrf_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
static int64_t ztrsm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
static int64_t zherk_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
static int64_t zgemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
%}

/* Globals
Expand All @@ -80,7 +84,9 @@ POWorkspaceID [type = "int" hidden = on default = -1 ]
/**************************************************
* potrf_zpotrf *
**************************************************/
potrf_zpotrf(k) [high_priority = on flops = inline_c %{ return FLOPS_ZPOTRF( CLEAN_NB(descA, k) ); %}]
potrf_zpotrf(k) [high_priority = on
flops = inline_c %{ return FLOPS_ZPOTRF( CLEAN_NB(descA, k) ); %}
time_estimate = zpotrf_time_estimate]

// Execution space
k = 0 .. descA->nt-1
Expand Down Expand Up @@ -182,7 +188,9 @@ END
/**************************************************
* potrf_ztrsm *
**************************************************/
potrf_ztrsm(k, n) [high_priority = on flops = inline_c %{ return FLOPS_ZTRSM(PlasmaLeft, descA->mb, CLEAN_NB(descA, n)); %}]
potrf_ztrsm(k, n) [high_priority = on
flops = inline_c %{ return FLOPS_ZTRSM(PlasmaLeft, descA->mb, CLEAN_NB(descA, n)); %}
time_estimate = ztrsm_time_estimate]

// Execution space
k = 0 .. descA->nt-2
Expand All @@ -207,8 +215,7 @@ RW C <- (k == 0) ? ddescA(k, n) [ type = %{ return ADTT_

; (n >= (descA->nt - PRI_CHANGE)) ? (descA->nt - n) * (descA->nt - n) * (descA->nt - n) + 3 * ((2 * descA->nt) - k - n - 1) * (n - k) : PRI_MAX

BODY [type=CUDA
weight=(k+n)]
BODY [type=CUDA]
{
int tempnn = n == descA->nt - 1 ? descA->n - n * descA->nb : descA->nb;
int ldak_T = LDA(ddescA, T);
Expand Down Expand Up @@ -294,7 +301,9 @@ END
/**************************************************
* potrf_zherk *
**************************************************/
potrf_zherk(k, n) [high_priority = on flops = inline_c %{ return FLOPS_ZHERK(CLEAN_NB(descA, n), descA->mb); %}]
potrf_zherk(k, n) [high_priority = on
flops = inline_c %{ return FLOPS_ZHERK(CLEAN_NB(descA, n), descA->mb); %}
time_estimate = zherk_time_estimate]

// Execution space
k = 0 .. descA->nt-2
Expand All @@ -316,8 +325,7 @@ RW T <- (k == 0) ? ddescA(n, n) [ type = %{ return ADTT_REA

; (n >= (descA->nt - PRI_CHANGE)) ? (descA->nt - n) * (descA->nt - n) * (descA->nt - n) + 3 * (n - k) : PRI_MAX

BODY [type=CUDA
weight=(k+n)]
BODY [type=CUDA]
{
int tempnn = n == descA->nt-1 ? descA->n - n*descA->nb : descA->nb;
int ldak = LDA(ddescA, A);
Expand Down Expand Up @@ -396,7 +404,8 @@ END
* potrf_zgemm *
**************************************************/
// Name
potrf_zgemm(m, n, k) [ flops = inline_c %{ return FLOPS_ZGEMM(descA->mb, CLEAN_NB(descA, n), descA->nb); %}]
potrf_zgemm(m, n, k) [ flops = inline_c %{ return FLOPS_ZGEMM(descA->mb, CLEAN_NB(descA, n), descA->nb); %}
time_estimate = zgemm_time_estimate]

// Execution space
k = 0 .. descA->mt-3
Expand All @@ -421,7 +430,6 @@ RW C <- (k == 0) ? ddescA(m, n) [ type = %{ return ADTT_
; (n >= (descA->nt - PRI_CHANGE)) ? (descA->nt - n) * (descA->nt - n) * (descA->nt - n) + 3 * ((2 * descA->nt) - m - n - 3) * (n - m) + 6 * (n - k) : PRI_MAX

BODY [type=CUDA
weight=(m+1-k)
A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
Expand Down Expand Up @@ -538,6 +546,24 @@ END

extern "C" %{

/* Compute the time estimates based on device capabilities and flops for the task */
/*
 * Time estimate for a potrf_zpotrf task on device `dev`.
 *
 * The task is only used to reach its taskpool and read the tile size `mb`
 * of descA; the task's own indices are not consulted, so every task gets
 * the estimate of a full mb-sized tile.
 * Returns FLOPS_ZPOTRF(mb), converted to int64_t, divided by dev->gflops_fp64.
 * NOTE(review): assumes dev->gflops_fp64 is non-zero -- confirm against the
 * device module initialization.
 */
static int64_t zpotrf_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
    parsec_zpotrf_U_taskpool_t *tp = (parsec_zpotrf_U_taskpool_t*)task->taskpool;
    int tile_mb = tp->_g_descA->mb;
    int64_t flops = (int64_t)FLOPS_ZPOTRF(tile_mb);

    return flops / dev->gflops_fp64;
}
/*
 * Time estimate for a potrf_ztrsm task on device `dev`.
 *
 * Reads the tile size `mb` of descA through the task's taskpool and uses it
 * for both dimensions of the left-sided triangular solve; the task's own
 * indices are not consulted.
 * Returns FLOPS_ZTRSM(PlasmaLeft, mb, mb), converted to int64_t, divided by
 * dev->gflops_fp64.
 * NOTE(review): assumes dev->gflops_fp64 is non-zero -- confirm.
 */
static int64_t ztrsm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
    parsec_zpotrf_U_taskpool_t *tp = (parsec_zpotrf_U_taskpool_t*)task->taskpool;
    int tile_mb = tp->_g_descA->mb;
    int64_t flops = (int64_t)FLOPS_ZTRSM(PlasmaLeft, tile_mb, tile_mb);

    return flops / dev->gflops_fp64;
}
/*
 * Time estimate for a potrf_zherk task on device `dev`.
 *
 * Reads the tile size `mb` of descA through the task's taskpool and uses it
 * for both arguments of the flop count; the task's own indices are not
 * consulted.
 * Returns FLOPS_ZHERK(mb, mb), converted to int64_t, divided by
 * dev->gflops_fp64.
 * NOTE(review): assumes dev->gflops_fp64 is non-zero -- confirm.
 */
static int64_t zherk_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
    parsec_zpotrf_U_taskpool_t *tp = (parsec_zpotrf_U_taskpool_t*)task->taskpool;
    int tile_mb = tp->_g_descA->mb;
    int64_t flops = (int64_t)FLOPS_ZHERK(tile_mb, tile_mb);

    return flops / dev->gflops_fp64;
}
/*
 * Time estimate for a potrf_zgemm task on device `dev`.
 *
 * Reads the tile size `mb` of descA through the task's taskpool and uses it
 * for all three dimensions of the flop count; the task's own indices are not
 * consulted.
 * Returns FLOPS_ZGEMM(mb, mb, mb), converted to int64_t, divided by
 * dev->gflops_fp64.
 * NOTE(review): assumes dev->gflops_fp64 is non-zero -- confirm.
 */
static int64_t zgemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
    parsec_zpotrf_U_taskpool_t *tp = (parsec_zpotrf_U_taskpool_t*)task->taskpool;
    int tile_mb = tp->_g_descA->mb;
    int64_t flops = (int64_t)FLOPS_ZGEMM(tile_mb, tile_mb, tile_mb);

    return flops / dev->gflops_fp64;
}

/*
* A function to recursively update the value of the INFO argument for
* recursive calls. We need a special function because the recursive calls being asynchronous
Expand Down
3 changes: 1 addition & 2 deletions src/ztrsm_LLN.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), CLEAN_N
-> (m>=(k+2)) ? E zgemm(k+1, m, n)
-> ((k+1)==m) ? B ztrsm(m, n)

BODY [type=CUDA
weight=(n+1-k)]
BODY [type=CUDA]
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.);
Expand Down
3 changes: 1 addition & 2 deletions src/ztrsm_LLT.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(descB->mb, CLEAN_NB(descB,
-> ((1+k)==m) ? B ztrsm(m, n)
-> (m>=(k+2)) ? E zgemm(k+1, m, n)

BODY [type=CUDA
weight=(n+1-k)]
BODY [type=CUDA]
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.);
Expand Down
3 changes: 1 addition & 2 deletions src/ztrsm_LUN.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(descB->mb, CLEAN_NB(descB,
-> ((1+k)==m) ? B ztrsm(m, n)
-> (m>=(k+2)) ? E zgemm(k+1, m, n)

BODY [type=CUDA
weight=(n+1-k)]
BODY [type=CUDA]
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.);
Expand Down
3 changes: 1 addition & 2 deletions src/ztrsm_LUT.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), CLEAN_N
-> (m>=(k+2)) ? E zgemm(k+1, m, n)
-> ((k+1)==m) ? B ztrsm(m, n)

BODY [type=CUDA
weight=(n+1-k)]
BODY [type=CUDA]
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.);
Expand Down
3 changes: 1 addition & 2 deletions src/ztrsm_RLN.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), descB->
-> (n>=(k+2)) ? E zgemm(k+1, m, n)
-> ((k+1)==n) ? B ztrsm(n, m)

BODY [type=CUDA
weight=(n+1-k)]
BODY [type=CUDA]
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.);
Expand Down
3 changes: 1 addition & 2 deletions src/ztrsm_RLT.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), CLEAN_N
-> ((1+k)==n) ? B ztrsm(n, m)
-> (n>=(2+k)) ? E zgemm(k+1, m, n)

BODY [type=CUDA
weight=(n+1-k)]
BODY [type=CUDA]
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.);
Expand Down
3 changes: 1 addition & 2 deletions src/ztrsm_RUN.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), CLEAN_N
-> ((1+k)==n) ? B ztrsm(n, m)
-> (n>=(2+k)) ? E zgemm(k+1, m, n)

BODY [type=CUDA
weight=(n+1-k)]
BODY [type=CUDA]
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.);
Expand Down
3 changes: 1 addition & 2 deletions src/ztrsm_RUT.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), descB->
-> (n>=(k+2)) ? E zgemm(k+1, m, n)
-> ((k+1)==n) ? B ztrsm(n, m)

BODY [type=CUDA
weight=(n+1-k)]
BODY [type=CUDA]
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.);
Expand Down
Loading

0 comments on commit 966c04d

Please sign in to comment.