diff --git a/parsec b/parsec
index 53060153..25d3ec95 160000
--- a/parsec
+++ b/parsec
@@ -1 +1 @@
-Subproject commit 530601533fc80ad27d7c12cdfda56744746baac9
+Subproject commit 25d3ec95b844a5c769c37559dc85468378f44436
diff --git a/src/zgemm_NN.jdf b/src/zgemm_NN.jdf
index b91ae820..9bcb9b2c 100644
--- a/src/zgemm_NN.jdf
+++ b/src/zgemm_NN.jdf
@@ -165,8 +165,7 @@ RW C <- (k == 0) ? ddescC(m, n) [ type = %{ return
 CTL ctla -> (k < (descA->nt-lookQ)) ? ctla READ_A(m, k+lookQ)
 CTL ctlb -> (k < (descA->nt-lookP)) ? ctlb READ_B(k+lookP, n)

-BODY [type=CUDA
-      weight=(descA->nt-k)]
+BODY [type=CUDA]
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha));
diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf
index 79ff1a2d..d5463e63 100644
--- a/src/zgemm_NN_gpu.jdf
+++ b/src/zgemm_NN_gpu.jdf
@@ -362,8 +362,7 @@ RW C <- k == 0 ? C READ_C(m, n)
 CTL Z <- ( k > 0 ) & ((k % tD) == 0) ? Z LOCAL_BARRIER(x, y, z, u, v)
       -> ((k == descB->mt-1) | (k == (z+1)*tD-1)) ? Z LOCAL_BARRIER(xn, yn, zn, u, v)

-BODY [type=CUDA
-      weight=(descA->nt-k)]
+BODY [type=CUDA]
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha));
diff --git a/src/zgemm_NN_summa.jdf b/src/zgemm_NN_summa.jdf
index 57f5f92b..d1cfa5ea 100644
--- a/src/zgemm_NN_summa.jdf
+++ b/src/zgemm_NN_summa.jdf
@@ -207,7 +207,6 @@ CTL ctla -> (k < (descA->nt-lookQ)) ? ctla RING_A(m, k+lookQ, n%Q)
 CTL ctlb -> (k < (descA->nt-lookP)) ? ctlb RING_B(k+lookP, n, m%P)

 BODY [type=CUDA
-      weight=(descA->nt-k)
       A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
       C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
diff --git a/src/zgemm_NT.jdf b/src/zgemm_NT.jdf
index da5fccba..91def4a0 100644
--- a/src/zgemm_NT.jdf
+++ b/src/zgemm_NT.jdf
@@ -165,7 +165,6 @@ CTL ctla -> (k < (descA->nt-lookQ)) ? ctla READ_A(m, k+lookQ)
 CTL ctlb -> (k < (descA->nt-lookP)) ? ctlb READ_B(n, k+lookP)

 BODY [type=CUDA
-      weight=(descA->nt-k)
       A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
       C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
diff --git a/src/zgemm_NT_summa.jdf b/src/zgemm_NT_summa.jdf
index 6d2d4f0b..08aa5660 100644
--- a/src/zgemm_NT_summa.jdf
+++ b/src/zgemm_NT_summa.jdf
@@ -207,7 +207,6 @@ CTL ctla -> (k < (descA->nt-lookQ)) ? ctla RING_A(m, k+lookQ, n%Q)
 CTL ctlb -> (k < (descA->nt-lookP)) ? ctlb RING_B(n, k+lookP, m%P)

 BODY [type=CUDA
-      weight=(descA->nt-k)
       A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
       C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
diff --git a/src/zgemm_TN.jdf b/src/zgemm_TN.jdf
index 2aad66ee..7232d3d9 100644
--- a/src/zgemm_TN.jdf
+++ b/src/zgemm_TN.jdf
@@ -165,7 +165,6 @@ CTL ctla -> (k < (descA->mt-lookQ)) ? ctla READ_A(k+lookQ, m)
 CTL ctlb -> (k < (descA->mt-lookP)) ? ctlb READ_B(k+lookP, n)

 BODY [type=CUDA
-      weight=(descA->mt-k)
       A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
       C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
diff --git a/src/zgemm_TN_summa.jdf b/src/zgemm_TN_summa.jdf
index a19ce812..f1e86ba8 100644
--- a/src/zgemm_TN_summa.jdf
+++ b/src/zgemm_TN_summa.jdf
@@ -206,7 +206,6 @@ CTL ctla -> (k < (descA->mt-lookQ)) ? ctla RING_A(k+lookQ, m, n%Q)
 CTL ctlb -> (k < (descA->mt-lookP)) ? ctlb RING_B(k+lookP, n, m%P)

 BODY [type=CUDA
-      weight=(descA->mt-k)
       A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
       C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
diff --git a/src/zgemm_TT.jdf b/src/zgemm_TT.jdf
index 376e32b7..30d82d0c 100644
--- a/src/zgemm_TT.jdf
+++ b/src/zgemm_TT.jdf
@@ -165,7 +165,6 @@ CTL ctla -> (k < (descA->mt-lookQ)) ? ctla READ_A(k+lookQ, m)
 CTL ctlb -> (k < (descA->mt-lookP)) ? ctlb READ_B(n, k+lookP)

 BODY [type=CUDA
-      weight=(descA->mt-k)
       A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
       C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
diff --git a/src/zgemm_TT_summa.jdf b/src/zgemm_TT_summa.jdf
index 0bd628da..385021c1 100644
--- a/src/zgemm_TT_summa.jdf
+++ b/src/zgemm_TT_summa.jdf
@@ -206,7 +206,6 @@ CTL ctla -> (k < (descA->mt-lookQ)) ? ctla RING_A(k+lookQ, m, n%Q)
 CTL ctlb -> (k < (descA->mt-lookP)) ? ctlb RING_B(n, k+lookP, m%P)

 BODY [type=CUDA
-      weight=(descA->mt-k)
       A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%}
       C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%}
diff --git a/src/zgetrf_nopiv.jdf b/src/zgetrf_nopiv.jdf
index 91934fb1..2fd2dfea 100644
--- a/src/zgetrf_nopiv.jdf
+++ b/src/zgetrf_nopiv.jdf
@@ -196,7 +196,6 @@ loc_C = %{ return LOC(descA, m, n); %}
        -> ((m > (k+1)) && (n > (k+1))) ? C zgemm(k+1, m, n) /* dep OUT: rely on datacopy dtt for sending */

 BODY [type=CUDA
-      weight="dplasma_imin( m-k, n-k )"
       A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf
index 8e242e12..58f47e33 100644
--- a/src/zpotrf_L.jdf
+++ b/src/zpotrf_L.jdf
@@ -62,6 +62,10 @@ static void zpotrf_L_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_
 * WARNING: If mt is greater than 1200, we might get integer overflow.
 */
+static int64_t zpotrf_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
+static int64_t ztrsm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
+static int64_t zherk_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
+static int64_t zgemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
 %}

 /* Globals
@@ -80,7 +84,9 @@ POWorkspaceID [type = "int" hidden = on default = -1 ]
 /**************************************************
 *                  potrf_zpotrf                   *
 **************************************************/
-potrf_zpotrf(k) [high_priority = on flops = inline_c %{ return FLOPS_ZPOTRF( CLEAN_MB(descA, k) ); %}]
+potrf_zpotrf(k) [high_priority = on
+                 flops = inline_c %{ return FLOPS_ZPOTRF( CLEAN_MB(descA, k) ); %}
+                 time_estimate = zpotrf_time_estimate]

 // Execution space
 k = 0 .. descA->mt-1
@@ -181,7 +187,9 @@ END
 /**************************************************
 *                  potrf_ztrsm                    *
 **************************************************/
-potrf_ztrsm(m, k) [high_priority = on flops = inline_c %{ return FLOPS_ZTRSM(PlasmaRight, CLEAN_MB(descA, m), descA->nb); %}]
+potrf_ztrsm(m, k) [high_priority = on
+                   flops = inline_c %{ return FLOPS_ZTRSM(PlasmaRight, CLEAN_MB(descA, m), descA->nb); %}
+                   time_estimate = ztrsm_time_estimate]

 // Execution space
 m = 1 .. descA->mt-1
@@ -206,8 +214,7 @@ RW C <- (k == 0) ? ddescA(m, k) [ type = %{ return ADTT_
 ; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * ((2 * descA->mt) - k - m - 1) * (m - k) : PRI_MAX

-BODY [type=CUDA
-      weight=(m+k)]
+BODY [type=CUDA]
 {
     int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;
     int ldak = LDA(ddescA, T);
@@ -293,7 +300,9 @@ END
 /**************************************************
 *                  potrf_zherk                    *
 **************************************************/
-potrf_zherk(k, m) [high_priority = on flops = inline_c %{ return FLOPS_ZHERK(CLEAN_MB(descA, m), descA->mb); %}]
+potrf_zherk(k, m) [high_priority = on
+                   flops = inline_c %{ return FLOPS_ZHERK(CLEAN_MB(descA, m), descA->mb); %}
+                   time_estimate = zherk_time_estimate]

 // Execution space
 k = 0 .. descA->mt-2
@@ -314,8 +323,7 @@ RW T <- (k == 0) ? ddescA(m, m) [ type = %{ return ADTT_REA
 ; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * (m - k) : PRI_MAX

-BODY [type=CUDA
-      weight=(m+k)]
+BODY [type=CUDA]
 {
     int tempmm = m == descA->mt-1 ? descA->m - m*descA->mb : descA->mb;
     int ldam_A = LDA(ddescA, A);
@@ -393,7 +401,8 @@ END
 *                  potrf_zgemm                    *
 **************************************************/
 // Name
-potrf_zgemm(m, n, k) [flops = inline_c %{ return FLOPS_ZGEMM(CLEAN_MB(descA, m), descA->mb, descA->mb); %}]
+potrf_zgemm(m, n, k) [flops = inline_c %{ return FLOPS_ZGEMM(CLEAN_MB(descA, m), descA->mb, descA->mb); %}
+                      time_estimate = zgemm_time_estimate]

 // Execution space
 k = 0 .. descA->mt-3
@@ -418,7 +427,6 @@ RW C <- (k == 0) ? ddescA(m, n) [ type = %{ return ADTT_
 ; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * ((2 * descA->mt) - m - n - 3) * (m - n) + 6 * (m - k) : PRI_MAX

 BODY [type=CUDA
-      weight=(n+1-k)
       A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
@@ -524,6 +532,24 @@ END

 extern "C" %{

+/* Compute the time estimates based on device capabilities and flops for the task */
+static int64_t zpotrf_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
+    int mb = ((parsec_zpotrf_L_taskpool_t*)task->taskpool)->_g_descA->mb;
+    return (int64_t)FLOPS_ZPOTRF(mb) / dev->gflops_fp64;
+}
+static int64_t ztrsm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
+    int mb = ((parsec_zpotrf_L_taskpool_t*)task->taskpool)->_g_descA->mb;
+    return (int64_t)FLOPS_ZTRSM(PlasmaRight, mb, mb) / dev->gflops_fp64;
+}
+static int64_t zherk_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
+    int mb = ((parsec_zpotrf_L_taskpool_t*)task->taskpool)->_g_descA->mb;
+    return (int64_t)FLOPS_ZHERK(mb, mb) / dev->gflops_fp64;
+}
+static int64_t zgemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
+    int mb = ((parsec_zpotrf_L_taskpool_t*)task->taskpool)->_g_descA->mb;
+    return (int64_t)FLOPS_ZGEMM(mb, mb, mb) / dev->gflops_fp64;
+}
+
 /*
  * A function to recursively update the value of the INFO argument for
  * recursive calls. We need a special function because the recursive calls being asynchronous
diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf
index 06c88fbb..d64194e7 100644
--- a/src/zpotrf_U.jdf
+++ b/src/zpotrf_U.jdf
@@ -61,6 +61,10 @@ static void zpotrf_U_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_
 * WARNING: If mt is greater than 1200, we might get integer overflow.
 */
+static int64_t zpotrf_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
+static int64_t ztrsm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
+static int64_t zherk_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
+static int64_t zgemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
 %}

 /* Globals
@@ -80,7 +84,9 @@ POWorkspaceID [type = "int" hidden = on default = -1 ]
 /**************************************************
 *                  potrf_zpotrf                   *
 **************************************************/
-potrf_zpotrf(k) [high_priority = on flops = inline_c %{ return FLOPS_ZPOTRF( CLEAN_NB(descA, k) ); %}]
+potrf_zpotrf(k) [high_priority = on
+                 flops = inline_c %{ return FLOPS_ZPOTRF( CLEAN_NB(descA, k) ); %}
+                 time_estimate = zpotrf_time_estimate]

 // Execution space
 k = 0 .. descA->nt-1
@@ -182,7 +188,9 @@ END
 /**************************************************
 *                  potrf_ztrsm                    *
 **************************************************/
-potrf_ztrsm(k, n) [high_priority = on flops = inline_c %{ return FLOPS_ZTRSM(PlasmaLeft, descA->mb, CLEAN_NB(descA, n)); %}]
+potrf_ztrsm(k, n) [high_priority = on
+                   flops = inline_c %{ return FLOPS_ZTRSM(PlasmaLeft, descA->mb, CLEAN_NB(descA, n)); %}
+                   time_estimate = ztrsm_time_estimate]

 // Execution space
 k = 0 .. descA->nt-2
@@ -207,8 +215,7 @@ RW C <- (k == 0) ? ddescA(k, n) [ type = %{ return ADTT_
 ; (n >= (descA->nt - PRI_CHANGE)) ? (descA->nt - n) * (descA->nt - n) * (descA->nt - n) + 3 * ((2 * descA->nt) - k - n - 1) * (n - k) : PRI_MAX

-BODY [type=CUDA
-      weight=(k+n)]
+BODY [type=CUDA]
 {
     int tempnn = n == descA->nt - 1 ? descA->n - n * descA->nb : descA->nb;
     int ldak_T = LDA(ddescA, T);
@@ -294,7 +301,9 @@ END
 /**************************************************
 *                  potrf_zherk                    *
 **************************************************/
-potrf_zherk(k, n) [high_priority = on flops = inline_c %{ return FLOPS_ZHERK(CLEAN_NB(descA, n), descA->mb); %}]
+potrf_zherk(k, n) [high_priority = on
+                   flops = inline_c %{ return FLOPS_ZHERK(CLEAN_NB(descA, n), descA->mb); %}
+                   time_estimate = zherk_time_estimate]

 // Execution space
 k = 0 .. descA->nt-2
@@ -316,8 +325,7 @@ RW T <- (k == 0) ? ddescA(n, n) [ type = %{ return ADTT_REA
 ; (n >= (descA->nt - PRI_CHANGE)) ? (descA->nt - n) * (descA->nt - n) * (descA->nt - n) + 3 * (n - k) : PRI_MAX

-BODY [type=CUDA
-      weight=(k+n)]
+BODY [type=CUDA]
 {
     int tempnn = n == descA->nt-1 ? descA->n - n*descA->nb : descA->nb;
     int ldak = LDA(ddescA, A);
@@ -396,7 +404,8 @@ END
 *                  potrf_zgemm                    *
 **************************************************/
 // Name
-potrf_zgemm(m, n, k) [ flops = inline_c %{ return FLOPS_ZGEMM(descA->mb, CLEAN_NB(descA, n), descA->nb); %}]
+potrf_zgemm(m, n, k) [ flops = inline_c %{ return FLOPS_ZGEMM(descA->mb, CLEAN_NB(descA, n), descA->nb); %}
+                       time_estimate = zgemm_time_estimate]

 // Execution space
 k = 0 .. descA->mt-3
@@ -421,7 +430,6 @@ RW C <- (k == 0) ? ddescA(m, n) [ type = %{ return ADTT_
 ; (n >= (descA->nt - PRI_CHANGE)) ? (descA->nt - n) * (descA->nt - n) * (descA->nt - n) + 3 * ((2 * descA->nt) - m - n - 3) * (n - m) + 6 * (n - k) : PRI_MAX

 BODY [type=CUDA
-      weight=(m+1-k)
       A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
       C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
@@ -538,6 +546,24 @@ END

 extern "C" %{

+/* Compute the time estimates based on device capabilities and flops for the task */
+static int64_t zpotrf_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
+    int mb = ((parsec_zpotrf_U_taskpool_t*)task->taskpool)->_g_descA->mb;
+    return (int64_t)FLOPS_ZPOTRF(mb) / dev->gflops_fp64;
+}
+static int64_t ztrsm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
+    int mb = ((parsec_zpotrf_U_taskpool_t*)task->taskpool)->_g_descA->mb;
+    return (int64_t)FLOPS_ZTRSM(PlasmaLeft, mb, mb) / dev->gflops_fp64;
+}
+static int64_t zherk_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
+    int mb = ((parsec_zpotrf_U_taskpool_t*)task->taskpool)->_g_descA->mb;
+    return (int64_t)FLOPS_ZHERK(mb, mb) / dev->gflops_fp64;
+}
+static int64_t zgemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev) {
+    int mb = ((parsec_zpotrf_U_taskpool_t*)task->taskpool)->_g_descA->mb;
+    return (int64_t)FLOPS_ZGEMM(mb, mb, mb) / dev->gflops_fp64;
+}
+
 /*
  * A function to recursively update the value of the INFO argument for
  * recursive calls. We need a special function because the recursive calls being asynchronous
diff --git a/src/ztrsm_LLN.jdf b/src/ztrsm_LLN.jdf
index d628ebd9..242fe6e4 100644
--- a/src/ztrsm_LLN.jdf
+++ b/src/ztrsm_LLN.jdf
@@ -89,8 +89,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), CLEAN_N
        -> (m>=(k+2)) ? E zgemm(k+1, m, n)
        -> ((k+1)==m) ? B ztrsm(m, n)

-BODY [type=CUDA
-      weight=(n+1-k)]
+BODY [type=CUDA]
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.);
diff --git a/src/ztrsm_LLT.jdf b/src/ztrsm_LLT.jdf
index cef908c4..666fc78a 100644
--- a/src/ztrsm_LLT.jdf
+++ b/src/ztrsm_LLT.jdf
@@ -90,8 +90,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(descB->mb, CLEAN_NB(descB,
        -> ((1+k)==m) ? B ztrsm(m, n)
        -> (m>=(k+2)) ? E zgemm(k+1, m, n)

-BODY [type=CUDA
-      weight=(n+1-k)]
+BODY [type=CUDA]
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.);
diff --git a/src/ztrsm_LUN.jdf b/src/ztrsm_LUN.jdf
index df6be09d..de922c9f 100644
--- a/src/ztrsm_LUN.jdf
+++ b/src/ztrsm_LUN.jdf
@@ -89,8 +89,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(descB->mb, CLEAN_NB(descB,
        -> ((1+k)==m) ? B ztrsm(m, n)
        -> (m>=(k+2)) ? E zgemm(k+1, m, n)

-BODY [type=CUDA
-      weight=(n+1-k)]
+BODY [type=CUDA]
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.);
diff --git a/src/ztrsm_LUT.jdf b/src/ztrsm_LUT.jdf
index c1de62f6..a156ab88 100644
--- a/src/ztrsm_LUT.jdf
+++ b/src/ztrsm_LUT.jdf
@@ -89,8 +89,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), CLEAN_N
        -> (m>=(k+2)) ? E zgemm(k+1, m, n)
        -> ((k+1)==m) ? B ztrsm(m, n)

-BODY [type=CUDA
-      weight=(n+1-k)]
+BODY [type=CUDA]
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.);
diff --git a/src/ztrsm_RLN.jdf b/src/ztrsm_RLN.jdf
index 632f88e1..9383c66f 100644
--- a/src/ztrsm_RLN.jdf
+++ b/src/ztrsm_RLN.jdf
@@ -89,8 +89,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), descB->
        -> (n>=(k+2)) ? E zgemm(k+1, m, n)
        -> ((k+1)==n) ? B ztrsm(n, m)

-BODY [type=CUDA
-      weight=(n+1-k)]
+BODY [type=CUDA]
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.);
diff --git a/src/ztrsm_RLT.jdf b/src/ztrsm_RLT.jdf
index d5550d05..33de63c4 100644
--- a/src/ztrsm_RLT.jdf
+++ b/src/ztrsm_RLT.jdf
@@ -88,8 +88,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), CLEAN_N
        -> ((1+k)==n) ? B ztrsm(n, m)
        -> (n>=(2+k)) ? E zgemm(k+1, m, n)

-BODY [type=CUDA
-      weight=(n+1-k)]
+BODY [type=CUDA]
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.);
diff --git a/src/ztrsm_RUN.jdf b/src/ztrsm_RUN.jdf
index ffc9f785..5bceec3b 100644
--- a/src/ztrsm_RUN.jdf
+++ b/src/ztrsm_RUN.jdf
@@ -89,8 +89,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), CLEAN_N
        -> ((1+k)==n) ? B ztrsm(n, m)
        -> (n>=(2+k)) ? E zgemm(k+1, m, n)

-BODY [type=CUDA
-      weight=(n+1-k)]
+BODY [type=CUDA]
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex mzone = make_cuDoubleComplex(-1., 0.);
diff --git a/src/ztrsm_RUT.jdf b/src/ztrsm_RUT.jdf
index bb68a1e1..1d028794 100644
--- a/src/ztrsm_RUT.jdf
+++ b/src/ztrsm_RUT.jdf
@@ -88,8 +88,7 @@ zgemm(k,m,n) [ flops = inline_c%{ return FLOPS_ZGEMM(CLEAN_MB(descB, m), descB->
        -> (n>=(k+2)) ? E zgemm(k+1, m, n)
        -> ((k+1)==n) ? B ztrsm(n, m)

-BODY [type=CUDA
-      weight=(n+1-k)]
+BODY [type=CUDA]
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.);
diff --git a/tests/testing_zpotrf_dtd.c b/tests/testing_zpotrf_dtd.c
index 08beebe8..2192321c 100644
--- a/tests/testing_zpotrf_dtd.c
+++ b/tests/testing_zpotrf_dtd.c
@@ -157,7 +157,7 @@ int
 parsec_core_cuda_gemm(parsec_execution_stream_t *es, parsec_task_t *this_task)
 {
     parsec_gpu_task_t *gpu_task;
-    double ratio;
+    int64_t task_load;
     int dev_index;
     int transA, transB;
     int m, n, k, lda, ldb, ldc;
@@ -169,8 +169,7 @@ parsec_core_cuda_gemm(parsec_execution_stream_t *es, parsec_task_t *this_task)
     parsec_dtd_unpack_args(this_task, &transA, &transB, &m, &n, &k, &alpha, &A, &lda, &B, &ldb, &beta, &C, &ldc);

-    ratio = ((m + 1) - k);
-    dev_index = parsec_get_best_device((parsec_task_t *) this_task, ratio);
+    dev_index = parsec_get_best_device((parsec_task_t *) this_task, &task_load);
     assert(dev_index >= 0);
     if (dev_index < 2) {
         /* Fallback to the CPU only version */
@@ -182,7 +181,7 @@ parsec_core_cuda_gemm(parsec_execution_stream_t *es, parsec_task_t *this_task)
     gpu_task->ec = (parsec_task_t *) this_task;
     gpu_task->submit = &gpu_kernel_submit_dpotrf_U_potrf_dgemm;
     gpu_task->task_type = 0;
-    gpu_task->load = ratio * parsec_device_sweight[dev_index];
+    gpu_task->load = task_load;
     gpu_task->last_data_check_epoch = -1; /* force at least one validation for the task */
     gpu_task->pushout = 0;
     gpu_task->flow[0] = NULL; /*&flow_of_dpotrf_U_potrf_dgemm_for_C;*/
@@ -191,7 +190,6 @@ parsec_core_cuda_gemm(parsec_execution_stream_t *es, parsec_task_t *this_task)
     }
     gpu_task->flow[1] = NULL; /* &flow_of_dpotrf_U_potrf_dgemm_for_A; */
     gpu_task->flow[2] = NULL; /* &flow_of_dpotrf_U_potrf_dgemm_for_B; */
-    parsec_device_load[dev_index] += gpu_task->load;
     (void)es;

     return parsec_cuda_kernel_scheduler(es, gpu_task, dev_index);
diff --git a/tools/PrecisionGenerator/subs.py b/tools/PrecisionGenerator/subs.py
index d5ed67ac..0d81f61a 100644
--- a/tools/PrecisionGenerator/subs.py
+++ b/tools/PrecisionGenerator/subs.py
@@ -282,8 +282,9 @@
     ('example_s', 'example_d', 'example_c', 'example_z' ),
     ('FLOPS_SSY', 'FLOPS_DSY', 'FLOPS_CHE', 'FLOPS_ZHE' ),
     ('FLOPS_S', 'FLOPS_D', 'FLOPS_C', 'FLOPS_Z' ),
+    ('gflops_fp32', 'gflops_fp64', 'gflops_fp32', 'gflops_fp64'),
     ('lapack_s', 'lapack_d', 'lapack_c', 'lapack_z' ),
-    ('LAPACKE_s', 'LAPACKE_d', 'LAPACKE_c', 'LAPACKE_z' ),
+    ('LAPACKE_s', 'LAPACKE_d', 'LAPACKE_c', 'LAPACKE_z' ),
     ('PLASMA_s', 'PLASMA_d', 'PLASMA_c', 'PLASMA_z' ),
     ('PLASMA_S', 'PLASMA_D', 'PLASMA_C', 'PLASMA_Z' ),
     ('plasma_s', 'plasma_d', 'plasma_c', 'plasma_z' ),