diff --git a/examples/recursive_matmul.cxx b/examples/recursive_matmul.cxx index 7f198160..daf63fea 100644 --- a/examples/recursive_matmul.cxx +++ b/examples/recursive_matmul.cxx @@ -22,7 +22,7 @@ void recursive_matmul(int n, MPI_Comm_rank(pcomm, &rank); MPI_Comm_size(pcomm, &num_pes); - if (num_pes == 1){ + if (num_pes == 1 || m == 1 || n == 1 || k==1){ C["ij"] += 1.0*A["ik"]*B["kj"]; } else { for (div=2; num_pes%div!=0; div++){} diff --git a/scalapack_tests/svd.cxx b/scalapack_tests/svd.cxx index 434fa097..74e018e9 100644 --- a/scalapack_tests/svd.cxx +++ b/scalapack_tests/svd.cxx @@ -30,9 +30,9 @@ bool svd(Matrix A, bool pass_orthogonality = true; - double nrm; - E.norm2(nrm); - if (nrm > m*n*1.E-6){ + double nrm1, nrm2, nrm3; + E.norm2(nrm1); + if (nrm1 > m*n*1.E-6){ pass_orthogonality = false; } @@ -40,16 +40,16 @@ bool svd(Matrix A, E["ij"] -= VT["ik"]*conj(VT)["jk"]; - E.norm2(nrm); - if (nrm > m*n*1.E-6){ + E.norm2(nrm2); + if (nrm2 > m*n*1.E-6){ pass_orthogonality = false; } A["ij"] -= U["ik"]*S["k"]*VT["kj"]; bool pass_residual = true; - A.norm2(nrm); - if (nrm > m*n*n*1.E-6){ + A.norm2(nrm3); + if (nrm3 > m*n*n*1.E-6){ pass_residual = false; } @@ -57,6 +57,12 @@ bool svd(Matrix A, if (dw.rank == 0){ printf("SVD orthogonality check returned %d, residual check %d\n", pass_orthogonality, pass_residual); } +#else + if (!pass_residual || ! pass_orthogonality){ + if (dw.rank == 0){ + printf("SVD orthogonality check returned %d (%lf, %lf), residual check %d (%lf)\n", pass_orthogonality, nrm1, nrm2, pass_residual, nrm3); + } + } #endif return pass_residual & pass_orthogonality; } diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx index cc6603ba..06d37bb7 100644 --- a/src/contraction/contraction.cxx +++ b/src/contraction/contraction.cxx @@ -2475,14 +2475,12 @@ namespace CTF_int { #else for (int t=global_comm.rank+1; t<(int)wrld->topovec.size()+8; t+=global_comm.np){ #endif - TAU_FSTART(evaluate_mappings_clear_and_init); A->clear_mapping(); B->clear_mapping(); C->clear_mapping(); A->set_padding(); B->set_padding(); C->set_padding(); - TAU_FSTOP(evaluate_mappings_clear_and_init); topology * topo_i = NULL; if (t < 8){ @@ -2511,9 +2509,7 @@ namespace CTF_int { } else topo_i = wrld->topovec[t-8]; ASSERT(topo_i != NULL); - TAU_FSTART(map_ctr_to_topo); ret = map_to_topology(topo_i, j); - TAU_FSTOP(map_ctr_to_topo); if (ret == NEGATIVE){ //printf("map_to_topology returned negative\n"); @@ -2527,18 +2523,13 @@ namespace CTF_int { B->topo = topo_i; C->topo = topo_i; - TAU_FSTART(check_ctr_mapping); if (check_mapping() == 0){ - TAU_FSTOP(check_ctr_mapping); continue; } - TAU_FSTOP(check_ctr_mapping); est_time = 0.0; - TAU_FSTART(evaluate_mappings_set_padding2); A->set_padding(); B->set_padding(); C->set_padding(); - TAU_FSTOP(evaluate_mappings_set_padding2); #if DEBUG >= 3 if (global_comm.rank == 0){ printf("\nTest mappings:\n"); @@ -2565,8 +2556,14 @@ namespace CTF_int { } nnz_frac_C = std::min(1.,std::max(nnz_frac_C,nnz_frac_A*nnz_frac_B*len_ctr)); } + // check this early on to avoid 64-bit integer overflow + double size_memuse = A->size*nnz_frac_A*A->sr->el_size + B->size*nnz_frac_B*B->sr->el_size + C->size*nnz_frac_C*C->sr->el_size; + if (size_memuse >= (double)max_memuse){ + if (global_comm.rank == 0) + DPRINTF(1,"Not enough memory available for topo %d with order %d to store tensors %ld/%ld\n", t,j,(int64_t)size_memuse,max_memuse); + continue; + } - TAU_FSTART(evaluate_mappings_folding); #if FOLD_TSR if (can_fold()){ est_time = est_time_fold(); @@ -2589,7 +2586,6 @@ namespace CTF_int { est_time = sctr->est_time_rec(sctr->num_lyr); } } - TAU_FSTOP(evaluate_mappings_folding); #if DEBUG >= 3 if (global_comm.rank == 0){ printf("mapping passed contr est_time = %E sec\n", est_time); @@ -2600,7 +2596,6 @@ namespace CTF_int { need_remap_A = 0; need_remap_B = 0; need_remap_C = 0; - TAU_FSTART(evaluate_mappings_comp_maps); if (topo_i == old_topo_A){ for (d=0; dorder; d++){ if (!comp_dim_map(&A->edge_map[d],&old_map_A[d])) @@ -2631,8 +2626,6 @@ namespace CTF_int { } } else need_remap_C = 1; - TAU_FSTOP(evaluate_mappings_comp_maps); - TAU_FSTART(est_ctr_map_time); if (need_remap_C) { est_time += 2.*C->est_redist_time(*dC, nnz_frac_C); memuse = std::max(1.0*memuse,2.*C->get_redist_mem(*dC, nnz_frac_C)); @@ -2646,20 +2639,16 @@ namespace CTF_int { printf("total (with redistribution and transp) est_time = %E\n", est_time); } #endif - TAU_FSTOP(est_ctr_map_time); ASSERT(est_time >= 0.0); - TAU_FSTART(get_avail_res); if ((int64_t)memuse >= max_memuse){ if (global_comm.rank == 0) DPRINTF(1,"Not enough memory available for topo %d with order %d memory %ld/%ld\n", t,j,memuse,max_memuse); - TAU_FSTOP(get_avail_res); delete sctr; continue; } if ((!A->is_sparse && A->size > INT_MAX) ||(!B->is_sparse && B->size > INT_MAX) || (!C->is_sparse && C->size > INT_MAX)){ DPRINTF(1,"MPI does not handle enough bits for topo %d with order\n", j); - TAU_FSTOP(get_avail_res); delete sctr; continue; } @@ -2670,7 +2659,6 @@ namespace CTF_int { btopo = 6*t+j; } delete sctr; - TAU_FSTOP(get_avail_res); } } TAU_FSTOP(evaluate_mappings) @@ -3509,7 +3497,27 @@ namespace CTF_int { } #endif - + if (blk_sz_A < vrt_sz_A){ + printf("blk_sz_A = %ld, vrt_sz_A = %ld\n", blk_sz_A, vrt_sz_A); + printf("sizes are %ld %ld %ld\n",A->size,B->size,C->size); + A->print_map(stdout, 0); + B->print_map(stdout, 0); + C->print_map(stdout, 0); + } + if (blk_sz_B < vrt_sz_B){ + printf("blk_sz_B = %ld, vrt_sz_B = %ld\n", blk_sz_B, vrt_sz_B); + printf("sizes are %ld %ld %ld\n",A->size,B->size,C->size); + A->print_map(stdout, 0); + B->print_map(stdout, 0); + C->print_map(stdout, 0); + } + if (blk_sz_C < vrt_sz_C){ + printf("blk_sz_C = %ld, vrt_sz_C = %ld\n", blk_sz_C, vrt_sz_C); + printf("sizes are %ld %ld %ld\n",A->size,B->size,C->size); + A->print_map(stdout, 0); + B->print_map(stdout, 0); + C->print_map(stdout, 0); + } ASSERT(blk_sz_A >= vrt_sz_A); ASSERT(blk_sz_B >= vrt_sz_B); ASSERT(blk_sz_C >= vrt_sz_C); diff --git a/src/contraction/ctr_2d_general.cxx b/src/contraction/ctr_2d_general.cxx index 90b016b5..1c5909fb 100755 --- a/src/contraction/ctr_2d_general.cxx +++ b/src/contraction/ctr_2d_general.cxx @@ -47,7 +47,7 @@ namespace CTF_int { int & load_phase_C){ mapping * map; int j; - int nstep = 1; + int64_t nstep = 1; if (comp_dim_map(&C->edge_map[i_C], &B->edge_map[i_B])){ map = &B->edge_map[i_B]; while (map->has_child) map = map->child; @@ -111,16 +111,24 @@ namespace CTF_int { /virt_blk_len_C[j]; } if (B->edge_map[i_B].type != PHYSICAL_MAP){ + if (blk_sz_B / nstep == 0) + printf("blk_len_B[%d] = %d, nstep = %ld blk_sz_B = %ld\n",i_B,blk_len_B[i_B],nstep,blk_sz_B); blk_sz_B = blk_sz_B / nstep; blk_len_B[i_B] = blk_len_B[i_B] / nstep; } else { + if (blk_sz_B * B->edge_map[i_B].np/ nstep == 0) + printf("blk_len_B[%d] = %d B->edge_map[%d].np = %d, nstep = %ld blk_sz_B = %ld\n",i_B,blk_len_B[i_B],i_B,B->edge_map[i_B].np,nstep,blk_sz_B); blk_sz_B = blk_sz_B * B->edge_map[i_B].np / nstep; blk_len_B[i_B] = blk_len_B[i_B] * B->edge_map[i_B].np / nstep; } if (C->edge_map[i_C].type != PHYSICAL_MAP){ + if (blk_sz_C / nstep == 0) + printf("blk_len_C[%d] = %d, nstep = %ld blk_sz_C = %ld\n",i_C,blk_len_C[i_C],nstep,blk_sz_C); blk_sz_C = blk_sz_C / nstep; blk_len_C[i_C] = blk_len_C[i_C] / nstep; } else { + if (blk_sz_C * C->edge_map[i_C].np/ nstep == 0) + printf("blk_len_C[%d] = %d C->edge_map[%d].np = %d, nstep = %ld blk_sz_C = %ld\n",i_C,blk_len_C[i_C],i_C,C->edge_map[i_C].np,nstep,blk_sz_C); blk_sz_C = blk_sz_C * C->edge_map[i_C].np / nstep; blk_len_C[i_C] = blk_len_C[i_C] * C->edge_map[i_C].np / nstep; } diff --git a/src/interface/matrix.cxx b/src/interface/matrix.cxx index e041d2f0..f68ceeed 100644 --- a/src/interface/matrix.cxx +++ b/src/interface/matrix.cxx @@ -213,8 +213,8 @@ namespace CTF { if (lda == nrow/pc){ memcpy(this->data, (char*)data_, sizeof(dtype)*this->size); } else { - for (int i=0; idata+i*lda*sizeof(dtype),(char*)(data_+i*lda), nrow*sizeof(dtype)/pr); + for (int64_t i=0; idata+i*lda*sizeof(dtype),(char*)(data_+i*lda), ((int64_t)nrow)*sizeof(dtype)/pr); } } } else { @@ -258,7 +258,7 @@ namespace CTF { if (lda == nrow/pc){ memcpy((char*)data_, this->data, sizeof(dtype)*this->size); } else { - for (int i=0; idata+i*lda*sizeof(dtype), nrow*sizeof(dtype)/pr); } } @@ -437,11 +437,11 @@ namespace CTF { this->read_mat(desca, A); - dtype * tau = (dtype*)malloc(n*sizeof(dtype)); + dtype * tau = (dtype*)malloc(((int64_t)n)*sizeof(dtype)); dtype dlwork; CTF_SCALAPACK::pgeqrf(m,n,A,1,1,desca,tau,(dtype*)&dlwork,-1,&info); int lwork = get_int_fromreal(dlwork); - dtype * work = (dtype*)malloc(lwork*sizeof(dtype)); + dtype * work = (dtype*)malloc(((int64_t)lwork)*sizeof(dtype)); CTF_SCALAPACK::pgeqrf(m,n,A,1,1,desca,tau,work,lwork,&info); @@ -453,14 +453,14 @@ namespace CTF { R = Matrix(Q); else { R = Matrix(desca,dQ,*this->wrld,*this->sr); - R = R.slice(0,m*(n-1)+n-1); + R = R.slice(0,((int64_t)m)*(n-1)+n-1); } free(work); CTF_SCALAPACK::porgqr(m,n,n,dQ,1,1,desca,tau,(dtype*)&dlwork,-1,&info); lwork = get_int_fromreal(dlwork); - work = (dtype*)malloc(lwork*sizeof(dtype)); + work = (dtype*)malloc(((int64_t)lwork)*sizeof(dtype)); CTF_SCALAPACK::porgqr(m,n,n,dQ,1,1,desca,tau,work,lwork,&info); Q = Matrix(desca, dQ, (*(this->wrld))); free(work); @@ -502,14 +502,16 @@ namespace CTF { int64_t kpr = k/pr + (k % pr != 0); int64_t kpc = k/pc + (k % pc != 0); int64_t npc = n/pc + (n % pc != 0); + CTF_SCALAPACK::cdescinit(descu, m, k, 1, 1, 0, 0, ictxt, mpr, &info); CTF_SCALAPACK::cdescinit(descvt, k, n, 1, 1, 0, 0, ictxt, kpr, &info); - dtype * A = (dtype*)malloc(this->size*sizeof(dtype)); + + dtype * A = (dtype*)CTF_int::alloc(this->size*sizeof(dtype)); - dtype * u = (dtype*)new dtype[mpr*kpc]; - dtype * s = (dtype*)new dtype[k]; - dtype * vt = (dtype*)new dtype[kpr*npc]; + dtype * u = (dtype*)CTF_int::alloc(sizeof(dtype)*mpr*kpc); + dtype * s = (dtype*)CTF_int::alloc(sizeof(dtype)*k); + dtype * vt = (dtype*)CTF_int::alloc(sizeof(dtype)*kpr*npc); this->read_mat(desca, A); int lwork; @@ -517,7 +519,7 @@ namespace CTF { CTF_SCALAPACK::pgesvd('V', 'V', m, n, NULL, 1, 1, desca, NULL, NULL, 1, 1, descu, vt, 1, 1, descvt, &dlwork, -1, &info); lwork = get_int_fromreal(dlwork); - dtype * work = (dtype*)malloc(sizeof(dtype)*lwork); + dtype * work = (dtype*)CTF_int::alloc(sizeof(dtype)*((int64_t)lwork)); CTF_SCALAPACK::pgesvd('V', 'V', m, n, A, 1, 1, desca, s, u, 1, 1, descu, vt, 1, 1, descvt, work, lwork, &info); @@ -537,18 +539,18 @@ namespace CTF { } if (rank > 0 && rank < k) { S = S.slice(0, rank-1); - U = U.slice(0, rank*(m)-1); - VT = VT.slice(0, k*n-(k-rank+1)); + U = U.slice(0, rank*((int64_t)m)-1); + VT = VT.slice(0, k*((int64_t)n)-(k-rank+1)); } - free(A); - delete [] u; - delete [] s; - delete [] vt; + CTF_int::cdealloc(A); + CTF_int::cdealloc(u); + CTF_int::cdealloc(s); + CTF_int::cdealloc(vt); free(desca); free(descu); free(descvt); - free(work); + CTF_int::cdealloc(work); }