deepmodeling · mohanchen · Nov 13, 2024 · Nov 9, 2024 · Nov 11, 2024 · Nov 12, 2024
diff --git a/source/module_lr/dm_trans/dm_trans_parallel.cpp b/source/module_lr/dm_trans/dm_trans_parallel.cpp
@@ -109,7 +109,7 @@ std::vector<container::Tensor> cal_dm_trans_pblas(const std::complex<double>* X_
         //     &beta, dm_trans[isk].data<std::complex<double>>(), &i1, &i1, pmat.desc);
 
         // ============== [C_virt * X * C_occ^\dagger]^T=============
-        // ============== = [C_occ^* * X^T * C_virt^T]^T=============
+        // ============== = [C_occ^* * X^T * C_virt^T]=============
         // 1. X*C_occ^\dagger
         char transa = 'N';
         char transb = 'C';

diff --git a/source/module_lr/esolver_lrtd_lcao.cpp b/source/module_lr/esolver_lrtd_lcao.cpp
@@ -16,6 +16,7 @@
 #include "module_base/scalapack_connector.h"
 #include "module_parameter/parameter.h"
 #include "module_lr/ri_benchmark/ri_benchmark.h"
+#include "module_lr/operator_casida/operator_lr_diag.h" // for precondition
 
 #ifdef __EXX
 template<>
@@ -444,20 +445,29 @@ void LR::ESolver_LR<T, TR>::runner(int istep, UnitCell& cell)
                 if (GlobalV::MY_RANK == 0) { assert(nst == LR_Util::write_value(efile(label), prec, e, nst)); }
                 assert(nst * dim == LR_Util::write_value(vfile(label), prec, v, nst, dim));
             };
+        std::vector<double> precondition(this->input.lr_solver == "lapack" ? 0 : nloc_per_band, 1.0);
         // allocate and initialize A matrix and density matrix
         if (openshell)
         {
+            for (int is : {0, 1})
+            {
+                const int offset_is = is * this->paraX_[0].get_local_size();
+                OperatorLRDiag<double> pre_op(this->eig_ks.c + is * nk * (nocc[0] + nvirt[0]), this->paraX_[is], this->nk, this->nocc[is], this->nvirt[is]);
+                if (input.lr_solver != "lapack") { pre_op.act(1, offset_is, 1, precondition.data() + offset_is, precondition.data() + offset_is); }
+            }
             std::cout << "Solving spin-conserving excitation for open-shell system." << std::endl;
             HamiltULR<T> hulr(xc_kernel, nspin, this->nbasis, this->nocc, this->nvirt, this->ucell, orb_cutoff_, GlobalC::GridD, *this->psi_ks, this->eig_ks,
 #ifdef __EXX
                 this->exx_lri, this->exx_info.info_global.hybrid_alpha,
 #endif
                 this->gint_, this->pot, this->kv, this->paraX_, this->paraC_, this->paraMat_);
-            LR::HSolver::solve(hulr, this->X[0].template data<T>(), nloc_per_band, nstates, this->pelec->ekb.c, this->input.lr_solver, this->input.lr_thr);
+            LR::HSolver::solve(hulr, this->X[0].template data<T>(), nloc_per_band, nstates, this->pelec->ekb.c, this->input.lr_solver, this->input.lr_thr, precondition);
             if (input.out_wfc_lr) { write_states("openshell", this->pelec->ekb.c, this->X[0].template data<T>(), nloc_per_band, nstates); }
         }
         else
         {
+            OperatorLRDiag<double> pre_op(this->eig_ks.c, this->paraX_[0], this->nk, this->nocc[0], this->nvirt[0]);
+            if (input.lr_solver != "lapack") { pre_op.act(1, nloc_per_band, 1, precondition.data(), precondition.data()); }
             auto spin_types = std::vector<std::string>({ "singlet", "triplet" });
             for (int is = 0;is < nspin;++is)
             {
@@ -470,7 +480,7 @@ void LR::ESolver_LR<T, TR>::runner(int istep, UnitCell& cell)
                     spin_types[is], input.ri_hartree_benchmark, (input.ri_hartree_benchmark == "aims" ? input.aims_nbasis : std::vector<int>({})));
                 // solve the Casida equation
                 LR::HSolver::solve(hlr, this->X[is].template data<T>(), nloc_per_band, nstates,
-                    this->pelec->ekb.c + is * nstates, this->input.lr_solver, this->input.lr_thr/*,
+                    this->pelec->ekb.c + is * nstates, this->input.lr_solver, this->input.lr_thr, precondition/*,
                         !std::set<std::string>({ "hf", "hse" }).count(this->xc_kernel)*/);  //whether the kernel is Hermitian
                 if (input.out_wfc_lr) { write_states(spin_types[is], this->pelec->ekb.c + is * nstates, this->X[is].template data<T>(), nloc_per_band, nstates); }
             }
@@ -587,8 +597,9 @@ void LR::ESolver_LR<T, TR>::set_X_initial_guess()
             const int is_in_x = openshell ? 0 : is;     // if openshell, spin-up and spin-down are put together
             if (px.in_this_processor(virt_global, occ_global))
             {
+                const int xstart_pair = ik * px.get_local_size();
                 const int ipair_loc = px.global2local_col(occ_global) * px.get_row_size() + px.global2local_row(virt_global);
-                X[is_in_x].data<T>()[xstart_bs + ipair_loc] = (static_cast<T>(1.0) / static_cast<T>(nk));
+                X[is_in_x].data<T>()[xstart_bs + xstart_pair + ipair_loc] = (static_cast<T>(1.0) / static_cast<T>(nk));
             }
         }
     }

diff --git a/source/module_lr/hsolver_lrtd.hpp b/source/module_lr/hsolver_lrtd.hpp
@@ -32,15 +32,15 @@ namespace LR
             double* eig,
             const std::string method,
             const Real<T>& diag_ethr, ///< threshold for diagonalization
+            const std::vector<Real<T>>& precondition,
             const bool hermitian = true)
         {
             ModuleBase::TITLE("HSolverLR", "solve");
             const std::vector<std::string> spin_types = { "singlet", "triplet" };
             // note: if not TDA, the eigenvalues will be complex
             // then we will need a new constructor of DiagoDavid
 
-            // 1. allocate precondition and eigenvalue
-            std::vector<Real<T>> precondition(dim);
+            // 1. allocate eigenvalue
             std::vector<Real<T>> eigenvalue(nband);   //nstates
             // 2. select the method
 #ifdef __MPI
@@ -67,9 +67,7 @@ namespace LR
             }
             else
             {
-                // 3. set precondition and diagethr
-                for (int i = 0; i < dim; ++i) { precondition[i] = static_cast<Real<T>>(1.0); }
-
+                // 3. set maxiter and funcs
                 const int maxiter = hsolver::DiagoIterAssist<T>::PW_DIAG_NMAX;
 
                 auto hpsi_func = [&hm](T* psi_in, T* hpsi, const int ld_psi, const int nvec) {hm.hPsi(psi_in, hpsi, ld_psi, nvec);};
@@ -139,7 +137,8 @@ namespace LR
 
                     auto psi_tensor = ct::TensorMap(psi, ct::DataTypeToEnum<T>::value, ct::DeviceType::CpuDevice, ct::TensorShape({ nband, dim }));
                     auto eigen_tensor = ct::TensorMap(eigenvalue.data(), ct::DataTypeToEnum<Real<T>>::value, ct::DeviceType::CpuDevice, ct::TensorShape({ nband }));
-                    auto precon_tensor = ct::TensorMap(precondition.data(), ct::DataTypeToEnum<Real<T>>::value, ct::DeviceType::CpuDevice, ct::TensorShape({ dim }));
+                    std::vector<Real<T>> precondition_(precondition);   //since TensorMap does not support const pointer
+                    auto precon_tensor = ct::TensorMap(precondition_.data(), ct::DataTypeToEnum<Real<T>>::value, ct::DeviceType::CpuDevice, ct::TensorShape({ dim }));
                     auto hpsi_func = [&hm](const ct::Tensor& psi_in, ct::Tensor& hpsi) {hm.hPsi(psi_in.data<T>(), hpsi.data<T>(), psi_in.shape().dim_size(0) /*nbasis_local*/, 1/*band-by-band*/);};
                     auto spsi_func = [&hm](const ct::Tensor& psi_in, ct::Tensor& spsi)
                         { std::memcpy(spsi.data<T>(), psi_in.data<T>(), sizeof(T) * psi_in.NumElements()); };

diff --git a/source/module_lr/operator_casida/operator_lr_diag.h b/source/module_lr/operator_casida/operator_lr_diag.h
@@ -46,9 +46,8 @@ namespace LR
             const bool is_first_node = false)const override
         {
             ModuleBase::TITLE("OperatorLRDiag", "act");
-            const int nlocal_ph = nk * pX.get_local_size();   // local size of particle-hole basis
             hsolver::vector_mul_vector_op<T, Device>()(this->ctx,
-                nk * pX.get_local_size(),
+                nk * pX.get_local_size(),   // local size of particle-hole basis
                 hpsi,
                 psi_in,
                 this->eig_ks_diff.c);

diff --git a/source/module_lr/operator_casida/operator_lr_exx.cpp b/source/module_lr/operator_casida/operator_lr_exx.cpp
@@ -27,6 +27,9 @@ namespace LR
     void OperatorLREXX<double>::cal_DM_onebase(const int io, const int iv, const int ik) const
     {
         ModuleBase::TITLE("OperatorLREXX", "cal_DM_onebase");
+        // NOTICE: DM_onebase will be passed into `cal_energy` interface and conjugated by "zdotc". 
+        // So the formula should be the same as RHS. instead of LHS of the A-matrix, 
+        // i.e. c1v · conj(c2o) ·  e^{-ik(R2-R1)}
         assert(ik == 0);
         for (auto cell : this->BvK_cells)
         {
@@ -42,8 +45,12 @@ namespace LR
                             const int nw2 = aims_nbasis.empty() ? ucell.atoms[it2].nw : aims_nbasis[it2];
                             for (int iw1 = 0;iw1 < nw1;++iw1)
                                 for (int iw2 = 0;iw2 < nw2;++iw2)
-                                    if (this->pmat.in_this_processor(ucell.itiaiw2iwt(it1, ia1, iw1), ucell.itiaiw2iwt(it2, ia2, iw2)))
-                                        D2d(iw1, iw2) = this->psi_ks_full(ik, io, ucell.itiaiw2iwt(it1, ia1, iw1)) * this->psi_ks_full(ik, nocc + iv, ucell.itiaiw2iwt(it2, ia2, iw2));
+                                {
+                                    const int iwt1 = ucell.itiaiw2iwt(it1, ia1, iw1);
+                                    const int iwt2 = ucell.itiaiw2iwt(it2, ia2, iw2);
+                                    if (this->pmat.in_this_processor(iwt1, iwt2))
+                                        D2d(iw1, iw2) = this->psi_ks_full(ik, io, iwt1) * this->psi_ks_full(ik, nocc + iv, iwt2);
+                                }
                         }
         }
     }
@@ -52,6 +59,9 @@ namespace LR
     void OperatorLREXX<std::complex<double>>::cal_DM_onebase(const int io, const int iv, const int ik) const
     {
         ModuleBase::TITLE("OperatorLREXX", "cal_DM_onebase");
+        // NOTICE: DM_onebase will be passed into `cal_energy` interface and conjugated by "zdotc". 
+        // So the formula should be the same as RHS. instead of LHS of the A-matrix, 
+        // i.e. c1v · conj(c2o) ·  e^{-ik(R2-R1)}
         for (auto cell : this->BvK_cells)
         {
             std::complex<double> frac = RI::Global_Func::convert<std::complex<double>>(std::exp(
@@ -68,8 +78,12 @@ namespace LR
                             const int nw2 = aims_nbasis.empty() ? ucell.atoms[it2].nw : aims_nbasis[it2];
                             for (int iw1 = 0;iw1 < nw1;++iw1)
                                 for (int iw2 = 0;iw2 < nw2;++iw2)
-                                    if (this->pmat.in_this_processor(ucell.itiaiw2iwt(it1, ia1, iw1), ucell.itiaiw2iwt(it2, ia2, iw2)))
-                                        D2d(iw1, iw2) = frac * std::conj(this->psi_ks_full(ik, io, ucell.itiaiw2iwt(it1, ia1, iw1))) * this->psi_ks_full(ik, nocc + iv, ucell.itiaiw2iwt(it2, ia2, iw2));
+                                {
+                                    const int iwt1 = ucell.itiaiw2iwt(it1, ia1, iw1);
+                                    const int iwt2 = ucell.itiaiw2iwt(it2, ia2, iw2);
+                                    if (this->pmat.in_this_processor(iwt1, iwt2))
+                                        D2d(iw1, iw2) = frac * std::conj(this->psi_ks_full(ik, io, iwt2)) * this->psi_ks_full(ik, nocc + iv, iwt1);
+                                }
                         }
         }
     }
@@ -78,9 +92,6 @@ namespace LR
     void OperatorLREXX<T>::act(const int nbands, const int nbasis, const int npol, const T* psi_in, T* hpsi, const int ngk_ik, const bool is_first_node)const
     {
         ModuleBase::TITLE("OperatorLREXX", "act");
-
-        const int& nk = this->kv.get_nks() / this->nspin;
-
         // convert parallel info to LibRI interfaces
         std::vector<std::tuple<std::set<TA>, std::set<TA>>> judge = RI_2D_Comm::get_2D_judge(this->pmat);
 
@@ -120,12 +131,12 @@ namespace LR
                 {
                     const int xstart_bk = ik * pX.get_local_size();
                     this->cal_DM_onebase(io, iv, ik);       //set Ds_onebase for all e-h pairs (not only on this processor)
-                    // LR_Util::print_CV(Ds_onebase[is], "Ds_onebase of occ " + std::to_string(io) + ", virtual " + std::to_string(iv) + " in OperatorLREXX", 1e-10);
+                    // LR_Util::print_CV(Ds_onebase, "Ds_onebase of occ " + std::to_string(io) + ", virtual " + std::to_string(iv) + " in OperatorLREXX", 1e-10);
                     const T& ene = 2 * alpha * //minus for exchange(but here plus is right, why?), 2 for Hartree to Ry
                         lri->exx_lri.post_2D.cal_energy(this->Ds_onebase, lri->Hexxs[0]);
                     if (this->pX.in_this_processor(iv, io))
                     {
-                        hpsi[xstart_bk + ik * pX.get_local_size() + this->pX.global2local_col(io) * this->pX.get_row_size() + this->pX.global2local_row(iv)] += ene;
+                        hpsi[xstart_bk + this->pX.global2local_col(io) * this->pX.get_row_size() + this->pX.global2local_row(iv)] += ene;
                     }
                 }
             }

diff --git a/source/module_lr/operator_casida/operator_lr_exx.h b/source/module_lr/operator_casida/operator_lr_exx.h
@@ -32,7 +32,7 @@ namespace LR
             const Parallel_Orbitals& pmat_in,
             const double& alpha = 1.0,
             const std::vector<int>& aims_nbasis = {})
-            : nspin(nspin), naos(naos), nocc(nocc), nvirt(nvirt),
+            : nspin(nspin), naos(naos), nocc(nocc), nvirt(nvirt), nk(kv_in.get_nks() / nspin),
             psi_ks(psi_ks_in), DM_trans(DM_trans_in), exx_lri(exx_lri_in), kv(kv_in),
             pX(pX_in), pc(pc_in), pmat(pmat_in), ucell(ucell_in), alpha(alpha),
             aims_nbasis(aims_nbasis)
@@ -42,8 +42,11 @@ namespace LR
             this->is_first_node = false;
 
             // reduce psi_ks for later use
-            this->psi_ks_full.resize(this->kv.get_nks(), nocc + nvirt, this->naos);
-            LR_Util::gather_2d_to_full(this->pc, this->psi_ks.get_pointer(), this->psi_ks_full.get_pointer(), false, this->naos, nocc + nvirt);
+            this->psi_ks_full.resize(this->nk, nocc + nvirt, this->naos);
+            for (int ik = 0;ik < nk;++ik)
+            {
+                LR_Util::gather_2d_to_full(this->pc, &this->psi_ks(ik, 0, 0), &this->psi_ks_full(ik, 0, 0), false, this->naos, nocc + nvirt);
+            }
 
             // get cells in BvK supercell
             const TC period = RI_Util::get_Born_vonKarmen_period(kv_in);
@@ -63,12 +66,13 @@ namespace LR
                          const int ngk_ik = 0,
                          const bool is_first_node = false) const override;
 
-      private:
+    private:
         //global sizes
-        const int& nspin;
-        const int& naos;
-        const int& nocc;
-        const int& nvirt;
+        const int nspin = 1;
+        const int naos = 1;
+        const int nocc = 1;
+        const int nvirt = 1;
+        const int nk = 1;  ///< number of k-points
         const double alpha = 1.0;   //(allow non-ref constant)
         const bool cal_dm_trans = false;
         const bool tdm_sym = false; ///< whether transition density matrix is symmetric

diff --git a/tests/integrate/291_NO_KP_LR/result.ref b/tests/integrate/291_NO_KP_LR/result.ref
@@ -1 +1 @@
-totexcitationenergyref 0.784471
+totexcitationenergyref 0.784274
diff --git a/tests/integrate/391_NO_GO_LR/result.ref b/tests/integrate/391_NO_GO_LR/result.ref
@@ -1 +1 @@
-totexcitationenergyref 2.641295
+totexcitationenergyref 2.641190
diff --git a/tests/integrate/392_NO_GO_LR_HF/result.ref b/tests/integrate/392_NO_GO_LR_HF/result.ref
@@ -1 +1 @@
-totexcitationenergyref 3.067979
+totexcitationenergyref 3.067058