From 9bebb83af2d938471572f5cace79bf8ed90af6d9 Mon Sep 17 00:00:00 2001
From: kylebystrom <kylebystrom@gmail.com>
Date: Wed, 22 May 2024 14:07:35 -0400
Subject: [PATCH] start smooth vi, l=1 still not working

---
 ciderpress/dft/lcao_convolutions.py     | 154 ++++++--
 ciderpress/dft/lcao_nldf_generator.py   | 127 ++++++-
 ciderpress/lib/mod_cider/convolutions.c | 448 +++++++++++++++++++++++-
 ciderpress/lib/mod_cider/convolutions.h |   2 +
 ciderpress/pyscf/descriptors.py         |  11 +-
 ciderpress/pyscf/nldf_convolutions.py   | 142 +++++++-
 ciderpress/pyscf/numint.py              |   8 +-
 ciderpress/pyscf/tests/test_nldf.py     |  40 ++-
 examples/pyscf/compute_ae.py            |   4 +
 9 files changed, 890 insertions(+), 46 deletions(-)

diff --git a/ciderpress/dft/lcao_convolutions.py b/ciderpress/dft/lcao_convolutions.py
index f78e9f5..0ecc037 100644
--- a/ciderpress/dft/lcao_convolutions.py
+++ b/ciderpress/dft/lcao_convolutions.py
@@ -214,6 +214,30 @@ def env(self):
         libcider.get_atco_env(env.ctypes.data_as(ctypes.c_void_p), self._atco)
         return env
 
+    def _run_checks(self, theta_rlmq, p_uq, loc, rads, offset):
+        nalpha = theta_rlmq.shape[-1]
+        stride = p_uq.shape[1]
+        if offset is None:
+            offset = 0
+        else:
+            assert offset >= 0
+        nrad = rads.size
+        assert loc.flags.c_contiguous
+        assert loc.dtype == np.int32
+        assert loc.ndim == 1
+        assert rads.flags.c_contiguous
+        assert rads.dtype == np.float64
+        assert theta_rlmq.ndim == 3
+        assert theta_rlmq.shape[0] == nrad
+        assert theta_rlmq.flags.c_contiguous
+        assert theta_rlmq.dtype == np.float64
+        assert p_uq.ndim == 2
+        assert p_uq.shape[0] == self.nao
+        assert p_uq.flags.c_contiguous
+        assert p_uq.dtype == np.float64
+        assert offset + nalpha <= stride, "{} {} {}".format(offset, nalpha, stride)
+        return nalpha, stride, offset, nrad
+
     def convert_rad2orb_(
         self, theta_rlmq, p_uq, loc, rads, rad2orb=True, offset=None, zero_output=True
     ):
@@ -232,7 +256,7 @@ def convert_rad2orb_(
 
         Args:
             theta_rlmq (np.ndarray): float64, shape (nrad, nlm, nalpha)
-            p_uq (np.ndarray): float64, shape (ngrids, nalpha)
+            p_uq (np.ndarray): float64, shape (ngrids, stride)
             loc (np.ndarray, AtomicGridsIndexer):
                 An int32 array whose shape depends on whether rad2orb is True.
             rads (np.ndarray): float64 list of radial coordinates with size
@@ -251,27 +275,9 @@ def convert_rad2orb_(
                 loc = loc.ra_loc
             else:
                 loc = loc.ar_loc
-        nalpha = theta_rlmq.shape[-1]
-        stride = p_uq.shape[1]
-        if offset is None:
-            offset = 0
-        else:
-            assert offset >= 0
-        nrad = rads.size
-        assert loc.flags.c_contiguous
-        assert loc.dtype == np.int32
-        assert loc.ndim == 1
-        assert rads.flags.c_contiguous
-        assert rads.dtype == np.float64
-        assert theta_rlmq.ndim == 3
-        assert theta_rlmq.shape[0] == nrad
-        assert theta_rlmq.flags.c_contiguous
-        assert theta_rlmq.dtype == np.float64
-        assert p_uq.ndim == 2
-        assert p_uq.shape[0] == self.nao
-        assert p_uq.flags.c_contiguous
-        assert p_uq.dtype == np.float64
-        assert offset + nalpha <= stride
+        nalpha, stride, offset, nrad = self._run_checks(
+            theta_rlmq, p_uq, loc, rads, offset
+        )
         if rad2orb:
             assert loc.size == self.natm + 1
             fn = libcider.contract_rad_to_orb
@@ -295,6 +301,95 @@ def convert_rad2orb_(
             ctypes.c_int(offset),
         )
 
+    def _solve_coefs_(self, p_uq, nalpha, stride, offset):
+        assert p_uq.flags.c_contiguous
+        assert p_uq.shape[-1] == stride
+        libcider.solve_atc_coefs_for_orbs(
+            p_uq.ctypes.data_as(ctypes.c_void_p),
+            self.atco_c_ptr,
+            ctypes.c_int(nalpha),
+            ctypes.c_int(stride),
+            ctypes.c_int(offset),
+        )
+
+    def convert_rad2orb_conv_(
+        self,
+        theta_rlmq,
+        p_uq,
+        loc,
+        rads,
+        alphas,
+        l_add=0,
+        rad2orb=True,
+        offset=None,
+        zero_output=True,
+    ):
+        """
+        Take a set of functions stored on radial grids and spherical harmonics,
+        convolve it by the squared-exponentials with exponent parameters `alphas`,
+        and project it onto the atomic orbital basis. This is an in-place
+        operation, where theta_rlmq is written if rad2orb is False and
+        p_uq is written if rad2orb is True. len(alphas) == theta_rlmq.shape[-1]
+        See convert_rad2orb_ for more notes.
+
+        Args:
+            theta_rlmq (np.ndarray): float64, shape (nrad, nlm, nalpha)
+            p_uq (np.ndarray): float64, shape (ngrids, stride)
+            loc (np.ndarray, AtomicGridsIndexer):
+                An int32 array whose shape depends on whether rad2orb is True.
+            rads (np.ndarray): float64 list of radial coordinates with size
+                theta_rlmq.shape[0]. Corresponds to rad_arr member of
+                AtomicGridsIndexer.
+            alphas (np.ndarray): exponents of squared-exponentials for
+                convolution
+            rad2orb (bool): Whether to convert radial coordinates to orbital
+                basis (True) or vice versa (False).
+            offset (int): starting index in p_uq to add to/from
+            zero_output (bool): Whether to set output zero before
+                adding to it. Default to true. Set to False if you
+                want to add to the existing values in the output
+                array.
+        """
+        alphas = np.ascontiguousarray(alphas)
+        if isinstance(loc, AtomicGridsIndexer):
+            if rad2orb:
+                loc = loc.ra_loc
+            else:
+                loc = loc.ar_loc
+        nalpha, stride, offset, nrad = self._run_checks(
+            theta_rlmq, p_uq, loc, rads, offset
+        )
+        if rad2orb:
+            assert loc.size == self.natm + 1
+            fn = libcider.contract_rad_to_orb_conv
+            if zero_output:
+                p_uq[:, offset : offset + nalpha] = 0.0
+        else:
+            assert loc.size == nrad
+            fn = libcider.contract_orb_to_rad_conv
+            if zero_output:
+                theta_rlmq[:] = 0.0
+        # if not rad2orb:
+        #     # orbital to radial grid conversion
+        #     self._solve_coefs_(p_uq, nalpha, stride, offset)
+        fn(
+            theta_rlmq.ctypes.data_as(ctypes.c_void_p),
+            p_uq.ctypes.data_as(ctypes.c_void_p),
+            loc.ctypes.data_as(ctypes.c_void_p),
+            rads.ctypes.data_as(ctypes.c_void_p),
+            alphas.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(rads.size),
+            ctypes.c_int(theta_rlmq.shape[1]),
+            self.atco_c_ptr,
+            ctypes.c_int(nalpha),
+            ctypes.c_int(stride),
+            ctypes.c_int(offset),
+            ctypes.c_int(l_add),
+        )
+        # if rad2orb:
+        #     # radial grid to orbital conversion
+        #     self._solve_coefs_(p_uq, nalpha, stride, offset)
+
 
 class ConvolutionCollection:
     """
@@ -483,6 +578,21 @@ def multiply_atc_integrals(self, input, output=None, fwd=True):
         )
         return output
 
+    def apply_vi_contractions(self, conv_vo, conv_vq, use_r2=False):
+        libcider.apply_vi_contractions(
+            conv_vo.ctypes.data_as(ctypes.c_void_p),
+            conv_vq.ctypes.data_as(ctypes.c_void_p),
+            self._ccl,
+            ctypes.c_int(1 if use_r2 else 0),
+        )
+
+    def solve_coefs_(self, p_uq):
+        print(p_uq.shape)
+        assert p_uq.flags.c_contiguous
+        libcider.solve_atc_coefs_for_orbs2(
+            p_uq.ctypes.data_as(ctypes.c_void_p), self._ccl
+        )
+
 
 class ConvolutionCollectionK(ConvolutionCollection):
     """
diff --git a/ciderpress/dft/lcao_nldf_generator.py b/ciderpress/dft/lcao_nldf_generator.py
index a9da8f1..28b3c64 100644
--- a/ciderpress/dft/lcao_nldf_generator.py
+++ b/ciderpress/dft/lcao_nldf_generator.py
@@ -22,12 +22,22 @@ def __init__(self, plan, ccl, interpolator, grids_indexer):
         self._uq_buf = np.empty((self.ccl.atco_inp.nao, nalpha))
         self._vq_buf = np.empty((self.ccl.atco_out.nao, self.ccl.num_out))
         self._rlmq_buf = self.grids_indexer.empty_rlmq(nalpha)
+        self.timer = None
 
     @property
     def natm(self):
         return self.interpolator.atom_coords.shape[0]
 
+    def _start_timer(self, name):
+        if self.timer is not None:
+            self.timer.start(name)
+
+    def _stop_timer(self, name):
+        if self.timer is not None:
+            self.timer.stop(name)
+
     def _perform_fwd_convolution(self, theta_gq):
+        self._start_timer("Part1")
         theta_rlmq = self._rlmq_buf
         theta_uq = self._uq_buf
         conv_vq = self._vq_buf
@@ -46,12 +56,18 @@ def _perform_fwd_convolution(self, theta_gq):
         self.plan.get_transformed_interpolation_terms(
             theta_uq, i=-1, fwd=True, inplace=True
         )
+        self._stop_timer("Part1")
+        self._start_timer("Part2")
         self.ccl.multiply_atc_integrals(theta_uq, output=conv_vq, fwd=True)
         if self.plan.nldf_settings.nldf_type != "i":
             self.plan.get_transformed_interpolation_terms(
                 conv_vq[:, : self.plan.nalpha], i=0, fwd=False, inplace=True
             )
-        return self.interpolator.project_orb2grid(conv_vq)
+        self._stop_timer("Part2")
+        self._start_timer("Part3")
+        res = self.interpolator.project_orb2grid(conv_vq)
+        self._stop_timer("Part3")
+        return res
 
     def _perform_bwd_convolution(self, vf_gq):
         vtheta_rlmq = self._rlmq_buf
@@ -59,12 +75,17 @@ def _perform_bwd_convolution(self, vf_gq):
         vconv_vq = self._vq_buf
         vconv_vq[:] = 0.0
         vtheta_uq[:] = 0.0
+        self._start_timer("Part3")
         self.interpolator.project_grid2orb(vf_gq, f_uq=vconv_vq)
+        self._stop_timer("Part3")
+        self._start_timer("Part2")
         if self.plan.nldf_settings.nldf_type != "i":
             self.plan.get_transformed_interpolation_terms(
                 vconv_vq[:, : self.plan.nalpha], i=0, fwd=True, inplace=True
             )
         self.ccl.multiply_atc_integrals(vconv_vq, output=vtheta_uq, fwd=False)
+        self._stop_timer("Part2")
+        self._start_timer("Part1")
         self.plan.get_transformed_interpolation_terms(
             vtheta_uq, i=-1, fwd=False, inplace=True
         )
@@ -78,6 +99,7 @@ def _perform_bwd_convolution(self, vf_gq):
         )
         vtheta_gq = self.grids_indexer.empty_gq(self.plan.nalpha)
         self.grids_indexer.reduce_angc_ylm_(vtheta_rlmq, vtheta_gq, a2y=False)
+        self._stop_timer("Part1")
         return vtheta_gq
 
     def get_features(self, rho_in, spin=0):
@@ -228,3 +250,106 @@ def get_potential(self, vfeat, spin=0):
             )
         # self._cache[spin] = None
         return vrho_out
+
+
+class VINLDFGen(LCAONLDFGenerator):
+    def __init__(self, plan, ccl, interpolator, grids_indexer):
+        """
+
+        Args:
+            plan (NLDFAuxiliaryPlan):
+            ccl (ConvolutionCollection):
+            interpolator (LCAOInterpolator):
+            grids_indexer (AtomicGridsIndexer):
+        """
+        super(VINLDFGen, self).__init__(plan, ccl, interpolator, grids_indexer)
+        self._uq_buf = np.empty((self.ccl.atco_out.nao, self.plan.nalpha))
+        self._vq_buf = np.empty((self.ccl.atco_out.nao, self.ccl.num_out))
+        self.timer = None
+
+    def _perform_fwd_convolution(self, theta_gq):
+        self._start_timer("Part1")
+        theta_rlmq = self._rlmq_buf
+        theta_uq = self._uq_buf
+        conv_vq = self._vq_buf
+        conv_vq[:] = 0.0
+        theta_uq[:] = 0.0
+        theta_rlmq[:] = 0.0
+        self.grids_indexer.reduce_angc_ylm_(theta_rlmq, theta_gq, a2y=True)
+        shape = theta_rlmq.shape
+        theta_rlmq.shape = (-1, theta_rlmq.shape[-1])
+        self.plan.get_transformed_interpolation_terms(
+            theta_rlmq, i=-1, fwd=True, inplace=True
+        )
+        theta_rlmq.shape = shape
+        self._stop_timer("Part1")
+        self._start_timer("Part2")
+        self.ccl.atco_out.convert_rad2orb_conv_(
+            theta_rlmq,
+            theta_uq,
+            self.grids_indexer,
+            self.grids_indexer.rad_arr,
+            self.plan.alphas,
+            rad2orb=True,
+            offset=0,
+        )
+        theta_uq[:] *= self.plan.alpha_norms
+        self.ccl.apply_vi_contractions(conv_vq, theta_uq)
+        theta_uq[:] = 0
+        self.ccl.atco_out.convert_rad2orb_conv_(
+            theta_rlmq,
+            theta_uq,
+            self.grids_indexer,
+            self.grids_indexer.rad_arr,
+            self.plan.alphas,
+            l_add=2,
+            rad2orb=True,
+            offset=0,
+        )
+        theta_uq[:] *= self.plan.alpha_norms
+        self.ccl.apply_vi_contractions(conv_vq, theta_uq, use_r2=True)
+        stride = conv_vq.shape[-1]
+        print(stride, conv_vq.shape)
+        # self.ccl.atco_out._solve_coefs_(conv_vq, stride, stride, 0)
+        self.ccl.solve_coefs_(conv_vq)
+        # print(self.plan.alpha_norms)
+        # conv_vq[:, 0] = (self.plan.alpha_norms * theta_uq).sum(axis=-1)
+        # conv_vq[:, 3] = (self.plan.alphas * self.plan.alpha_norms * theta_uq).sum(
+        #     axis=-1
+        # )
+        # conv_vq[:, :] = (self.plan.alpha_norms * theta_uq).sum(axis=-1)[..., None]
+        # conv_vq[:, 0] = theta_uq.sum(axis=-1)
+        self._stop_timer("Part2")
+        self._start_timer("Part3")
+        res = self.interpolator.project_orb2grid(conv_vq)
+        self._stop_timer("Part3")
+        return res
+
+    def _perform_bwd_convolution(self, vf_gq):
+        vtheta_rlmq = self._rlmq_buf
+        # vtheta_uq = self._uq_buf
+        vconv_vq = self._vq_buf
+        vconv_vq[:] = 0.0
+        # vtheta_uq[:] = 0.0
+        self._start_timer("Part3")
+        self.interpolator.project_grid2orb(vf_gq, f_uq=vconv_vq)
+        self._stop_timer("Part3")
+        self._start_timer("Part2")
+        self.ccl.atco_out.convert_rad2orb_conv_(
+            vtheta_rlmq,
+            vconv_vq,
+            self.grids_indexer,
+            self.grids_indexer.rad_arr,
+            self.plan.alphas,
+            rad2orb=False,
+            offset=0,
+        )
+        self._stop_timer("Part2")
+        self._start_timer("Part1")
+        self.plan.get_transformed_interpolation_terms(
+            vtheta_rlmq, i=-1, fwd=False, inplace=True
+        )
+        vtheta_gq = self.grids_indexer.empty_gq(self.plan.nalpha)
+        self.grids_indexer.reduce_angc_ylm_(vtheta_rlmq, vtheta_gq, a2y=False)
+        self._stop_timer("Part1")
+        return vtheta_gq
diff --git a/ciderpress/lib/mod_cider/convolutions.c b/ciderpress/lib/mod_cider/convolutions.c
index fe5d3fb..5a241ef 100644
--- a/ciderpress/lib/mod_cider/convolutions.c
+++ b/ciderpress/lib/mod_cider/convolutions.c
@@ -9,6 +9,10 @@ inline double gauss_integral(int l, double a) {
     return 0.5 * pow(a, -1.5 - l) * tgamma(1.5 + l);
 }
 
+inline double gauss_mlapl_integral(int l, double a, double b) {
+    return 2 * a * b * pow(a + b, -2.5 - l) * tgamma(2.5 + l);
+}
+
 // TODO all gauss integrals below should be inline for efficiency,
 // TODO but they cannot be made inline and then assigned to function pointers.
 /**
@@ -175,6 +179,7 @@ void generate_atc_basis_set(atc_basis_set **atco_ptr, int *atom2l0, int *lmaxs,
         atcc->gtrans_0 = (double *)malloc(ngsh2 * sizeof(double));
         atcc->gtrans_m = (double *)malloc(ngsh2 * sizeof(double));
         atcc->gtrans_p = (double *)malloc(ngsh2 * sizeof(double));
+        atcc->gtrans_l = (double *)malloc(ngsh2 * sizeof(double));
         for (int g = 0; g < ngsh; g++) {
             atcc->gammas[g] = all_exps[g + gsh0];
             atcc->gcoefs[g] = all_coefs[g + gsh0];
@@ -194,6 +199,8 @@ void generate_atc_basis_set(atc_basis_set **atco_ptr, int *atom2l0, int *lmaxs,
                         coef0 * coef1 * gauss_integral(l - 1, exp0 + exp1);
                     atcc->gtrans_p[atcc->l_loc2[l] + g0 * ngsh_l + g1] =
                         coef0 * coef1 * gauss_integral(l + 1, exp0 + exp1);
+                    atcc->gtrans_l[atcc->l_loc2[l] + g0 * ngsh_l + g1] =
+                        coef0 * coef1 * gauss_mlapl_integral(l, exp0, exp1);
                 }
             }
             dpotrf_(&(atco->UPLO), &ngsh_l, atcc->gtrans_0 + atcc->l_loc2[l],
@@ -202,6 +209,8 @@ void generate_atc_basis_set(atc_basis_set **atco_ptr, int *atom2l0, int *lmaxs,
                     &ngsh_l, &info);
             dpotrf_(&(atco->UPLO), &ngsh_l, atcc->gtrans_p + atcc->l_loc2[l],
                     &ngsh_l, &info);
+            dpotrf_(&(atco->UPLO), &ngsh_l, atcc->gtrans_l + atcc->l_loc2[l],
+                    &ngsh_l, &info);
         }
     }
     atco->bas = (int *)malloc(BAS_SLOTS * atco->nbas * sizeof(int));
@@ -247,6 +256,7 @@ void free_atc_atom(atc_atom atcc) {
     free(atcc.gtrans_0);
     free(atcc.gtrans_m);
     free(atcc.gtrans_p);
+    free(atcc.gtrans_l);
 }
 
 /**
@@ -577,7 +587,7 @@ void generate_atc_integrals_vi(convolution_collection *ccl, int featid,
         // k^8_alpha(r, r') = alpha^2 * (r-r')^2 * exp(-alpha*(r-r')^2)
         else if (featid == 8)
             integral_func = &gauss_a2dida;
-        // NOTE: k^9 = 4 * k^8 - 2 * k^7 should given the Laplacian of the k^0
+        // NOTE: k^9 = 4 * k^8 - 2 * k^7 should give the Laplacian of the k^0
         // feature. k^9_alpha(r, r') = \nabla^2 exp(-alpha*(r-r')^2)
         else if (featid == 9)
             integral_func = &gauss_lapli0;
@@ -713,6 +723,114 @@ void solve_atc_coefs(convolution_collection *ccl) {
     }
 }
 
+void solve_atc_coefs_for_orbs(double *p_uq, atc_basis_set *atco, int nalpha,
+                              int stride, int offset) {
+#pragma omp parallel
+    {
+        double *pp_uq;
+        int ia, dish;
+        int ish0;
+        int one = 1;
+        atc_atom atcc;
+        double *chomat;
+        int aq, q, m;
+        int info;
+        int mstride, u0;
+        int max_dish = 0;
+        for (ia = 0; ia < atco->natm; ia++) {
+            atcc = atco->atc_convs[ia];
+            for (int l = 0; l < atcc.lmax + 1; l++) {
+                ish0 = atcc.global_l_loc[l];
+                dish = atcc.global_l_loc[l + 1] - ish0;
+                max_dish = MAX(dish, max_dish);
+            }
+        }
+        double *buf = (double *)malloc(max_dish * sizeof(double));
+#pragma omp for schedule(dynamic, 4)
+        for (aq = 0; aq < nalpha * atco->natm; aq++) {
+            ia = aq / nalpha;
+            q = aq % nalpha;
+            atcc = atco->atc_convs[ia];
+            for (int l = 0; l < atcc.lmax + 1; l++) {
+                ish0 = atcc.global_l_loc[l];
+                dish = atcc.global_l_loc[l + 1] - ish0;
+                u0 = atco->ao_loc[ish0];
+                mstride = (2 * l + 1) * stride;
+                chomat = atcc.gtrans_0 + atcc.l_loc2[l];
+                for (m = 0; m < 2 * l + 1; m++) {
+                    pp_uq = p_uq + (u0 + m) * stride + q + offset;
+                    for (int sh = 0; sh < dish; sh++) {
+                        buf[sh] = pp_uq[sh * mstride];
+                    }
+                    dpotrs_(&(atco->UPLO), &dish, &one, chomat, &dish, buf,
+                            &dish, &info);
+                    for (int sh = 0; sh < dish; sh++) {
+                        pp_uq[sh * mstride] = buf[sh];
+                    }
+                }
+            }
+        }
+        free(buf);
+    }
+}
+
+void solve_atc_coefs_for_orbs2(double *p_uq, convolution_collection *ccl) {
+#pragma omp parallel
+    {
+        double *pp_uq;
+        int ia, dish;
+        int ish0;
+        int one = 1;
+        atc_basis_set *atco = ccl->atco_out;
+        atc_atom atcc;
+        double *chomat;
+        int aq, q, m;
+        int info;
+        int mstride, u0;
+        int max_dish = 0;
+        int ifeat;
+        for (ia = 0; ia < atco->natm; ia++) {
+            atcc = atco->atc_convs[ia];
+            for (int l = 0; l < atcc.lmax + 1; l++) {
+                ish0 = atcc.global_l_loc[l];
+                dish = atcc.global_l_loc[l + 1] - ish0;
+                max_dish = MAX(dish, max_dish);
+            }
+        }
+        double *buf = (double *)malloc(max_dish * sizeof(double));
+#pragma omp for schedule(dynamic, 4)
+        for (aq = 0; aq < ccl->nfeat_i * atco->natm; aq++) {
+            ia = aq / ccl->nfeat_i;
+            q = aq % ccl->nfeat_i;
+            atcc = atco->atc_convs[ia];
+            for (int l = 0; l < atcc.lmax + 1; l++) {
+                ish0 = atcc.global_l_loc[l];
+                dish = atcc.global_l_loc[l + 1] - ish0;
+                u0 = atco->ao_loc[ish0];
+                mstride = (2 * l + 1) * ccl->nfeat_i;
+                ifeat = ccl->icontrib_ids[q];
+                if (ifeat == 3 || ifeat == 4 || ifeat == 5 || ifeat == 6) {
+                    chomat = atcc.gtrans_l + atcc.l_loc2[l];
+                } else {
+                    chomat = atcc.gtrans_0 + atcc.l_loc2[l];
+                }
+                for (m = 0; m < 2 * l + 1; m++) {
+                    pp_uq = p_uq + (u0 + m) * ccl->nfeat_i + q;
+                    for (int sh = 0; sh < dish; sh++) {
+                        buf[sh] = pp_uq[sh * mstride];
+                    }
+                    dpotrs_(&(atco->UPLO), &dish, &one, chomat, &dish, buf,
+                            &dish, &info);
+                    for (int sh = 0; sh < dish; sh++) {
+                        pp_uq[sh * mstride] = buf[sh];
+                    }
+                }
+            }
+        }
+        free(buf);
+    }
+}
+
 /**
  * Convolve the input inp_uq to out_vq using the convolution_collection ccl.
  * NOTE this is for version i, j, and ij. Use multiply_atc_integrals_vk for
@@ -986,6 +1104,334 @@ void contract_orb_to_rad(double *theta_rlmq, double *p_uq, int *ar_loc,
     }
 }
 
+/**
+ * Given a radial distribution of functions for each spherical harmonic
+ * lm on each atom, with dimension nalpha, project onto the orbital
+ * basis set given by atco. Note: this computes projections onto each
+ * (non-orthogonal) basis function, not expansion coefficients
+ * theta_rlmq (nrad x nlm x nalpha) : input functions to project onto atco basis
+ * p_uq (atco->nao x nalpha) : output projections
+ * ra_loc (length natm + 1) : Range of rad indices that correspond
+ *                            to each atom
+ * rads (length nrad) : radial coordinates for each radial index
+ * nrad : number of radial coordinates over all atoms
+ * nlm : number of spherical harmonics (lmax + 1)^2
+ * atco : stores the atomic basis set.
+ * nalpha : number of functions stored in the rlm space.
+ */
+void contract_rad_to_orb_conv(double *theta_rlmq, double *p_uq, int *ra_loc,
+                              double *rads, double *alphas, int nrad, int nlm,
+                              atc_basis_set *atco, int nalpha, int stride,
+                              int offset, int l_add) {
+    p_uq = p_uq + offset;
+#pragma omp parallel
+    {
+        double PI = 4.0 * atan(1.0);
+        int ish, i0, L0, nm, l, l_eff, at;
+        double *p_q, *theta_mq;
+        int *bas = atco->bas;
+        int *ao_loc = atco->ao_loc;
+        double *env = atco->env;
+        int nbas = atco->nbas;
+        int *ibas;
+        double coef, beta;
+        int r, m, q, mq;
+        double *exp_list = (double *)malloc(sizeof(double) * nalpha);
+        double *coef_list = (double *)malloc(sizeof(double) * nalpha);
+        double *val_list = (double *)malloc(sizeof(double) * nalpha);
+#pragma omp for schedule(dynamic, 4)
+        for (ish = 0; ish < nbas; ish++) {
+            ibas = bas + ish * BAS_SLOTS;
+            at = ibas[ATOM_OF];
+            l = ibas[ANG_OF];
+            l_eff = l + l_add;
+            coef = env[ibas[PTR_COEFF]];
+            beta = env[ibas[PTR_EXP]];
+            i0 = ao_loc[ish];
+            nm = 2 * l + 1;
+            L0 = l * l;
+            for (q = 0; q < nalpha; q++) {
+                exp_list[q] = beta * alphas[q] / (beta + alphas[q]);
+                coef_list[q] = coef * pow(PI / alphas[q], 1.5) *
+                               pow(alphas[q] / (beta + alphas[q]), 1.5 + l);
+            }
+            for (r = ra_loc[at]; r < ra_loc[at + 1]; r++) {
+                theta_mq = theta_rlmq + nalpha * (r * nlm + L0);
+                p_q = p_uq + i0 * stride;
+                mq = 0;
+                for (q = 0; q < nalpha; q++) {
+                    val_list[q] = coef_list[q] * pow(rads[r], l_eff) *
+                                  exp(-exp_list[q] * rads[r] * rads[r]);
+                }
+                for (m = 0; m < nm; m++) {
+                    for (q = 0; q < nalpha; q++, mq++) {
+                        p_q[q] += val_list[q] * theta_mq[mq];
+                    }
+                    p_q += stride;
+                }
+            }
+        }
+        free(exp_list);
+        free(coef_list);
+        free(val_list);
+    }
+}
+
+/**
+ * Backwards version of contract_rad_to_orb_conv
+ * (i.e. takes p_uq as input and projects the convolved basis
+ * functions out onto the radial coordinates and spherical harmonics).
+ * See contract_rad_to_orb_conv for variable definitions and details.
+ */
+void contract_orb_to_rad_conv(double *theta_rlmq, double *p_uq, int *ar_loc,
+                              double *rads, double *alphas, int nrad, int nlm,
+                              atc_basis_set *atco, int nalpha, int stride,
+                              int offset, int l_add) {
+    p_uq = p_uq + offset;
+    int nbas = atco->nbas;
+    double *exp_list = (double *)malloc(sizeof(double) * nbas * nalpha);
+    double *coef_list = (double *)malloc(sizeof(double) * nbas * nalpha);
+#pragma omp parallel
+    {
+        double PI = 4.0 * atan(1.0);
+        int ish, i0, L0, nm, l, at;
+        double *p_q, *theta_mq;
+        int *bas = atco->bas;
+        int *ao_loc = atco->ao_loc;
+        double *env = atco->env;
+        int *ibas;
+        double coef, beta;
+        int r, m, q, mq;
+        double *val_list = (double *)malloc(sizeof(double) * nalpha);
+#pragma omp for schedule(dynamic, 4)
+        for (ish = 0; ish < nbas; ish++) {
+            ibas = bas + ish * BAS_SLOTS;
+            at = ibas[ATOM_OF];
+            l = ibas[ANG_OF];
+            coef = env[ibas[PTR_COEFF]];
+            beta = env[ibas[PTR_EXP]];
+            for (q = 0; q < nalpha; q++) {
+                exp_list[ish * nalpha + q] =
+                    beta * alphas[q] / (beta + alphas[q]);
+                coef_list[ish * nalpha + q] =
+                    coef * pow(PI / alphas[q], 1.5) *
+                    pow(alphas[q] / (beta + alphas[q]), 1.5 + l);
+            }
+        }
+#pragma omp for schedule(dynamic, 4)
+        for (r = 0; r < nrad; r++) {
+            at = ar_loc[r];
+            for (ish = atco->atom_loc_ao[at]; ish < atco->atom_loc_ao[at + 1];
+                 ish++) {
+                ibas = bas + ish * BAS_SLOTS;
+                l = ibas[ANG_OF];
+                coef = env[ibas[PTR_COEFF]];
+                beta = env[ibas[PTR_EXP]];
+                i0 = ao_loc[ish];
+                nm = 2 * l + 1;
+                L0 = l * l;
+                for (q = 0; q < nalpha; q++) {
+                    val_list[q] =
+                        coef_list[ish * nalpha + q] * pow(rads[r], l) *
+                        exp(-exp_list[ish * nalpha + q] * rads[r] * rads[r]);
+                }
+                theta_mq = theta_rlmq + nalpha * (r * nlm + L0);
+                p_q = p_uq + i0 * stride;
+                mq = 0;
+                for (m = 0; m < nm; m++) {
+                    for (q = 0; q < nalpha; q++, mq++) {
+                        theta_mq[mq] += val_list[q] * p_q[q];
+                    }
+                    p_q += stride;
+                }
+            }
+        }
+    }
+}
+
+double real_se_prefac(int l, double alpha, double beta) { return 1.0; }
+
+double real_se_prefac_r2(int l, double alpha, double beta) { return 0.0; }
+
+double real_se_ap_prefac(int l, double alpha, double beta) { return alpha; }
+
+double real_se_ap_prefac_r2(int l, double alpha, double beta) { return 0.0; }
+
+double real_se_r2_prefac(int l, double alpha, double beta) {
+    return (3 * alpha - 2 * beta * l) / (2 * alpha * (beta + alpha));
+}
+
+double real_se_r2_prefac_r2(int l, double alpha, double beta) {
+    return beta * beta / ((beta + alpha) * (beta + alpha));
+}
+
+double real_se_apr2_prefac(int l, double alpha, double beta) {
+    return alpha * real_se_r2_prefac(l, alpha, beta);
+}
+
+double real_se_apr2_prefac_r2(int l, double alpha, double beta) {
+    return alpha * real_se_r2_prefac_r2(l, alpha, beta);
+}
+
+double real_se_ap2r2_prefac(int l, double alpha, double beta) {
+    return alpha * alpha * real_se_r2_prefac(l, alpha, beta);
+}
+
+double real_se_ap2r2_prefac_r2(int l, double alpha, double beta) {
+    return alpha * alpha * real_se_r2_prefac_r2(l, alpha, beta);
+}
+
+double real_se_lapl_prefac(int l, double alpha, double beta) {
+    return 4 * real_se_ap2r2_prefac(l, alpha, beta) -
+           2 * real_se_ap_prefac(l, alpha, beta);
+}
+
+double real_se_lapl_prefac_r2(int l, double alpha, double beta) {
+    return 4 * real_se_ap2r2_prefac_r2(l, alpha, beta) -
+           2 * real_se_ap_prefac_r2(l, alpha, beta);
+}
+
+double real_se_grad_prefac_m1(int l, double alpha, double beta) {
+    return -0.0 * real_se_lapl_prefac(l, alpha, beta);
+}
+
+double real_se_grad_prefac_r2_m1(int l, double alpha, double beta) {
+    return -0.0 * real_se_lapl_prefac_r2(l, alpha, beta);
+}
+
+double real_se_grad_prefac_p1(int l, double alpha, double beta) {
+    return pow(-1.0, l) * alpha * beta / (alpha + beta) *
+           real_se_lapl_prefac(l, alpha, beta);
+}
+
+double real_se_grad_prefac_r2_p1(int l, double alpha, double beta) {
+    return pow(-1.0, l) * alpha * beta / (alpha + beta) *
+           real_se_lapl_prefac_r2(l, alpha, beta);
+}
+
+double real_se_rvec_prefac_m1(int l, double alpha, double beta) {
+    return real_se_grad_prefac_m1(l, alpha, beta) / alpha;
+}
+
+double real_se_rvec_prefac_p1(int l, double alpha, double beta) {
+    return real_se_grad_prefac_p1(l, alpha, beta) / alpha;
+}
+
+double real_se_rvec_prefac_r2_m1(int l, double alpha, double beta) {
+    return real_se_grad_prefac_r2_m1(l, alpha, beta) / alpha;
+}
+
+double real_se_rvec_prefac_r2_p1(int l, double alpha, double beta) {
+    return real_se_grad_prefac_r2_p1(l, alpha, beta) / alpha;
+}
+
+void get_real_conv_prefacs_vi(double *out, atc_basis_set *atco, double *alphas,
+                              int use_r2, int featid, int num_out, int nalpha) {
+#pragma omp parallel
+    {
+        int sh, q, l;
+        double beta;
+        int *ibas;
+        double (*prefac_func)(int, double, double);
+        // NOTE: This messy if statement can be replaced by
+        // subclassing when converting to C++.
+        if (use_r2) {
+            if (featid == 0)
+                prefac_func = &real_se_prefac_r2;
+            else if (featid == 1)
+                prefac_func = &real_se_r2_prefac_r2;
+            else if (featid == 2)
+                prefac_func = &real_se_apr2_prefac_r2;
+            else if (featid == 3)
+                prefac_func = &real_se_rvec_prefac_r2_m1;
+            else if (featid == 4)
+                prefac_func = &real_se_grad_prefac_r2_m1;
+            else if (featid == 5)
+                prefac_func = &real_se_rvec_prefac_r2_p1;
+            else if (featid == 6)
+                prefac_func = &real_se_grad_prefac_r2_p1;
+            else if (featid == 7)
+                prefac_func = &real_se_ap_prefac_r2;
+            else if (featid == 8)
+                prefac_func = &real_se_ap2r2_prefac_r2;
+            else if (featid == 9)
+                prefac_func = &real_se_lapl_prefac_r2;
+            else {
+                printf("Unsupported featid\n");
+                exit(-1);
+            }
+        } else {
+            if (featid == 0)
+                prefac_func = &real_se_prefac;
+            else if (featid == 1)
+                prefac_func = &real_se_r2_prefac;
+            else if (featid == 2)
+                prefac_func = &real_se_apr2_prefac;
+            else if (featid == 3)
+                prefac_func = &real_se_rvec_prefac_m1;
+            else if (featid == 4)
+                prefac_func = &real_se_grad_prefac_m1;
+            else if (featid == 5)
+                prefac_func = &real_se_rvec_prefac_p1;
+            else if (featid == 6)
+                prefac_func = &real_se_grad_prefac_p1;
+            else if (featid == 7)
+                prefac_func = &real_se_ap_prefac;
+            else if (featid == 8)
+                prefac_func = &real_se_ap2r2_prefac;
+            else if (featid == 9)
+                prefac_func = &real_se_lapl_prefac;
+            else {
+                printf("Unsupported featid\n");
+                exit(-1);
+            }
+        }
+#pragma omp for
+        for (sh = 0; sh < atco->nbas; sh++) {
+            for (q = 0; q < nalpha; q++) {
+                ibas = atco->bas + sh * BAS_SLOTS;
+                l = ibas[ANG_OF];
+                beta = atco->env[ibas[PTR_EXP]];
+                out[sh * nalpha * num_out + q] =
+                    prefac_func(l, alphas[q], beta);
+            }
+        }
+    }
+}
+
+void apply_vi_contractions(double *conv_vo, double *conv_vq,
+                           convolution_collection *ccl, int use_r2) {
+    int ifeat;
+    atc_basis_set *atco = ccl->atco_out;
+    int nalpha = ccl->nalpha;
+    int nbeta = ccl->nfeat_i;
+    int oqstride = nalpha * nbeta;
+    double *ints_soq =
+        (double *)malloc(nalpha * nbeta * atco->nbas * sizeof(double));
+    for (ifeat = 0; ifeat < nbeta; ifeat++) {
+        get_real_conv_prefacs_vi(ints_soq + ifeat * nalpha, atco, ccl->alphas,
+                                 use_r2, ccl->icontrib_ids[ifeat], nbeta,
+                                 nalpha);
+    }
+#pragma omp parallel
+    {
+        int sh, v, b, q;
+#pragma omp for schedule(dynamic, 4)
+        for (sh = 0; sh < atco->nbas; sh++) {
+            for (v = atco->ao_loc[sh]; v < atco->ao_loc[sh + 1]; v++) {
+                for (b = 0; b < ccl->nfeat_i; b++) {
+                    for (q = 0; q < ccl->nalpha; q++) {
+                        conv_vo[v * nbeta + b] +=
+                            ints_soq[sh * oqstride + b * nalpha + q] *
+                            conv_vq[v * nalpha + q];
+                    }
+                }
+            }
+        }
+    }
+    free(ints_soq);
+}
+
 // TODO implementing these functions could be helpful for PAW implementation
 /**
  * Given a radial distribution of functions for each spherical harmonic
diff --git a/ciderpress/lib/mod_cider/convolutions.h b/ciderpress/lib/mod_cider/convolutions.h
index 0baf5ee..db01700 100644
--- a/ciderpress/lib/mod_cider/convolutions.h
+++ b/ciderpress/lib/mod_cider/convolutions.h
@@ -31,6 +31,8 @@ typedef struct {
                       // with l-1 used for the principal angular momentum number
     double *gtrans_p; // transformation (inverted overlap) matrix for gammas
                       // with l+1 used for the principal angular momentum number
+    double *gtrans_l; // transformation (inverted overlap) matrix for gammas
+                      // using -laplacian norm
 } atc_atom;
 
 /**
diff --git a/ciderpress/pyscf/descriptors.py b/ciderpress/pyscf/descriptors.py
index 1ab81fe..9fe0d10 100644
--- a/ciderpress/pyscf/descriptors.py
+++ b/ciderpress/pyscf/descriptors.py
@@ -11,6 +11,7 @@
 from ciderpress.dft.settings import (
     FracLaplSettings,
     NLDFSettings,
+    NLDFSettingsVI,
     SDMXBaseSettings,
     SemilocalSettings,
     dalpha,
@@ -21,7 +22,11 @@
 from ciderpress.pyscf.analyzers import UHFAnalyzer
 from ciderpress.pyscf.frac_lapl import FLNumInt
 from ciderpress.pyscf.gen_cider_grid import CiderGrids
-from ciderpress.pyscf.nldf_convolutions import DEFAULT_CIDER_LMAX, PyscfNLDFGenerator
+from ciderpress.pyscf.nldf_convolutions import (
+    DEFAULT_CIDER_LMAX,
+    PyscfNLDFGenerator,
+    PyscfVIGenerator,
+)
 from ciderpress.pyscf.sdmx_slow import EXXSphGenerator
 
 NLDF_VERSION_LIST = ["i", "j", "ij", "k"]
@@ -405,6 +410,10 @@ def _nldf_desc_getter(
     nldf_generator = PyscfNLDFGenerator.from_mol_and_settings(
         mol, grids.grids_indexer, 1, nldf_settings, **kwargs
     )
+    if isinstance(nldf_settings, NLDFSettingsVI):
+        nldf_generator = PyscfVIGenerator.from_mol_and_settings(
+            mol, grids.grids_indexer, 1, nldf_settings, **kwargs
+        )
     nldf_generator.interpolator.set_coords(pgrids.coords)
     ni = NumInt()
     xctype = "MGGA"
diff --git a/ciderpress/pyscf/nldf_convolutions.py b/ciderpress/pyscf/nldf_convolutions.py
index ffe68fd..d0630fd 100644
--- a/ciderpress/pyscf/nldf_convolutions.py
+++ b/ciderpress/pyscf/nldf_convolutions.py
@@ -12,7 +12,7 @@
     get_gamma_lists_from_bas_and_env,
 )
 from ciderpress.dft.lcao_interpolation import LCAOInterpolator, LCAOInterpolatorDirect
-from ciderpress.dft.lcao_nldf_generator import LCAONLDFGenerator
+from ciderpress.dft.lcao_nldf_generator import LCAONLDFGenerator, VINLDFGen
 from ciderpress.dft.plans import VI_ID_MAP, NLDFGaussianPlan, NLDFSplinePlan
 
 
@@ -128,7 +128,8 @@ def __init__(self, settings, **kwargs):
         self.settings = settings
         self.kwargs = kwargs
 
-    def initialize_nldf_generator(self, mol, grids_indexer, nspin):
+    def initialize_nldf_generator(self, mol, grids_indexer, nspin, timer=None):
+        self.kwargs["timer"] = timer
         return PyscfNLDFGenerator.from_mol_and_settings(
             mol, grids_indexer, nspin, self.settings, **self.kwargs
         )
@@ -160,6 +161,7 @@ def from_mol_and_settings(
         aparam=0.03,
         dparam=0.04,
         nrad=200,
+        timer=None,
     ):
         if lmax is None:
             lmax = grids_indexer.lmax
@@ -241,9 +243,143 @@ def from_mol_and_settings(
                 nrad=nrad,
                 onsite_direct=onsite_direct,
             )
+        if timer is not None:
+            timer.start("Compute integrals")
         ccl.compute_integrals_()
+        if timer is not None:
+            timer.stop()
+            timer.start("Projection coefs")
         ccl.solve_projection_coefficients()
-        return cls(plan, ccl, interpolator, grids_indexer)
+        if timer is not None:
+            timer.stop("Projection coefs")
+        output = cls(plan, ccl, interpolator, grids_indexer)
+        output.timer = timer
+        return output
+
+    def get_extra_ao(self):
+        return (self.interpolator.nlm + 4) * self.natm
+
+
+class PyscfVIGenerator(VINLDFGen):
+    """
+    A PySCF-specific wrapper for the NLDF feature generator.
+    """
+
+    @classmethod
+    def from_mol_and_settings(
+        cls,
+        mol,
+        grids_indexer,
+        nspin,
+        nldf_settings,
+        plan_type="gaussian",
+        lmax=None,
+        aux_lambd=1.6,
+        aug_beta=None,
+        alpha_max=10000,
+        alpha_min=None,
+        alpha_formula=None,
+        rhocut=1e-10,
+        expcut=1e-10,
+        gbuf=2.0,
+        interpolator_type="onsite_direct",
+        aparam=0.03,
+        dparam=0.04,
+        nrad=200,
+        timer=None,
+    ):
+        if lmax is None:
+            lmax = grids_indexer.lmax
+        if lmax > grids_indexer.lmax:
+            raise ValueError("lmax cannot be larger than grids_indexer.lmax")
+        if interpolator_type not in ["onsite_direct", "onsite_spline", "train_gen"]:
+            raise ValueError
+        if aug_beta is None:
+            aug_beta = aux_lambd
+        if alpha_min is None:
+            alpha_min = nldf_settings.theta_params[0] / 256  # sensible default
+        if plan_type not in ["gaussian", "spline"]:
+            raise ValueError("plan_type must be gaussian or spline")
+        if alpha_formula is None:
+            alpha_formula = "etb" if plan_type == "gaussian" else "zexp"
+        nalpha = int(np.ceil(np.log(alpha_max / alpha_min) / np.log(aux_lambd))) + 1
+        plan_class = NLDFGaussianPlan if plan_type == "gaussian" else NLDFSplinePlan
+        plan = plan_class(
+            nldf_settings,
+            nspin,
+            alpha_min,
+            aux_lambd,
+            nalpha,
+            coef_order="gq",
+            alpha_formula=alpha_formula,
+            proc_inds=None,
+            rhocut=rhocut,
+            expcut=expcut,
+        )
+        basis = aug_etb_for_cider(mol, lmax=lmax, beta=aug_beta)[0]
+        mol = gto.M(
+            atom=mol.atom,
+            basis=basis,
+            spin=mol.spin,
+            charge=mol.charge,
+            unit=mol.unit,
+        )
+        dat = get_gamma_lists_from_mol(mol)
+        atco_inp = ATCBasis(*dat)
+        dat = get_convolution_expnts_from_expnts(
+            plan.alphas, dat[0], dat[1], dat[2], dat[4], gbuf=gbuf
+        )
+        atco_out = ATCBasis(*dat)
+        if plan.nldf_settings.nldf_type == "k":
+            ccl = ConvolutionCollectionK(
+                atco_inp, atco_out, plan.alphas, plan.alpha_norms
+            )
+        else:
+            has_vj = "j" in plan.nldf_settings.nldf_type
+            ifeat_ids = []
+            for spec in plan.nldf_settings.l0_feat_specs:
+                ifeat_ids.append(VI_ID_MAP[spec])
+            for spec in plan.nldf_settings.l1_feat_specs:
+                ifeat_ids.append(VI_ID_MAP[spec])
+            ccl = ConvolutionCollection(
+                atco_inp,
+                atco_out,
+                plan.alphas,
+                plan.alpha_norms,
+                has_vj=has_vj,
+                ifeat_ids=ifeat_ids,
+            )
+        if interpolator_type == "train_gen":
+            interpolator = LCAOInterpolator.from_ccl(
+                mol.atom_coords(unit="Bohr"),
+                ccl,
+                aparam=aparam,
+                dparam=dparam,
+                nrad=nrad,
+            )
+        else:
+            onsite_direct = interpolator_type == "onsite_direct"
+            interpolator = LCAOInterpolatorDirect.from_ccl(
+                grids_indexer,
+                mol.atom_coords(unit="Bohr"),
+                ccl,
+                aparam=aparam,
+                dparam=dparam,
+                nrad=nrad,
+                onsite_direct=onsite_direct,
+            )
+        if timer is not None:
+            timer.start("Compute integrals")
+        ccl.compute_integrals_()
+        if timer is not None:
+            timer.stop()
+            timer.start("Projection coefs")
+        ccl.solve_projection_coefficients()
+        if timer is not None:
+            timer.stop("Projection coefs")
+        output = cls(plan, ccl, interpolator, grids_indexer)
+        output.timer = timer
+        return output
 
     def get_extra_ao(self):
         return (self.interpolator.nlm + 4) * self.natm
diff --git a/ciderpress/pyscf/numint.py b/ciderpress/pyscf/numint.py
index 30d695b..80efd15 100644
--- a/ciderpress/pyscf/numint.py
+++ b/ciderpress/pyscf/numint.py
@@ -317,6 +317,7 @@ def nr_rks_nldf(
     Returns:
 
     """
+    ni.timer.start("nr_rks_nldf")
     if not hasattr(grids, "grids_indexer"):
         raise ValueError("Grids object must have indexer for NLDF evaluation")
 
@@ -362,8 +363,10 @@ def block_loop(ao_deriv):
     if any(x in xc_code.upper() for x in ("CC06", "CS", "BR89", "MK00")):
         raise NotImplementedError("laplacian in meta-GGA method")
     ao_deriv = 1
+    ni.timer.start("density")
     for i, ip0, ip1, ao, mask, weight, coords in block_loop(ao_deriv):
         rho_full[i, :, ip0:ip1] = make_rho(i, ao, mask, xctype)
+    ni.timer.stop("density")
 
     par_atom = False
     extra_ao = ni.nldfgen.get_extra_ao()
@@ -408,6 +411,7 @@ def block_loop(ao_deriv):
         for idm in range(nset):
             wv_full[idm, :, :] += ni.nldfgen.get_potential(vxc_nldf_full[idm])
 
+    ni.timer.start("potential")
     buffers = None
     pair_mask = mol.get_overlap_cond() < -np.log(ni.cutoff)
     v1 = np.zeros_like(vmat)
@@ -436,6 +440,8 @@ def block_loop(ao_deriv):
         dtype = np.result_type(*dms)
     if vmat.dtype != dtype:
         vmat = np.asarray(vmat, dtype=dtype)
+    ni.timer.stop("potential")
+    ni.timer.stop("nr_rks_nldf")
     return nelec, excsum, vmat
 
 
@@ -1109,7 +1115,7 @@ def initialize_feature_generators(self, mol, grids, nspin):
         cond = cond or self.nldfgen.plan.nspin != nspin
         if cond:
             self.nldfgen = self.nldf_init.initialize_nldf_generator(
-                mol, grids.grids_indexer, nspin
+                mol, grids.grids_indexer, nspin, timer=self.timer
             )
             self.nldfgen.interpolator.set_coords(grids.coords)
         super().initialize_feature_generators(mol, grids, nspin)
diff --git a/ciderpress/pyscf/tests/test_nldf.py b/ciderpress/pyscf/tests/test_nldf.py
index 1ecc179..c1dfaa2 100644
--- a/ciderpress/pyscf/tests/test_nldf.py
+++ b/ciderpress/pyscf/tests/test_nldf.py
@@ -169,20 +169,24 @@ def _check_nldf_equivalence(
             aug_beta=beta,
         )
         # TODO uncomment after fixing vi stability
-        # import traceback as tb
-        # errs = {}
-        # for i in range(ifeat_pred.shape[1]):
-        #     print(i)
-        #     try:
-        #         assert_allclose(ifeat_pred[:, i], ifeat[:, i], rtol=rtol, atol=atol)
-        #     except AssertionError as e:
-        #         errs[i] = ''.join(tb.format_exception(None, e, e.__traceback__))
-        # if len(errs) > 0:
-        #     estr = ''
-        #     for i, err in errs.items():
-        #         print(i, err)
-        #         estr = estr + 'FEATURE {}\n{}\n'.format(i, err)
-        #     raise AssertionError(estr)
+        import traceback as tb
+
+        errs = {}
+        for i in range(ifeat_pred.shape[1]):
+            print(i)
+            try:
+                assert_allclose(
+                    ifeat_pred[:, i], ifeat[:, i], rtol=0.1 * rtol, atol=0.1 * atol
+                )
+            except AssertionError as e:
+                # raise e
+                errs[i] = "".join(tb.format_exception(None, e, e.__traceback__))
+        if len(errs) > 0:
+            estr = ""
+            for i, err in errs.items():
+                print(i, err)
+                estr = estr + "FEATURE {}\n{}\n".format(i, err)
+            raise AssertionError(estr)
         jfeat_pred = get_descriptors(
             analyzer,
             self.vj_settings,
@@ -294,7 +298,7 @@ def _check_nldf_occ_derivs(
         )
         assert_almost_equal(rho_pred, rho_ref, 12)
         # TODO uncomment after fixing vi stability
-        # assert_almost_equal(ifeat_pred, ifeat_ref, 12)
+        assert_almost_equal(ifeat_pred, ifeat_ref, 12)
         assert_almost_equal(jfeat_pred, jfeat_ref, 12)
         # TODO uncomment after fixing vi stability
         # assert_almost_equal(ijfeat_pred, ijfeat_ref, 12)
@@ -339,8 +343,9 @@ def _check_nldf_occ_derivs(
                 )
                 occd_ifeat_fd = (ifeat_pert - ifeat_ref)[spin] / delta
                 for i in range(self.vi_settings.nfeat):
-                    # TODO uncomment after fixing vi stability
-                    continue
+                    # TODO run this check
+                    # continue
+                    print(i)
                     assert_allclose(
                         occd_pred[i], occd_ifeat_fd[i], rtol=rtol, atol=atol
                     )
@@ -726,6 +731,7 @@ def test_nldf_equivalence(self):
     def test_nldf_occ_derivs(self):
         mols = self.mols
         tols = [(2e-3, 2e-3), (2e-3, 1e-2), (2e-3, 5e-3), (2e-3, 2e-3)]
+        tols = [(1e-3, 1e-3), (1e-3, 1e-3), (1e-3, 1e-3), (1e-3, 1e-3)]
         for mol, (atol, rtol) in zip(mols, tols):
             print(mol.atom)
             ks = dft.RKS(mol) if mol.spin == 0 else dft.UKS(mol)
diff --git a/examples/pyscf/compute_ae.py b/examples/pyscf/compute_ae.py
index a7af460..7726422 100644
--- a/examples/pyscf/compute_ae.py
+++ b/examples/pyscf/compute_ae.py
@@ -102,9 +102,13 @@ def run_calc(mol, spinpol):
             ckernel="GGA_C_PBE",
         )
         ks.small_rho_cutoff = 0.0
+    else:
+        ks.xc = functional
     ks.grids.level = 3
     ks = ks.apply(scf.addons.remove_linear_dep_)
     etot = ks.kernel()
+    if is_cider:
+        ks._numint.timer.write()
     return etot