From 85e726ee781940b9565c589659bb6af19fb2246b Mon Sep 17 00:00:00 2001
From: kylebystrom
Date: Tue, 30 Apr 2024 14:15:07 -0400
Subject: [PATCH] add pyscf interface and lcao convolutions for NLDF

---
 .github/workflows/full_env.yml | 1 +
 .github/workflows/nocomp_env.yml | 1 +
 ciderpress/density.py | 145 +++
 ciderpress/dft/baselines.py | 118 +++
 ciderpress/dft/debug_numint.py | 2 +-
 ciderpress/dft/lcao_convolutions.py | 550 ++++++++++
 ciderpress/dft/lcao_interpolation.py | 4 +-
 ciderpress/dft/lcao_nldf_generator.py | 2 +-
 ciderpress/dft/plans.py | 4 +-
 ciderpress/dft/settings.py | 2 +-
 ciderpress/dft/sph_harm_coeff.py | 352 ++++++++
 ciderpress/dft/xc_evaluator.py | 474 ++++++++++
 ciderpress/gpaw/__init__.py | 0
 ciderpress/lib/__init__.py | 7 +
 ciderpress/lib/mod_cider/cider_coefs.c | 1 -
 ciderpress/pyscf/__init__.py | 0
 ciderpress/pyscf/analyzers.py | 466 ++++++++++
 ciderpress/pyscf/debug_numint.py | 107 +++
 ciderpress/pyscf/descriptors.py | 478 ++++++++++
 ciderpress/pyscf/dft.py | 214 +++++
 ciderpress/pyscf/frac_lapl.py | 888 +++++++++++++++
 ciderpress/pyscf/gen_cider_grid.py | 266 ++++++
 ciderpress/pyscf/nldf_convolutions.py | 249 ++++++
 ciderpress/pyscf/numint.py | 1129 ++++++++++++++++++++
 ciderpress/pyscf/sdmx.py | 590 +++++++++++
 examples/pyscf/compute_ae.py | 136 +++
 scripts/download_functionals.py | 17 +-
 27 files changed, 6187 insertions(+), 16 deletions(-)
 create mode 100644 ciderpress/density.py
 create mode 100644 ciderpress/dft/baselines.py
 create mode 100644 ciderpress/dft/lcao_convolutions.py
 create mode 100644 ciderpress/dft/sph_harm_coeff.py
 create mode 100644 ciderpress/gpaw/__init__.py
 create mode 100644 ciderpress/pyscf/__init__.py
 create mode 100644 ciderpress/pyscf/analyzers.py
 create mode 100644 ciderpress/pyscf/debug_numint.py
 create mode 100644 ciderpress/pyscf/descriptors.py
 create mode 100644 ciderpress/pyscf/dft.py
 create mode 100644 ciderpress/pyscf/frac_lapl.py
 create mode 100644 ciderpress/pyscf/gen_cider_grid.py
 create mode 100644 ciderpress/pyscf/nldf_convolutions.py
 create mode 100644 ciderpress/pyscf/numint.py
 create mode 100644 ciderpress/pyscf/sdmx.py
 create mode 100644 examples/pyscf/compute_ae.py

diff --git a/.github/workflows/full_env.yml b/.github/workflows/full_env.yml
index 858d207..1fe4898 100644
--- a/.github/workflows/full_env.yml
+++ b/.github/workflows/full_env.yml
@@ -15,6 +15,7 @@ dependencies:
   - numba<=0.58
   - numpy>=1.20.3,<1.25
   - pyscf>=2.1
+  - pytest
   - pyyaml
   - scikit-learn>=1.0.1
   - scipy>=1.7.1
diff --git a/.github/workflows/nocomp_env.yml b/.github/workflows/nocomp_env.yml
index 1a6f3b4..fb03e7f 100644
--- a/.github/workflows/nocomp_env.yml
+++ b/.github/workflows/nocomp_env.yml
@@ -11,6 +11,7 @@ dependencies:
   - numba<=0.58
   - numpy>=1.20.3,<1.25
   - pyscf>=2.1
+  - pytest
   - pyyaml
   - scikit-learn>=1.0.1
   - scipy>=1.7.1
diff --git a/ciderpress/density.py b/ciderpress/density.py
new file mode 100644
index 0000000..b554141
--- /dev/null
+++ b/ciderpress/density.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see +# +# Author: Kyle Bystrom +# + +import numpy +import numpy as np + +LDA_FACTOR = -3.0 / 4.0 * (3.0 / np.pi) ** (1.0 / 3) +GG_SMUL = 1.0 +GG_AMUL = 1.0 +GG_AMIN = 1.0 / 18 +CFC = (3.0 / 10) * (3 * np.pi**2) ** (2.0 / 3) +DESC_VERSION_LIST = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"] + +UEG_VECTORS = { + "b": np.array([1.0, 0.0, 1.0, 2.0, 2.0, 2.0, 0.0, 0.0]), + "d": np.array([1.0, 0.0, 1.0, 2.0, 2.0, 2.0, 0.0, 0.0]), +} + + +def ldax(n): + """ + LDA exchange energy density + Args: + n: Density + """ + return LDA_FACTOR * n ** (4.0 / 3) + + +def ldaxp(n): + """ + Fully spin-polarized LDA exchange energy density + Args: + n: Density + """ + return 0.5 * ldax(2 * n) + + +def lsda(nu, nd): + """ + LSDA exchange energy density + Args: + nu: Spin-up Density + nd: Spin-down Density + """ + return ldaxp(nu) + ldaxp(nd) + + +def get_ldax_dens(n): + """ + LDA exchange energy density + Args: + n: Density + """ + return LDA_FACTOR * n ** (4.0 / 3) + + +def get_ldax(n): + """ + LDA Exchange energy per particle + Args: + n: Density + """ + return LDA_FACTOR * n ** (1.0 / 3) + + +def get_xed_from_y(y, rho): + """ + Get the exchange energy density (n * epsilon_x) + from the exchange enhancement factor y + and density rho. + """ + return rho * get_x(y, rho) + + +def get_x(y, rho): + return (y + 1) * get_ldax(rho) + + +def get_y_from_xed(xed, rho): + """ + Get the exchange enhancement factor minus one. 
+ """ + return xed / (get_ldax_dens(rho) - 1e-12) - 1 + + +def get_gradient_magnitude(rho_data): + return np.linalg.norm(rho_data[1:4, :], axis=0) + + +def get_normalized_grad(rho, mag_grad): + sprefac = 2 * (3 * np.pi * np.pi) ** (1.0 / 3) + n43 = rho ** (4.0 / 3) + s = mag_grad / (sprefac * n43 + 1e-16) + return s + + +def get_single_orbital_tau(rho, mag_grad): + return mag_grad**2 / (8 * rho + 1e-16) + + +def get_uniform_tau(rho): + return (3.0 / 10) * (3 * np.pi**2) ** (2.0 / 3) * rho ** (5.0 / 3) + + +def get_regularized_tau(tau, tau_w, tau_unif): + alpha = (tau - tau_w) / (tau_unif + 1e-4) + return alpha**3 / (alpha**2 + 1e-3) + + +def get_normalized_tau(tau, tau_w, tau_unif): + return (tau - tau_w) / (tau_unif + 1e-16) + + +def get_dft_input(rho_data): + rho = rho_data[0, :] + mag_grad = get_gradient_magnitude(rho_data) + s = get_normalized_grad(rho, mag_grad) + tau_w = get_single_orbital_tau(rho, mag_grad) + tau_unif = get_uniform_tau(rho) + alpha = get_normalized_tau(rho_data[5], tau_w, tau_unif) + return rho, s, alpha, tau_w, tau_unif + + +def get_dft_input_gga(rho_data): + rho = rho_data[0, :] + mag_grad = get_gradient_magnitude(rho_data) + s = get_normalized_grad(rho, mag_grad) + return rho, s diff --git a/ciderpress/dft/baselines.py b/ciderpress/dft/baselines.py new file mode 100644 index 0000000..6c9349c --- /dev/null +++ b/ciderpress/dft/baselines.py @@ -0,0 +1,118 @@ +import numpy as np + +from ciderpress.density import LDA_FACTOR + + +def nsp_rho_basline(X0T, e=None, dedx=None): + nspin, nfeat, nsamp = X0T.shape + if e is None: + e = np.zeros(nsamp) + if dedx is None: + dedx = np.zeros_like(X0T) + nspin = X0T.shape[0] + e[:] += X0T[:, 0].mean(0) + dedx[0, :] += 1.0 / nspin + + +def _lda_x_helper(X0T, e, dedx): + rho = X0T[0] + e[:] += LDA_FACTOR * rho ** (4.0 / 3) + dedx[0] += 4.0 / 3 * LDA_FACTOR * rho ** (1.0 / 3) + + +def _vi_x_damp_helper(X0T, e, dedx): + rho = X0T[0] + damp = 2.0 / (1.0 + 0.5 * X0T[3]) ** 2 + ddamp = -1 * damp / (1.0 + 0.5 * X0T[3]) + e[:] += LDA_FACTOR * damp * rho ** (4.0 / 3) + dedx[0] += 4.0 / 3 * LDA_FACTOR * damp * rho ** (1.0 / 3) + dedx[3] += LDA_FACTOR * ddamp * rho ** (4.0 / 3) + + +def _pbe_x_helper(X0T, e, dedx): + rho = X0T[0] + s2 = X0T[1] + kappa = 0.804 + mu = 0.2195149727645171 + mk = mu / kappa + fac = 1.0 / (1 + mk * s2) + fx = 1 + kappa - kappa * fac + dfx = mu * fac * fac + e[:] += LDA_FACTOR * rho ** (4.0 / 3) * fx + dedx[0] += 4.0 / 3 * LDA_FACTOR * rho ** (1.0 / 3) * fx + dedx[1] += LDA_FACTOR * rho ** (4.0 / 3) * dfx + + +def _chachiyo_x_helper(X0T, e, dedx): + rho = X0T[0] + s2 = X0T[1] + c = 4 * np.pi / 9 + x = c * np.sqrt(s2) + dx = c / (2 * np.sqrt(s2)) + chfx = (3 * x**2 + np.pi**2 * np.log(1 + x)) / ( + (np.pi**2 + 3 * x) * np.log(1 + x) + ) + dchfx = ( + -3 * x**2 * (np.pi**2 + 3 * x) + + 3 * x * (1 + x) * (2 * np.pi**2 + 3 * x) * np.log(1 + x) + - 3 * np.pi**2 * (1 + x) * np.log(1 + x) ** 2 + ) / ((1 + x) * (np.pi**2 + 3 * x) ** 2 * np.log(1 + x) ** 2) + dchfx *= dx + chfx[s2 < 1e-8] = 1 + 8 * s2[s2 < 1e-8] / 27 + dchfx[s2 < 1e-8] = 8.0 / 27 + e[:] += LDA_FACTOR * rho ** (4.0 / 3) * chfx + dedx[0] += 4.0 / 3 * LDA_FACTOR * rho ** (1.0 / 3) * chfx + dedx[1] += LDA_FACTOR * rho ** (4.0 / 3) * dchfx + + +def _sl_x_helper(X0T, _helper): + nspin, nfeat, nsamp = X0T.shape + e = np.zeros(nsamp) + dedx = np.zeros_like(X0T) + nspin = X0T.shape[0] + for s in range(nspin): + _helper(X0T[s], e, dedx[s]) + e[:] /= nspin + dedx[:] /= nspin + return e, dedx + + +def zero_xc(X0T): + nspin, nfeat, nsamp = X0T.shape + e = 
np.zeros(nsamp) + dedx = np.zeros_like(X0T) + return e, dedx + + +def one_xc(X0T): + nspin, nfeat, nsamp = X0T.shape + e = np.ones(nsamp) + dedx = np.zeros_like(X0T) + return e, dedx + + +def lda_x(X0T): + return _sl_x_helper(X0T, _lda_x_helper) + + +def nlda_x_damp(X0T): + return _sl_x_helper(X0T, _vi_x_damp_helper) + + +def gga_x_chachiyo(X0T): + return _sl_x_helper(X0T, _chachiyo_x_helper) + + +def gga_x_pbe(X0T): + return _sl_x_helper(X0T, _pbe_x_helper) + + +BASELINE_CODES = { + "RHO": nsp_rho_basline, + "ZERO": zero_xc, + "ONE": one_xc, + "LDA_X": lda_x, + "NLDA_X_DAMP": nlda_x_damp, + "GGA_X_PBE": gga_x_pbe, + "GGA_X_CHACHIYO": gga_x_chachiyo, +} diff --git a/ciderpress/dft/debug_numint.py b/ciderpress/dft/debug_numint.py index f77e488..c6b4c30 100644 --- a/ciderpress/dft/debug_numint.py +++ b/ciderpress/dft/debug_numint.py @@ -2,8 +2,8 @@ import numpy as np +from ciderpress.dft.plans import get_cider_exponent from ciderpress.lib import load_library as load_cider_library -from ciderpress.new_dft.plans import get_cider_exponent libcider = load_cider_library("libmcider") diff --git a/ciderpress/dft/lcao_convolutions.py b/ciderpress/dft/lcao_convolutions.py new file mode 100644 index 0000000..f78e9f5 --- /dev/null +++ b/ciderpress/dft/lcao_convolutions.py @@ -0,0 +1,550 @@ +import ctypes + +import numpy as np +import scipy.special + +from ciderpress import lib +from ciderpress.dft.grids_indexer import AtomicGridsIndexer +from ciderpress.lib import load_library as load_cider_library + +libcider = load_cider_library("libmcider") +libcider.get_atco_nbas.restype = ctypes.c_int + +DEFAULT_AUX_BETA = 1.6 +DEFAULT_CIDER_LMAX = 10 +ATOM_OF = 0 +ANG_OF = 1 +PTR_EXP = 5 +PTR_COEFF = 6 + +IFEAT_ID_TO_CONTRIB = { + 0: 0, # squared-exponential + 1: 1, # R2 x squared-exp + 2: 2, # AR2 x squared-exp + 3: 7, # A x squared-exp + 4: 8, # A2R2 x squared-exp + 5: 9, # Laplacian + 6: (4, 5), # DR x squared-exp + 7: (3, 6), # A DR x squared-exp +} + + +def atom_loc_from_bas(bas): + natm = np.max(bas[:, ATOM_OF]) + 1 # max ia + 1 is number of atoms + atom_loc = np.zeros(natm + 1, dtype=np.int32) + for b in bas: + atom_loc[b[ATOM_OF] + 1] += 1 + atom_loc = np.cumsum(atom_loc) + return np.asarray(atom_loc, dtype=np.int32, order="C") + + +# from PySCF, helper function for normalization coeff +def gaussian_int(n, alpha): + n1 = (n + 1) * 0.5 + return scipy.special.gamma(n1) / (2.0 * alpha**n1) + + +# from PySCF, helper function for normalization coeff +def gto_norm(l, expnt): + if np.all(l >= 0): + return 1 / np.sqrt(gaussian_int(l * 2 + 2, 2 * expnt)) + else: + raise ValueError("l should be >= 0") + + +def get_gamma_lists_from_bas_and_env(bas, env): + natm = np.max(bas[:, ATOM_OF]) + 1 + atom2l0 = [0] + lmaxs = [] + gamma_loc = [0] + all_exps = [] + all_coefs = [] + for ia in range(natm): + ls = bas[bas[:, ATOM_OF] == ia, ANG_OF] + # ls must be ascending + assert np.all(ls[1:] >= ls[:-1]) + lmax = np.max(ls) + exps = [] + coefs = [] + lmaxs.append(np.max(ls)) + for l in range(lmax + 1): + cond = np.logical_and(bas[:, ATOM_OF] == ia, bas[:, ANG_OF] == l) + exps_l = env[bas[cond, PTR_EXP]] + coefs_l = env[bas[cond, PTR_COEFF]] + gamma_loc.append(len(exps_l)) + exps.append(exps_l) + coefs.append(coefs_l) + all_exps.append(np.concatenate(exps)) + all_coefs.append(np.concatenate(coefs)) + atom2l0.append(len(gamma_loc) - 1) + gamma_loc = np.asarray(np.cumsum(gamma_loc), dtype=np.int32, order="C") + lmaxs = np.asarray(lmaxs, dtype=np.int32, order="C") + all_exps = np.asarray(np.concatenate(all_exps), dtype=np.float64, 
order="C") + all_coefs = np.asarray(np.concatenate(all_coefs), dtype=np.float64, order="C") + atom2l0 = np.asarray(atom2l0, dtype=np.int32, order="C") + return atom2l0, lmaxs, gamma_loc, all_coefs, all_exps + + +def get_convolution_expnts_from_expnts( + gammas, atom2l0, lmaxs, gamma_loc, all_exps, gbuf=np.inf +): + new_gamma_loc = [0] + new_all_exps = [] + new_all_coefs = [] + for ia in range(len(atom2l0) - 1): + lmax = lmaxs[ia] + new_exps = [] + new_coefs = [] + for l in range(lmax + 1): + loc0 = gamma_loc[atom2l0[ia] + l] + loc1 = gamma_loc[atom2l0[ia] + l + 1] + old_exps = all_exps[loc0:loc1] + gmax = np.max(old_exps) + cond = gammas < gmax * gbuf + new_exps_l = gammas[cond] + new_coefs_l = gto_norm(l, gammas[cond]) + new_gamma_loc.append(len(new_exps_l)) + new_exps.append(new_exps_l) + new_coefs.append(new_coefs_l) + new_all_exps.append(np.concatenate(new_exps)) + new_all_coefs.append(np.concatenate(new_coefs)) + gamma_loc = np.asarray(np.cumsum(new_gamma_loc), dtype=np.int32, order="C") + lmaxs = np.asarray(lmaxs, dtype=np.int32, order="C") + all_exps = np.asarray(np.concatenate(new_all_exps), dtype=np.float64, order="C") + all_coefs = np.asarray(np.concatenate(new_all_coefs), dtype=np.float64, order="C") + assert all_exps.size == all_coefs.size == gamma_loc[-1] + atom2l0 = np.asarray(atom2l0, dtype=np.int32, order="C") + return atom2l0, lmaxs, gamma_loc, all_coefs, all_exps + + +class ATCBasis: + """ + A wrapper for the atc_basis_set C object, which is used to store + a simple (but usually large), uncontracted Gaussian basis along + with various index lists and other data for use in performing + projections and manipulations of data in this basis on individual + atoms. + """ + + def __init__(self, atom2l0, lmaxs, gamma_loc, coefs, exps, lower=True): + """ + Initialize ATCBasis. + + Args: + atom2l0 (np.ndarray): int32 array of length natm + 1 such that + gamma_loc[atom2l0[ia] : atom2l0[ia + 1]] contains the global + gamma_loc indexes for atom ia. + lmaxs (np.ndarray) : int32, maximum l value for each atom. + gamma_loc (np.ndarray): int32, a shell indexing list. + gamma_loc[atom2l0[ia] + l] is the index of the first shell of + atom ia with angular momentum l. + [gamma_loc[atom2l0[ia] + l], atom2lo[ia] + l + 1]) is the range + of indexes with atom ia and angular momentum l. These ranges + are used to index coefs and exps. + coefs (np.ndarray): float64, list of coefficients for + normalizing Gaussian basis functions. + exps (np.ndarray): float64, list of exponents for Gaussian + basis functions. + lower (bool): Whether to store lower or upper matrix for + Cholesky factorizations within the C atc_basis_set object. 
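As a usage illustration (not part of the patch), here is a minimal sketch of building an ATCBasis from a PySCF molecule with the helper defined above. It assumes an uncontracted basis (one primitive per shell) and that the shells on each atom are ordered by increasing angular momentum, which get_gamma_lists_from_bas_and_env asserts:

    from pyscf import gto

    from ciderpress.dft.lcao_convolutions import (
        ATCBasis,
        get_gamma_lists_from_bas_and_env,
    )

    # "unc-" prefix requests an uncontracted basis (assumption for this sketch)
    mol = gto.M(atom="Ne 0 0 0", basis="unc-ccpvdz")
    dat = get_gamma_lists_from_bas_and_env(mol._bas, mol._env)
    atco = ATCBasis(*dat)  # (atom2l0, lmaxs, gamma_loc, coefs, exps)
    print(atco.natm, atco.nbas, atco.nao)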
+ """ + atco = lib.c_null_ptr() + gamma_loc = np.asarray(gamma_loc, dtype=np.int32, order="C") + lmaxs = np.asarray(lmaxs, dtype=np.int32, order="C") + exps = np.asarray(exps, dtype=np.float64, order="C") + coefs = np.asarray(coefs, dtype=np.float64, order="C") + atom2l0 = np.asarray(atom2l0, dtype=np.int32, order="C") + self.natm = len(atom2l0) - 1 + libcider.generate_atc_basis_set( + ctypes.byref(atco), + atom2l0.ctypes.data_as(ctypes.c_void_p), + lmaxs.ctypes.data_as(ctypes.c_void_p), + gamma_loc.ctypes.data_as(ctypes.c_void_p), + exps.ctypes.data_as(ctypes.c_void_p), + coefs.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(len(atom2l0) - 1), + ctypes.c_char(b"L" if lower else b"U"), + ) + self._atco = atco + + def __del__(self): + """Free C data associated with this object.""" + libcider.free_atc_basis_set(self._atco) + + @property + def atco_c_ptr(self): + """Get a pointer to the C data for this object.""" + return self._atco + + @property + def ao_loc(self): + ao_loc = np.cumsum(2 * self.bas[:, ANG_OF] + 1) + ao_loc = np.append([0], ao_loc) + return np.asarray(ao_loc, dtype=np.int32, order="C") + + @property + def atom_loc(self): + return atom_loc_from_bas(self.bas) + + @property + def nao(self): + """Get the number of atomic orbitals in this atom-centered basis.""" + return libcider.get_atco_nao(self._atco) + + @property + def nbas(self): + """Get the number of shells in this atom-centered basis.""" + return libcider.get_atco_nbas(self._atco) + + @property + def bas(self): + """Get a copy of the shell information for this basis.""" + bas = np.zeros((self.nbas, 8), order="C", dtype=np.int32) + libcider.get_atco_bas( + bas.ctypes.data_as(ctypes.c_void_p), + ctypes.cast(self._atco, ctypes.c_void_p), + ) + return bas + + @property + def env(self): + """Get a copy of the exponents and coefficients for this basis.""" + nbas = libcider.get_atco_nbas(self._atco) + env = np.zeros((2 * nbas,), order="C", dtype=np.float64) + libcider.get_atco_env(env.ctypes.data_as(ctypes.c_void_p), self._atco) + return env + + def convert_rad2orb_( + self, theta_rlmq, p_uq, loc, rads, rad2orb=True, offset=None, zero_output=True + ): + """ + Take a set of functions stored on radial grids and spherical harmonics + and project it onto the atomic orbital basis. This is an in-place + operation, where theta_rlmq is written if rad2orb is False and + p_uq is written if rad2orb is True. + - If rad2orb is True, loc has + shape (natm + 1,), and theta_rlmq[loc[ia] : loc[ia+1]] is + the set of functions on atom ia. This corresponds to + the ra_loc member of an AtomicGridsIndexer + - If rad2orb is False, loc has shape (nrad,), and theta_rlmq[r] + is located on atom loc[r]. This corresponds to the ar_loc + member of an AtomicGridsIndexer + + Args: + theta_rlmq (np.ndarray): float64, shape (nrad, nlm, nalpha) + p_uq (np.ndarray): float64, shape (ngrids, nalpha) + loc (np.ndarray, AtomicGridsIndexer): + An int32 array whose shape depends on whether rad2orb is True. + rads (np.ndarray): float64 list of radial coordinates with size + theta_rlmq.shape[0]. Corresponds to rad_arr member of + AtomicGridsIndexer. + rad2orb (bool): Whether to convert radial coordinates to orbital + basis (True) or vice versa (False). + offset (int): starting index in p_uq to add to/from + zero_output (bool): Whether to set output zero before + adding to it. Default to true. Set to False if you + want to add to the existing values in the output + array. 
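To illustrate the expected array shapes (not part of the patch), a schematic single-atom call might look like the following; atco is an ATCBasis instance (e.g. from the earlier sketch), and the radial grid and loc array are invented for the example:

    import numpy as np

    nrad, nalpha, lmax = 50, 4, 4                 # illustrative sizes
    nlm = (lmax + 1) ** 2
    rads = np.linspace(1e-3, 5.0, nrad)           # radial coordinates
    loc = np.asarray([0, nrad], dtype=np.int32)   # ra_loc-style bounds for one atom
    theta_rlmq = np.zeros((nrad, nlm, nalpha))    # data on (radius, Y_lm, alpha)
    p_uq = np.zeros((atco.nao, nalpha))           # projection onto the orbital basis
    atco.convert_rad2orb_(theta_rlmq, p_uq, loc, rads, rad2orb=True)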
+ """ + if isinstance(loc, AtomicGridsIndexer): + if rad2orb: + loc = loc.ra_loc + else: + loc = loc.ar_loc + nalpha = theta_rlmq.shape[-1] + stride = p_uq.shape[1] + if offset is None: + offset = 0 + else: + assert offset >= 0 + nrad = rads.size + assert loc.flags.c_contiguous + assert loc.dtype == np.int32 + assert loc.ndim == 1 + assert rads.flags.c_contiguous + assert rads.dtype == np.float64 + assert theta_rlmq.ndim == 3 + assert theta_rlmq.shape[0] == nrad + assert theta_rlmq.flags.c_contiguous + assert theta_rlmq.dtype == np.float64 + assert p_uq.ndim == 2 + assert p_uq.shape[0] == self.nao + assert p_uq.flags.c_contiguous + assert p_uq.dtype == np.float64 + assert offset + nalpha <= stride + if rad2orb: + assert loc.size == self.natm + 1 + fn = libcider.contract_rad_to_orb + if zero_output: + p_uq[:, offset : offset + nalpha] = 0.0 + else: + assert loc.size == nrad + fn = libcider.contract_orb_to_rad + if zero_output: + theta_rlmq[:] = 0.0 + fn( + theta_rlmq.ctypes.data_as(ctypes.c_void_p), + p_uq.ctypes.data_as(ctypes.c_void_p), + loc.ctypes.data_as(ctypes.c_void_p), + rads.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(rads.size), + ctypes.c_int(theta_rlmq.shape[1]), + self.atco_c_ptr, + ctypes.c_int(nalpha), + ctypes.c_int(stride), + ctypes.c_int(offset), + ) + + +class ConvolutionCollection: + """ + ConvolutionCollection is an object to assist in performing + nonlocal density feature convolutions in a Gaussian basis set. + It stores Gaussian integrals corresponding to convolutions + of the input basis atco_inp by the squared-exponential + functions with exponents alphas, projected onto the output + basis atco_out. One can perform forward and backward convolutions. + """ + + is_vk = False + + def __init__( + self, + atco_inp, + atco_out, + alphas, + alpha_norms, + has_vj=True, + ifeat_ids=None, + ): + """ + Initialize ConvolutionCollection. + + Args: + atco_inp (ATCBasis): Input basis for convolutions + atco_out (ATCBasis): Output basis for convolutions + alphas (np.ndarray): float64 list of interpolating exponents + alpha_norms (np.ndarray): float64 list of normalization + factors for interpolation exponent functions. + has_vj (bool): Whether to perform version j convolutions. + ifeat_ids (list): Version i feature contributions to compute. + Must be in strictly increasing order so that output + can be processed by interpolation routines. 
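To make the intended workflow concrete, here is a hedged sketch (not part of the patch) that builds an output basis from a set of convolution exponents and applies the collection. The exponents, norms, and ifeat_ids are illustrative only, and dat is the tuple returned by get_gamma_lists_from_bas_and_env in the earlier sketch:

    import numpy as np

    from ciderpress.dft.lcao_convolutions import (
        ATCBasis,
        ConvolutionCollection,
        get_convolution_expnts_from_expnts,
    )

    atco_inp = ATCBasis(*dat)                      # input (projection) basis
    gammas = 0.02 * 1.8 ** np.arange(20)           # illustrative even-tempered exponents
    dat_out = get_convolution_expnts_from_expnts(
        gammas, dat[0], dat[1], dat[2], dat[4]
    )
    atco_out = ATCBasis(*dat_out)                  # output (convolved) basis

    alphas = 0.05 * 2.0 ** np.arange(8)            # interpolation exponents (illustrative)
    alpha_norms = np.ones_like(alphas)             # assumed trivial normalization
    ccl = ConvolutionCollection(
        atco_inp, atco_out, alphas, alpha_norms, has_vj=True, ifeat_ids=[0, 1]
    )
    ccl.compute_integrals_()
    ccl.solve_projection_coefficients()

    theta_uq = np.zeros((atco_inp.nao, ccl.nalpha))  # projected input, e.g. from convert_rad2orb_
    out = ccl.multiply_atc_integrals(theta_uq, fwd=True)
    # out has shape (atco_out.nao, ccl.nbeta)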
+ """ + self.atco_inp = atco_inp + self.atco_out = atco_out + self._alphas = np.ascontiguousarray(alphas, dtype=np.float64) + self._alpha_norms = np.ascontiguousarray(alpha_norms, dtype=np.float64) + self._nalpha = self._alphas.size + assert self._alpha_norms.size == self._nalpha + if ifeat_ids is None: + ifeat_ids = [] + icontrib0_ids = [] + icontrib1m_ids = [] + icontrib1p_ids = [] + for ifid in ifeat_ids: + if ifid not in IFEAT_ID_TO_CONTRIB: + raise ValueError("Unrecognized feature") + icid = IFEAT_ID_TO_CONTRIB[ifid] + if isinstance(icid, int): + if len(icontrib1m_ids) > 0: + raise ValueError("l0 features must preceed l1 in ifeat_ids") + icontrib0_ids.append(icid) + else: + assert len(icid) == 2 + icontrib1m_ids.append(icid[0]) + icontrib1p_ids.append(icid[1]) + self._ifeat_ids = ifeat_ids + self._icontrib0_ids = icontrib0_ids + self._icontrib1m_ids = icontrib1m_ids + self._icontrib1p_ids = icontrib1p_ids + self._icontrib_ids = np.ascontiguousarray( + np.concatenate([icontrib0_ids, icontrib1m_ids, icontrib1p_ids]), + dtype=np.int32, + ) + if len(self._icontrib_ids) == 0 and not has_vj: + raise ValueError("Trying to initialize an empty collection") + self._nbeta = len(self._icontrib_ids) + if has_vj: + self._nbeta += self._nalpha + self._has_vj = has_vj + self._integrals_initialized = False + self._ccl = lib.c_null_ptr() + libcider.generate_convolution_collection( + ctypes.byref(self._ccl), + self.atco_inp.atco_c_ptr, + self.atco_out.atco_c_ptr, + self._alphas.ctypes.data_as(ctypes.c_void_p), + self._alpha_norms.ctypes.data_as(ctypes.c_void_p), + self._icontrib_ids.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(self._nalpha), + ctypes.c_int(len(self._icontrib_ids)), + ctypes.c_int(1 if self._has_vj else 0), + ) + + def __del__(self): + """Free C data associated with this object.""" + libcider.free_convolution_collection(self._ccl) + + @property + def n0(self): + n0 = len(self._icontrib0_ids) + if self._has_vj: + n0 += self.nalpha + return n0 + + @property + def n1(self): + return len(self._icontrib1m_ids) + + @property + def nalpha(self): + """Number of inputs for the convolutions.""" + return self._nalpha + + @property + def nbeta(self): + """Total number of outputs of the convolutions.""" + return self._nbeta + + @property + def num_out(self): + return self.nbeta + + def compute_integrals_(self): + """Compute and store the Gaussian integrals for + performing convolutions.""" + if not self._integrals_initialized: + libcider.generate_atc_integrals_all(self._ccl) + self._integrals_initialized = True + + def solve_projection_coefficients(self): + """Multiply convolution integral tensor by the inverse + overlap matrices of the input and output bases.""" + if not self._integrals_initialized: + self.compute_integrals_() + libcider.solve_atc_coefs(self._ccl) + + def multiply_atc_integrals(self, input, output=None, fwd=True): + """ + Perform the convolutions from the input orbital basis + to the output orbital basis. If fwd is True, the input orbital + basis is atco_inp and the output orbital basis is atco_out. + If fwd is False, the input orbital basis is atco_out and the + output orbital basis is atco_inp (i.e. the convolution is done + in reverse.) + + Args: + input (np.ndarray) : float64 array with shape + (atco_inp, nalpha) for fwd=True and (atco_out, nbeta) + for fwd=False + output (np.ndarray, optional) : float64 array with shape + (atco_out, nbeta) for fwd=True and (atco_inp, nalpha) + for fwd=False. NOTE: If passed, should be all zeros. 
+ If None, output is initialized within the function + and then returned. + fwd (bool): Whether convolution is performed forward or backward. + + Returns: + output + """ + if output is None: + if fwd: + output = np.zeros((self.atco_out.nao, self.nbeta)) + else: + output = np.zeros((self.atco_inp.nao, self.nalpha)) + if not self._integrals_initialized: + raise RuntimeError("Must initialize integrals before calling") + assert input.flags.c_contiguous + assert output.flags.c_contiguous + if fwd: + assert input.shape == (self.atco_inp.nao, self.nalpha), ( + input.shape, + (self.atco_inp.nao, self.nalpha), + ) + assert output.shape == (self.atco_out.nao, self.nbeta), ( + output.shape, + (self.atco_out.nao, self.nbeta), + ) + else: + assert output.shape == (self.atco_inp.nao, self.nalpha), ( + output.shape, + (self.atco_inp.nao, self.nalpha), + ) + assert input.shape == (self.atco_out.nao, self.nbeta), ( + input.shape, + (self.atco_out.nao, self.nbeta), + ) + libcider.multiply_atc_integrals( + input.ctypes.data_as(ctypes.c_void_p), + output.ctypes.data_as(ctypes.c_void_p), + self._ccl, + ctypes.c_int(1 if fwd else 0), + ) + return output + + +class ConvolutionCollectionK(ConvolutionCollection): + """ + Modified ConvolutionCollection for version k features. This is needed + because multiplication by the ovlp_mats tensor in C is different + for version k. + """ + + is_vk = True + + def __init__(self, atco_inp, atco_out, alphas, alpha_norms): + """Simplified initializer for version K features. + See ConvolutionCollection.__init__ for details. feat_id is the + same as ifeat_ids except only one input is allowed instead of a list.""" + super(ConvolutionCollectionK, self).__init__( + atco_inp, atco_out, alphas, alpha_norms, has_vj=False, ifeat_ids=[0] + ) + + @property + def n0(self): + return self.nalpha + + @property + def num_out(self): + return self.nalpha + + def multiply_atc_integrals(self, input, output=None, fwd=True): + """ + Modified multiply_atc_integrals function for version k + + Args: + input (np.ndarray) : float64 array with shape + (atco_inp, nalpha) for fwd=True and (atco_out, nalpha) + for fwd=False + output (np.ndarray, optional) : float64 array with shape + (atco_out, nalpha) for fwd=True and (atco_inp, nalpha) + for fwd=False. NOTE: If passed, should be all zeros. + If None, output is initialized within the function + and then returned. + fwd (bool): Whether convolution is performed forward or backward. 
+ + Returns: + output + """ + if fwd: + atco_inp = self.atco_inp + atco_out = self.atco_out + else: + atco_inp = self.atco_out + atco_out = self.atco_inp + if output is None: + output = np.zeros((atco_inp.nao, self.nalpha)) + if not self._integrals_initialized: + raise RuntimeError("Must initialize integrals before calling") + assert input.flags.c_contiguous + assert output.flags.c_contiguous + assert input.shape == (atco_inp.nao, self.nalpha) + assert output.shape == (atco_out.nao, self.nalpha) + libcider.multiply_atc_integrals_vk( + input.ctypes.data_as(ctypes.c_void_p), + output.ctypes.data_as(ctypes.c_void_p), + self._ccl, + ctypes.c_int(1 if fwd else 0), + ) + return output diff --git a/ciderpress/dft/lcao_interpolation.py b/ciderpress/dft/lcao_interpolation.py index 2c15f41..313ff0a 100644 --- a/ciderpress/dft/lcao_interpolation.py +++ b/ciderpress/dft/lcao_interpolation.py @@ -3,13 +3,13 @@ import numpy as np from ciderpress import lib -from ciderpress.dft.sph_harm_coeff import get_deriv_ylm_coeff -from ciderpress.new_dft.lcao_convolutions import ( +from ciderpress.dft.lcao_convolutions import ( ATCBasis, atom_loc_from_bas, get_gamma_lists_from_bas_and_env, libcider, ) +from ciderpress.dft.sph_harm_coeff import get_deriv_ylm_coeff ATOM_OF = 0 ANG_OF = 1 diff --git a/ciderpress/dft/lcao_nldf_generator.py b/ciderpress/dft/lcao_nldf_generator.py index 3b63ab2..7c2eb54 100644 --- a/ciderpress/dft/lcao_nldf_generator.py +++ b/ciderpress/dft/lcao_nldf_generator.py @@ -1,6 +1,6 @@ import numpy as np -from ciderpress.new_dft.lcao_interpolation import LCAOInterpolatorDirect +from ciderpress.dft.lcao_interpolation import LCAOInterpolatorDirect class LCAONLDFGenerator: diff --git a/ciderpress/dft/plans.py b/ciderpress/dft/plans.py index 6f6f187..96f521a 100644 --- a/ciderpress/dft/plans.py +++ b/ciderpress/dft/plans.py @@ -8,8 +8,7 @@ from scipy.special import gamma from ciderpress import lib -from ciderpress.lib import load_library as load_cider_library -from ciderpress.new_dft.settings import ( +from ciderpress.dft.settings import ( NLDFSettings, SDMXBaseSettings, SDMXFullSettings, @@ -19,6 +18,7 @@ get_cider_exponent, get_s2, ) +from ciderpress.lib import load_library as load_cider_library libcider = load_cider_library("libmcider") diff --git a/ciderpress/dft/settings.py b/ciderpress/dft/settings.py index bc6e47c..e07bf0c 100644 --- a/ciderpress/dft/settings.py +++ b/ciderpress/dft/settings.py @@ -3,7 +3,7 @@ import numpy as np from scipy.special import gamma as gamma_func -from ciderpress.new_dft.feat_normalizer import ( +from ciderpress.dft.feat_normalizer import ( ConstantNormalizer, DensityNormalizer, FeatNormalizerList, diff --git a/ciderpress/dft/sph_harm_coeff.py b/ciderpress/dft/sph_harm_coeff.py new file mode 100644 index 0000000..6c5685f --- /dev/null +++ b/ciderpress/dft/sph_harm_coeff.py @@ -0,0 +1,352 @@ +import ctypes + +import numpy as np +from sympy.physics.wigner import clebsch_gordan, gaunt + +from ciderpress.lib import load_library as load_cider_library + +libcider = load_cider_library("libmcider") + + +def change_basis_real_to_complex(l: int) -> np.ndarray: + # https://en.wikipedia.org/wiki/Spherical_harmonics#Real_form + q = np.zeros((2 * l + 1, 2 * l + 1), dtype=np.complex128) + for m in range(-l, 0): + q[l + m, l + abs(m)] = 1 / np.sqrt(2) + q[l + m, l - abs(m)] = -1j / np.sqrt(2) + q[l, l] = 1 + for m in range(1, l + 1): + q[l + m, l + abs(m)] = (-1) ** m / np.sqrt(2) + q[l + m, l - abs(m)] = 1j * (-1) ** m / np.sqrt(2) + return ( + -1j + ) ** l * q # Added factor of 
1j**l to make the Clebsch-Gordan coefficients real
+
+
+def su2_clebsch_gordan(l1, l2, l3):
+    mat = np.zeros(
+        [
+            2 * l1 + 1,
+            2 * l2 + 1,
+            2 * l3 + 1,
+        ]
+    )
+    for m1 in range(2 * l1 + 1):
+        for m2 in range(2 * l2 + 1):
+            for m3 in range(2 * l3 + 1):
+                mat[m1, m2, m3] = clebsch_gordan(l1, l2, l3, m1 - l1, m2 - l2, m3 - l3)
+    return mat
+
+
+def clebsch_gordan_e3nn(l1: int, l2: int, l3: int) -> np.ndarray:
+    r"""The Clebsch-Gordan coefficients of the real irreducible representations of :math:`SO(3)`.
+
+    Args:
+        l1 (int): the representation order of the first irrep
+        l2 (int): the representation order of the second irrep
+        l3 (int): the representation order of the third irrep
+
+    Returns:
+        np.ndarray: the Clebsch-Gordan coefficients
+    """
+    C = su2_clebsch_gordan(l1, l2, l3)
+    Q1 = change_basis_real_to_complex(l1)
+    Q2 = change_basis_real_to_complex(l2)
+    Q3 = change_basis_real_to_complex(l3)
+    C = np.einsum("ij,kl,mn,ikn->jlm", Q1, Q2, np.conj(Q3.T), C)
+
+    assert np.all(np.abs(np.imag(C)) < 1e-5)
+    return np.real(C)
+
+
+def get_deriv_coeffs(lmax):
+    """
+    Args:
+        lmax: Maximum l value for coefficients
+
+    Returns:
+        np.ndarray: (3, nlm) array of Gaunt coefficients
+    """
+    # NOTE: IX, IY, IZ are assumed Cartesian component indices (x, y, z);
+    # they were not defined elsewhere in this module.
+    IX, IY, IZ = 0, 1, 2
+    nlm = (lmax + 1) * (lmax + 1)
+    coeff = np.zeros((3, nlm))
+    # Start at l = 1; the l = 0 harmonic is constant, so its row stays zero.
+    for l in range(1, lmax + 1):
+        for m in range(l + 1):
+            lm = l * l + m
+            coeff[IZ, lm] = gaunt(1, l, l - 1, 0, m, m).n(64)
+            coeff[IX, lm] = gaunt(1, l, l - 1, 1, m, m + 1).n(64)
+            coeff[IY, lm] = gaunt(1, l, l - 1, -1, m, m - 1).n(64)
+    return coeff
+
+
+def get_ylm(r, lmax):
+    N = r.shape[0]
+    nlm = (lmax + 1) * (lmax + 1)
+    rnorm = np.linalg.norm(r, axis=1)
+    rhat = r / rnorm[:, np.newaxis]
+    ylm = np.zeros((N, nlm))
+    libcider.recursive_sph_harm_vec(
+        ctypes.c_int(nlm),
+        ctypes.c_int(N),
+        rhat.ctypes.data_as(ctypes.c_void_p),
+        ylm.ctypes.data_as(ctypes.c_void_p),
+    )
+    xyz = ylm[:, 1:4].copy()
+    ylm[:, 3] = xyz[:, 0]
+    ylm[:, 1] = xyz[:, 1]
+    ylm[:, 2] = xyz[:, 2]
+    for l in range(lmax + 1):
+        for m in range(2 * l + 1):
+            lm = l * l + m
+            ylm[:, lm] *= rnorm**l
+    return ylm
+
+
+def get_ylm_fd(i, r, lmax, delta=1e-6):
+    rx = r.copy()
+    rx[:, i] += 0.5 * delta
+    ylmp = get_ylm(rx, lmax)
+    rx[:, i] -= delta
+    ylmm = get_ylm(rx, lmax)
+    dylm = (ylmp - ylmm) / delta
+    return dylm
+
+
+def get_ylm_ad(r, lmax):
+    nlm = (lmax + 1) * (lmax + 1)
+    rnorm = np.linalg.norm(r, axis=1)
+    rhat = r / rnorm[:, np.newaxis]
+    N = r.shape[0]
+    print("N", N)
+    ylm = np.zeros((N, nlm))
+    dylm_tmp = np.zeros((N, 3, nlm))
+    dylm = np.empty((4, N, nlm))
+    libcider.recursive_sph_harm_deriv_vec(
+        ctypes.c_int(nlm),
+        ctypes.c_int(N),
+        rhat.ctypes.data_as(ctypes.c_void_p),
+        ylm.ctypes.data_as(ctypes.c_void_p),
+        dylm_tmp.ctypes.data_as(ctypes.c_void_p),
+    )
+    dylm[0] = ylm
+    dylm[1:] = dylm_tmp.transpose(1, 0, 2)
+    subval = np.einsum("vgl,gv->gl", dylm[1:], rhat)
+    print("SUBVAL", np.linalg.norm(subval))
+    for l in range(lmax + 1):
+        rlm1 = rnorm ** (l - 1)
+        rl = rnorm**l
+        for m in range(2 * l + 1):
+            lm = l * l + m
+            dylm[0, :, lm] *= rl
+            dylm[1:, :, lm] *= rlm1
+            dylm[1:, :, lm] += ylm[:, lm] * l * rlm1 * rhat.T
+    xyz = dylm[..., 1:4].copy()
+    dylm[..., 3] = xyz[..., 0]
+    dylm[..., 1] = xyz[..., 1]
+    dylm[..., 2] = xyz[..., 2]
+    return dylm
+
+
+def get_deriv_ylm_coeff(lmax):
+    nlm = (lmax + 1) * (lmax + 1)
+    gaunt_coeff = np.zeros((5, nlm))
+    for l in range(lmax + 1):
+        fe = clebsch_gordan_e3nn(l, 1, l + 1)
+        for im in range(2 * l + 1):
+            lm = l * l + im
+            m = abs(im - l)
+            l1m = l + 1 - m
+            fac = ((2 * l + 3) * (l + 1)) ** 0.5
+            gaunt_coeff[0, lm] = fac * fe[im, 2, im]
+            gaunt_coeff[1, lm] = fac * fe[im, 2, im + 2]
+            gaunt_coeff[2, lm]
= fac * fe[im, 0, 2 * l - im] + gaunt_coeff[3, lm] = fac * fe[im, 0, 2 * l - im + 2] + gaunt_coeff[4, lm] = np.sqrt( + ((2.0 * l + 3) * l1m * (l + 1 + m) / (2 * l + 1)) + ) + return gaunt_coeff + + +def get_deriv_ylm_coeff_v2(lmax): + nlm = (lmax + 1) * (lmax + 1) + gaunt_coeff = np.zeros((5, nlm)) + for l in range(lmax + 1): + fe = clebsch_gordan_e3nn(l, 1, l + 1) + for im in range(2 * l + 1): + lm = l * l + im + m = abs(im - l) + l1m = l + 1 - m + fac = ((2 * l + 3) * (l + 1)) ** 0.5 + gaunt_coeff[0, lm] = fac * fe[im, 2, im] + gaunt_coeff[1, lm] = fac * fe[im, 2, im + 2] + gaunt_coeff[2, lm] = fac * fe[im, 0, 2 * l - im] + gaunt_coeff[3, lm] = fac * fe[im, 0, 2 * l - im + 2] + # gaunt_coeff[:4, lm] *= np.sqrt(2 * m + 1) * (-1)**l + gaunt_coeff[4, lm] = np.sqrt( + ((2.0 * l + 3) * l1m * (l + 1 + m) / (2 * l + 1)) + ) + return gaunt_coeff + + +if __name__ == "__main__": + from numpy.testing import assert_allclose, assert_almost_equal + + np.random.seed(32) + lmax = 10 + N = 15 + nlm = (lmax + 1) * (lmax + 1) + r = np.random.normal(size=(N, 3)) + dylm_ref = np.zeros((3, N, nlm)) + ylm0 = get_ylm(r, lmax) + for i in range(3): + dylm_ref[i] = get_ylm_fd(i, r, lmax) + dylm_ref2 = get_ylm_ad(r, lmax) + for lm in range(0, nlm): + assert_allclose(ylm0[:, lm], dylm_ref2[0, :, lm], rtol=1e-6, atol=1e-5) + assert_allclose(dylm_ref[0, :, lm], dylm_ref2[1, :, lm], rtol=1e-6, atol=1e-5) + assert_allclose(dylm_ref[1, :, lm], dylm_ref2[2, :, lm], rtol=1e-6, atol=1e-5) + assert_allclose(dylm_ref[2, :, lm], dylm_ref2[3, :, lm], rtol=1e-6, atol=1e-5) + + import time + + t0 = time.monotonic() + gaunt_coeff = get_deriv_ylm_coeff(lmax) + t1 = time.monotonic() + print("TIME TO GEN COEFF", t1 - t0) + print("GAUNT", gaunt_coeff) + ylm0_fit = np.zeros((4, N, nlm)) + igrad = 0 + for l in range(lmax): + mmax = 2 * l + 1 + for m in range(2 * l + 1): + """ + if igrad == 0: + lm = l * l + m + else: + lm = (l+1) * (l+1) - 1 - m + lm0 = lm + 2 * l + 1 + lm1 = lm + 2 * l + 3 + ylm0_fit[0, :, lm0] = ylm0[:, lm] + ylm0_fit[1, :, lm1] = ylm0[:, lm] + """ + lm = l * l + m + if igrad == 0: + lmt = lm + else: + lmt = (l + 1) * (l + 1) - 1 - m + lm0 = lmt + 2 * l + 1 + lm1 = lmt + 2 * l + 3 + # lm2 = lmh + 2 * l + 1 + # lm3 = lmh + 2 * l + 3 + if igrad == 0: + ylm0_fit[0, :, lm0] = ylm0[:, lm] + ylm0_fit[1, :, lm1] = ylm0[:, lm] + else: + ylm0_fit[0, :, lm0] = ylm0[:, lm] + ylm0_fit[1, :, lm1] = ylm0[:, lm] + for l in range(lmax + 1): + mmax = 2 * l + 1 + for m in range(2 * l + 1): + lm = l * l + m + X = ylm0_fit[:, :, lm].T + X = [] + inds = [] + mm = m - l + am = abs(mm) + f1 = ( + clebsch_gordan(l - 1, 1, l, mm - 1, 1, mm).n(64) + * ((2 * l + 1) * l) ** 0.5 + / (2) ** 0.5 + ) + f2 = ( + clebsch_gordan(l - 1, 1, l, mm + 1, -1, mm).n(64) + * ((2 * l + 1) * l) ** 0.5 + / (2) ** 0.5 + * -1 + ) + if l != 0: + fe = clebsch_gordan_e3nn(l - 1, 1, l) + print(mm, m, l, fe.shape) + # print(fe) + fac = ((2 * l + 1) * l) ** 0.5 + if igrad == 0: + igg = 2 + mmm = m + m1m = m - 2 + m1p = m + elif igrad == 1: + igg = 0 + mmm = m + # fac *= -1 + m1p = 2 * l - m - 2 + m1m = 2 * l - m + else: + igg = 1 + mmm = m + m1m = m - 1 + m1p = m - 1 + # igg = 2 if igrad == 0 else 0 + # mmm = m if igrad == 0 else 2 * l - m + fxp = fe[m1p, igg, mmm] if mm < l - 1 else 0 + fxp *= fac + fxm = fe[m1m, igg, mmm] if mm > -l + 1 else 0 + fxm *= fac + else: + fxp = fxm = 0 + if mm == 0: + f1 *= np.sqrt(2) + f2 *= np.sqrt(2) + # print('FACTORC', float(f1), float(f2)) + print("FACTORB", fxp, fxm) + if igrad == 1: + mvals = [ + mm + 1, + mm - 1, + ] + else: + mvals 
= [ + -mm - 1, + -mm + 1, + ] + # print('MVALS', mvals) + for i in range(2): + # print('MM', m, mm, mvals[i]) + if abs(mvals[i]) >= l: + continue + if np.linalg.norm(ylm0_fit[i, :, lm]) == 0: + continue + for x in X: + if np.linalg.norm(x - ylm0_fit[i, :, lm]) < 1e-6: + break + else: + inds.append((i, mvals[i])) + X.append(ylm0_fit[i, :, lm]) + if len(X) == 0: + continue + X = np.array(X).T + y = dylm_ref[igrad, :, lm] + # print(X.shape, y.shape) + beta = np.linalg.inv(X.T.dot(X) + 1e-10 * np.identity(X.shape[1])) + beta = beta.dot(X.T.dot(y)) + err = np.linalg.norm(X.dot(beta) - y) + assert_almost_equal(err, 0, 4) + print(l, m - l, inds, lm, np.round(beta, 6), err) + # print(gaunt_coeff[2, lm], err) + for lm in range(nlm): + c_xl = np.zeros((N, nlm)) + c_xl[:, lm] = 1 + dylm_pred = np.zeros((3, N, nlm)) + libcider.fill_sph_harm_deriv_coeff( + c_xl.ctypes.data_as(ctypes.c_void_p), + dylm_pred.ctypes.data_as(ctypes.c_void_p), + gaunt_coeff.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(N), + ctypes.c_int(lmax), + ) + dylm_pred = (dylm_pred * ylm0).sum(-1) + print(lm) + # print(r[:, 2]) + # print(ylm0[:, lm]) + # print(dylm_pred[2]) + # print(dylm_ref[2, :, lm]) + assert_almost_equal(dylm_pred[2], dylm_ref2[3, :, lm], 6) + assert_almost_equal(dylm_pred[0], dylm_ref2[1, :, lm], 6) + assert_almost_equal(dylm_pred[1], dylm_ref2[2, :, lm], 6) diff --git a/ciderpress/dft/xc_evaluator.py b/ciderpress/dft/xc_evaluator.py index 8e567a7..b8423f9 100644 --- a/ciderpress/dft/xc_evaluator.py +++ b/ciderpress/dft/xc_evaluator.py @@ -1,4 +1,478 @@ import numpy as np +import yaml +from interpolation.splines.eval_cubic_numba import ( + vec_eval_cubic_splines_G_1, + vec_eval_cubic_splines_G_2, + vec_eval_cubic_splines_G_3, + vec_eval_cubic_splines_G_4, +) + + +def get_vec_eval(grid, coeffs, X, N): + """ + Call the numba-accelerated spline evaluation routines from the + interpolation package. Also returns derivatives + Args: + grid: start and end points + number of grids in each dimension + coeffs: coefficients of the spline + X: coordinates to interpolate + N: dimension of the interpolation (between 1 and 4, inclusive) + """ + coeffs = np.expand_dims(coeffs, coeffs.ndim) + y = np.zeros((X.shape[0], 1)) + dy = np.zeros((X.shape[0], N, 1)) + a_, b_, orders = zip(*grid) + if N == 1: + vec_eval_cubic_splines_G_1(a_, b_, orders, coeffs, X, y, dy) + elif N == 2: + vec_eval_cubic_splines_G_2(a_, b_, orders, coeffs, X, y, dy) + elif N == 3: + vec_eval_cubic_splines_G_3(a_, b_, orders, coeffs, X, y, dy) + elif N == 4: + vec_eval_cubic_splines_G_4(a_, b_, orders, coeffs, X, y, dy) + else: + raise ValueError("invalid dimension N") + return np.squeeze(y, -1), np.squeeze(dy, -1) + + +class XCEvalSerializable: + def to_dict(self): + raise NotImplementedError + + @classmethod + def from_dict(cls, d): + raise NotImplementedError + + def dump(self, fname): + """ + Save the Evaluator to a file name fname as yaml format. + """ + state_dict = self.to_dict() + with open(fname, "w") as f: + yaml.dump(state_dict, f) + + @classmethod + def load(cls, fname): + """ + Load an instance of this class from yaml + """ + with open(fname, "r") as f: + state_dict = yaml.load(f, Loader=yaml.CLoader) + return cls.from_dict(state_dict) + + +class KernelEvalBase: + + mode = None + feature_list = None + _mul_basefunc = None + _add_basefunc = None + + @property + def N1(self): + raise NotImplementedError + + def get_descriptors(self, X0T): + """ + Compute and return transformed descriptor matrix X1. 
+ + Args: + rho: Density/Kinetic energy density representation. + X0 (nspin, N0, Nsamp): Raw features + nspin (1 or 2): Number of spins + + Returns: + X1 (Nsamp, N1) + """ + nspin, N0, Nsamp = X0T.shape + N1 = self.N1 + if self.mode == "SEP": + X1 = np.zeros((nspin, Nsamp, N1)) + for s in range(nspin): + self.feature_list.fill_vals_(X1[s].T, X0T[s]) + X1 = X1.reshape(nspin * Nsamp, N1) + elif self.mode == "NPOL": + X0T_sum = X0T.mean(0) + X1 = np.zeros((Nsamp, N1)) + self.feature_list.fill_vals_(X1.T, X0T_sum) + else: + raise NotImplementedError + return X1 + + def get_descriptors_with_mul(self, X0T, multiplier): + """ + Compute and return transformed descriptor matrix X1. + + Args: + rho: Density/Kinetic energy density representation. + X0 (nspin, N0, Nsamp): Raw features + nspin (1 or 2): Number of spins + + Returns: + X1 (Nsamp, N1) + """ + nspin, N0, Nsamp = X0T.shape + N1 = self.N1 + if self.mode == "SEP": + X1 = np.zeros((nspin, Nsamp, N1)) + for s in range(nspin): + self.feature_list.fill_vals_(X1[s].T, X0T[s]) + X1[s] *= multiplier[:, None] + X1 = X1.reshape(nspin * Nsamp, N1) + elif self.mode == "NPOL": + X0T_sum = X0T.mean(0) + X1 = np.zeros((Nsamp, N1)) + self.feature_list.fill_vals_(X1.T, X0T_sum) + X1[:] *= multiplier[:, None] + else: + raise NotImplementedError + return X1 + + def _baseline(self, X0T, base_func): + nspin, N0, Nsamp = X0T.shape + if self.mode == "SEP": + ms = [] + dms = [] + for s in range(nspin): + m, dm = base_func(X0T[s : s + 1]) + ms.append(m / nspin) + dms.append(dm / nspin) + return np.stack(ms), np.concatenate(dms, axis=0) + else: + raise NotImplementedError + + def multiplicative_baseline(self, X0T): + return self._baseline(X0T, self._mul_basefunc) + + def additive_baseline(self, X0T): + return self._baseline(X0T, self._add_basefunc) + + def apply_descriptor_grad(self, X0T, dfdX1): + """ + + Args: + X0T (nspin, N0, Nsamp): raw feature descriptors + dfdX1 (nspin * Nsamp, N1): derivative with respect to transformed + descriptors X1 + + Returns: + dfdX0T (nspin, N0, Nsamp): derivative with respect + to raw descriptors X0. 
+ """ + nspin, N0, Nsamp = X0T.shape + N1 = self.N1 + if self.mode == "SEP": + dfdX0T = np.zeros_like(X0T) + dfdX1 = dfdX1.reshape(nspin, Nsamp, N1) + for s in range(nspin): + self.feature_list.fill_derivs_(dfdX0T[s], dfdX1[s].T, X0T[s]) + elif self.mode == "NPOL": + dfdX0T = np.zeros_like(X0T) + self.feature_list.fill_derivs_(dfdX0T[0], dfdX1.T, X0T.mean(0)) + for s in range(1, nspin): + dfdX0T[s] = dfdX0T[0] + dfdX0T /= nspin + else: + raise NotImplementedError + return dfdX0T + + def apply_baseline(self, X0T, f, dfdX0T=None, add_base=True): + """ + + Args: + f: + dfdX0T: + add_base: + + Returns: + + """ + m, dm = self.multiplicative_baseline(X0T) + add_base = add_base and self.additive_baseline is not None + if add_base: + a, da = self.additive_baseline(X0T) + + res = f * m + if add_base: + res += a + if dfdX0T is not None: + dres = dfdX0T * m[:, np.newaxis] + f[:, np.newaxis] * dm + if add_base: + dres += da + return res, dres + return res + + +class FuncEvaluator: + def build(self, *args, **kwargs): + pass + + def __call__(self, X1, res=None, dres=None): + raise NotImplementedError + + +class KernelEvaluator(FuncEvaluator, XCEvalSerializable): + def __init__(self, kernel, X1ctrl, alpha): + self.X1ctrl = X1ctrl + self.kernel = kernel + self.alpha = alpha + + def __call__(self, X1, res=None, dres=None): + if res is None: + res = np.zeros(X1.shape[0]) + elif res.shape != X1.shape[:1]: + raise ValueError + if dres is None: + dres = np.zeros(X1.shape) + elif dres.shape != X1.shape: + raise ValueError + N = X1.shape[0] + dn = 2000 + for i0 in range(0, N, dn): + i1 = min(N, i0 + dn) + k, dk = self.kernel.k_and_deriv(X1[i0:i1], self.X1ctrl) + res[i0:i1] += k.dot(self.alpha) + dres[i0:i1] += np.einsum("gcn,c->gn", dk, self.alpha) + return res, dres + + +class SplineSetEvaluator(FuncEvaluator, XCEvalSerializable): + def __init__(self, scale, ind_sets, spline_grids, coeff_sets, const=0): + self.scale = scale + self.nterms = len(self.scale) + assert len(ind_sets) == self.nterms + self.ind_sets = ind_sets + assert len(spline_grids) == self.nterms + self.spline_grids = spline_grids + assert len(coeff_sets) == self.nterms + self.coeff_sets = coeff_sets + self.const = const + + def __call__(self, X1, res=None, dres=None): + """ + Note: This function adds to res and dres if they are passed, + rather than writing them from scratch. 
+ + Args: + X1: + res: + dres: + + Returns: + + """ + if res is None: + res = np.zeros(X1.shape[0]) + self.const + elif res.shape != X1.shape[:1]: + raise ValueError + else: + res[:] += self.const + if dres is None: + dres = np.zeros(X1.shape) + elif dres.shape != X1.shape: + raise ValueError + for t in range(self.nterms): + ind_set = self.ind_sets[t] + y, dy = get_vec_eval( + self.spline_grids[t], self.coeff_sets[t], X1[:, ind_set], len(ind_set) + ) + res[:] += y * self.scale[t] + dres[:, ind_set] += dy * self.scale[t] + return res, dres + + def to_dict(self): + return { + "scale": np.array(self.scale), + "ind_sets": self.ind_sets, + "spline_grids": self.spline_grids, + "coeff_sets": self.coeff_sets, + "const": self.const, + } + + @classmethod + def from_dict(cls, d): + return cls( + d["scale"], + d["ind_sets"], + d["spline_grids"], + d["coeff_sets"], + const=d["const"], + ) + + +class NNEvaluator(FuncEvaluator, XCEvalSerializable): + def __init__(self, model, device=None): + self.model = model + self.device = device + import torch + from torch.autograd import grad + + self.cuda_is_available = torch.cuda.is_available + self.device_init = torch.device + self.tensor_init = torch.tensor + self.eval_grad = grad + self.set_device(device=device) + + def set_device(self, device=None): + if device is None: + if self.cuda_is_available(): + device = "cuda:0" + else: + device = "cpu" + if isinstance(device, str): + device = self.device_init(device) + self.device = device + self.model = self.model.to(self.device) + self.model.eval() + + def build(self, *args, **kwargs): + self.set_device() + + def __call__(self, X1, res=None, dres=None): + if res is None: + res = np.zeros(X1.shape[0]) + elif res.shape != X1.shape[:1]: + raise ValueError + if dres is None: + dres = np.zeros(X1.shape) + elif dres.shape != X1.shape: + raise ValueError + X1_torch = self.tensor_init(X1, device=self.device, requires_grad=True) + self.model.zero_grad() + output = self.model(X1_torch) + output_grad = self.eval_grad(output.sum(), X1_torch, retain_graph=False)[0] + res[:] += output.cpu().detach().numpy() + dres[:] += output_grad.cpu().detach().numpy() + return res, dres + + +class GlobalLinearEvaluator(FuncEvaluator, XCEvalSerializable): + def __init__(self, consts): + self.consts = np.asarray(consts, dtype=np.float64, order="C") + + def __call__(self, X1, res=None, dres=None): + if res is None: + res = np.zeros(X1.shape[0]) + elif res.shape != X1.shape[:1]: + raise ValueError + if dres is None: + dres = np.zeros(X1.shape) + elif dres.shape != X1.shape: + raise ValueError + res[:] += X1.dot(self.consts) + dres[:] += self.consts + return res, dres + + +class MappedDFTKernel(KernelEvalBase, XCEvalSerializable): + """ + This class evaluates the XC term arising from a single DFTKernel + object, using one or a list of FuncEvaluator objects. 
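For the calling convention shared by these evaluators, a small sketch (not part of the patch) using GlobalLinearEvaluator: each evaluator returns (res, dres) and adds to those arrays in place when they are supplied:

    import numpy as np

    evaluator = GlobalLinearEvaluator(np.array([0.5, -0.25, 1.0]))
    X1 = np.random.default_rng(0).random((100, 3))   # 100 samples, 3 transformed features
    res, dres = evaluator(X1)                        # res: (100,), dres: (100, 3)
    res, dres = evaluator(X1, res=res, dres=dres)    # accumulates on top of prior values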
+ """ + + def __init__( + self, + fevals, + feature_list, + mode, + multiplicative_baseline, + additive_baseline=None, + ): + self.fevals = fevals + if not isinstance(self.fevals, list): + self.fevals = [self.fevals] + for feval in self.fevals: + if not isinstance(feval, FuncEvaluator): + raise ValueError + self.mode = mode + self.feature_list = feature_list + self._mul_basefunc = multiplicative_baseline + self._add_basefunc = additive_baseline + + @property + def N1(self): + return self.feature_list.nfeat + + def __call__(self, X0T, add_base=True, rhocut=0): + X1 = self.get_descriptors(X0T) + Nsamp_internal = X1.shape[0] + f = np.zeros(Nsamp_internal) + df = np.zeros((Nsamp_internal, self.N1)) + for feval in self.fevals: + feval(X1, f, df) + if self.mode == "SEP": + f = f.reshape(X0T.shape[0], -1) + dfdX0T = self.apply_descriptor_grad(X0T, df) + res, dres = self.apply_baseline(X0T, f, dfdX0T) + if rhocut > 0: + if self.mode == "SEP": + cond = X0T[:, 0] < rhocut + for s in range(X0T.shape[0]): + res[s][cond[s]] = 0.0 + dres[s][:, cond[s]] = 0.0 + else: + cond = X0T[:, 0].sum(0) < rhocut + res[..., cond] = 0.0 + dres[..., cond] = 0.0 + if self.mode == "SEP": + res = res.sum(0) + return res, dres + + def to_dict(self): + return { + "fevals": [fe.to_dict() for fe in self.fevals], + "feature_list": self.feature_list.to_dict(), + "mode": self.mode, + } + + @classmethod + def from_dict(cls, d): + raise NotImplementedError + + +class MappedFunctional: + def __init__(self, mapped_kernels, desc_params, libxc_baseline=None): + self.kernels = mapped_kernels + self.desc_params = desc_params + self.libxc_baseline = libxc_baseline + + @property + def desc_version(self): + return self.desc_params.version[0] + + @property + def vvmul(self): + return self.desc_params.vvmul + + @property + def a0(self): + return self.desc_params.a0 + + @property + def fac_mul(self): + return self.desc_params.fac_mul + + @property + def amin(self): + return self.desc_params.amin + + @property + def feature_list(self): + # TODO misleading because this is X functional only + return self.kernels[0].feature_list + + def set_baseline_mode(self, mode): + # TODO baseline can be GPAW or PySCF mode. + # Need to implement for more complicated XC. 
+ raise NotImplementedError + + def __call__(self, X0T, rhocut=0): + res, dres = 0, 0 + for kernel in self.kernels: + tmp, dtmp = kernel(X0T, rhocut=rhocut) + res += tmp + dres += dtmp + return res, dres class ModelWithNormalizer: diff --git a/ciderpress/gpaw/__init__.py b/ciderpress/gpaw/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ciderpress/lib/__init__.py b/ciderpress/lib/__init__.py index e69de29..744da5d 100644 --- a/ciderpress/lib/__init__.py +++ b/ciderpress/lib/__init__.py @@ -0,0 +1,7 @@ +import ctypes + +from ciderpress.lib.load import load_library + +c_double_p = ctypes.POINTER(ctypes.c_double) +c_int_p = ctypes.POINTER(ctypes.c_int) +c_null_ptr = ctypes.POINTER(ctypes.c_void_p) diff --git a/ciderpress/lib/mod_cider/cider_coefs.c b/ciderpress/lib/mod_cider/cider_coefs.c index 47ae8bb..271c5be 100644 --- a/ciderpress/lib/mod_cider/cider_coefs.c +++ b/ciderpress/lib/mod_cider/cider_coefs.c @@ -1,5 +1,4 @@ #include "fblas.h" -// #include "nr_cider_numint.h" #include #include #include diff --git a/ciderpress/pyscf/__init__.py b/ciderpress/pyscf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ciderpress/pyscf/analyzers.py b/ciderpress/pyscf/analyzers.py new file mode 100644 index 0000000..ce160a1 --- /dev/null +++ b/ciderpress/pyscf/analyzers.py @@ -0,0 +1,466 @@ +from abc import ABC, abstractmethod + +import numpy as np +from pyscf import dft, gto, lib, scf +from pyscf.dft.gen_grid import Grids + +from ciderpress.external.sgx_tools import get_jk_densities, sgx_fit + +CALC_TYPES = { + "RHF": scf.hf.RHF, + "UHF": scf.uhf.UHF, +} + + +def recursive_remove_none(obj): + if type(obj) == dict: + return {k: recursive_remove_none(v) for k, v in obj.items() if v is not None} + else: + return obj + + +def recursive_bytes_to_str(obj): + if isinstance(obj, dict): + return {k: recursive_bytes_to_str(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): + lst = [recursive_bytes_to_str(item) for item in obj] + if isinstance(obj, tuple): + lst = tuple(lst) + return lst + elif isinstance(obj, bytes): + return obj.decode("utf-8") + else: + return obj + + +class ElectronAnalyzer(ABC): + """ + A class for generating and storing data derived from a PySCF + electronic structure calculation, in particular distributions + on a real-space grid such as the density, exchange (correlation) + energy density, Coulomb energy density, etc. 
+ + Attributes: + + TODO + + """ + + _atype = None + + def __init__( + self, mol, dm, grids_level=3, mo_occ=None, mo_coeff=None, mo_energy=None + ): + if mo_occ is None and hasattr(dm, "mo_occ"): + self.mo_occ = dm.mo_occ + else: + self.mo_occ = mo_occ + if mo_coeff is None and hasattr(dm, "mo_coeff"): + self.mo_coeff = dm.mo_coeff + else: + self.mo_coeff = mo_coeff + if mo_energy is None and hasattr(dm, "mo_energy"): + self.mo_energy = dm.mo_energy + else: + self.mo_energy = mo_energy + self.dm = dm + self.mol = mol + self.mol.verbose = 3 + self.mol.build() + self.grids_level = grids_level + self.grids = Grids(self.mol) + self.grids.level = self.grids_level + self._data = {} + self.grids.build(with_non0tab=False) + + def keys(self): + return self._data.keys() + + def set(self, k, v): + self._data[k] = v + + def as_dict(self): + return { + "atype": self._atype, + "data": self._data, + "dm": self.dm, + "mo_occ": self.mo_occ, + "mo_coeff": self.mo_coeff, + "mo_energy": self.mo_energy, + "grids_level": self.grids_level, + "mol": gto.mole.pack(self.mol), + } + + @staticmethod + def from_dict(d): + args = [ + gto.mole.unpack(d["mol"]), + d["dm"], + ] + kwargs = { + "grids_level": d["grids_level"], + "mo_occ": d.get("mo_occ"), + "mo_coeff": d.get("mo_coeff"), + "mo_energy": d.get("mo_energy"), + } + atype = d.get("atype") + if atype is None: + cls = ElectronAnalyzer + elif atype == "RHF" or atype == "RKS": + cls = RHFAnalyzer + elif atype == "UHF" or atype == "UKS": + cls = UHFAnalyzer + else: + raise ValueError("Unrecognized analyzer type {}".format(atype)) + analyzer = cls(*args, **kwargs) + analyzer._data.update(d["data"]) + return analyzer + + def dump(self, fname): + """ + Dump self to an hdf5 file called fname. + + Args: + fname (str): Name of file to dump to + """ + h5dict = recursive_remove_none(self.as_dict()) + lib.chkfile.dump(fname, "analyzer", h5dict) + + @staticmethod + def load(fname): + """ + Load instance of cls from hdf5 + Args: + fname (str): Name of file from which to load + max_mem: See __init__ + """ + analyzer_dict = lib.chkfile.load(fname, "analyzer") + analyzer_dict = recursive_bytes_to_str(analyzer_dict) + return ElectronAnalyzer.from_dict(analyzer_dict) + + def get(self, name): + return self._data[name] + + def calculate_vxc(self, xcname, xcfunc=None, grids=None, xctype="MGGA"): + """ + Computes the XC potential matrix for a functional, + store it, and return it. + + Args: + xcname (str): XC functional + xcfunc (callable): XC function in case one + wants to use an XC functional not in libxc + grids (optional): Grid to use to calculate XC. + + Returns: + vxc (nspin x nao x nao): XC potential matrix + """ + if not isinstance(xcname, str): + raise ValueError("XC must be string") + if self._atype == "RHF": + calc = dft.RKS(self.mol) + elif self._atype == "UHF": + calc = dft.UKS(self.mol) + else: + raise NotImplementedError + calc.xc = xcname + if grids is not None: + calc.grids = grids + if xcfunc is not None: + calc.define_xc_(xcfunc, xctype=xctype) + vxc = calc.get_veff(dm=self.rdm1) + self._data["EXC_{}".format(xcname)] = vxc.exc + vxc = vxc - vxc.vj + self._data["VXC_{}".format(xcname)] = vxc + return vxc + + def calculate_vxc_on_mo(self, xcname, orbs=None, **kwargs): + """ + Compute contributions of XC potential to the eigenvalues. If + VXC_{xcname} is not in _data, calculate_vxc is called first. 
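As a hedged illustration (not part of the patch) of how these two methods might be driven once an analyzer exists (see the example at the end of this section), with "PBE" standing in for any libxc functional name:

    vxc = analyzer.calculate_vxc("PBE")          # stores "VXC_PBE" and "EXC_PBE"
    exc = analyzer.get("EXC_PBE")                # XC energy for the stored density
    eigs = analyzer.calculate_vxc_on_mo("PBE")   # per-orbital <phi_i|v_xc|phi_i> values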
+ + Args: + xcname (str): Name of XC functional (for libxc) + orbs: Orbital index dictionary for computing orbital derivatives, + in the format use for descriptors.get_descriptors. + **kwargs: extra arguments to pass to calculate_vxc, if it is called. + + Returns: + eigenvalue contributions. If orbs is provided, it will be in + the same format used by descritpors.get_descriptors. + Otherwise, it will be a numpy array with all the eigenvalue + contributions. + """ + if "VXC_{}".format(xcname) not in self._data.keys(): + vxc = self.calculate_vxc(xcname, **kwargs) + else: + vxc = self._data["VXC_{}".format(xcname)] + assert self.mo_coeff is not None + if orbs is None: + orbvals = vxc.dot(self.mo_coeff) + eigvals = np.einsum("...ui,ui->...i", self.mo_coeff, orbvals) + self._data["ORBXC_{}".format(xcname)] = eigvals + else: + from ciderpress.descriptors import get_labels_and_coeffs, unpack_eigvals + + labels, coeffs, en_list, sep_spins = get_labels_and_coeffs( + orbs, self.mo_coeff, self.mo_occ, self.mo_energy + ) + if self.mo_occ.ndim == 2: + orbvals = {} + for s in range(2): + c = np.array(coeffs[s]) + if len(c) == 0: + orbvals[s] = None + continue + orbvals[s] = np.array(c).dot(vxc[s]) + orbvals[s] = np.einsum("iu,iu->i", c, orbvals[s]) + else: + c = np.array(coeffs) + orbvals = c.dot(vxc) + orbvals = np.einsum("iu,iu->i", c, orbvals) + if sep_spins: + eigvals = ( + unpack_eigvals(orbs[0], labels[0], orbvals[0]), + unpack_eigvals(orbs[1], labels[1], orbvals[1]), + ) + elif self.mo_occ.ndim == 2: + eigvals = {} + for k in list(orbs.keys()): + eigvals[k] = {} + for s in range(2): + eigvals_tmp = unpack_eigvals(orbs, labels[s], orbvals[s]) + for k in list(orbs.keys()): + eigvals[k].update(eigvals_tmp[k]) + else: + eigvals = unpack_eigvals(orbs, labels, orbvals) + self._data["ORBXC_{}".format(xcname)] = eigvals + return self._data["ORBXC_{}".format(xcname)] + + @staticmethod + def from_calc(calc, grids_level=None, store_energy_orig=True): + """ + NOTE: This has side effects on calc, see notes below. + + Args: + calc: SCF object from PySCF + grids_level: Size of PySCF grids for XC integration + store_energy_orig: Whether to store original xc energy. + + Returns: + + """ + if isinstance(calc, scf.uhf.UHF): + cls = UHFAnalyzer + elif isinstance(calc, scf.hf.RHF): + cls = RHFAnalyzer + else: + raise ValueError("Must be HF calculation") + + if grids_level is None: + if isinstance(calc, dft.rks.KohnShamDFT): + grids_level = calc.grids.level + else: + grids_level = 3 + analyzer = cls( + calc.mol, + calc.make_rdm1(), + grids_level=grids_level, + mo_occ=calc.mo_occ, + mo_coeff=calc.mo_coeff, + mo_energy=calc.mo_energy, + ) + if store_energy_orig and isinstance(calc, dft.rks.KohnShamDFT): + # save initial exc for reference + if (calc.scf_summary.get("exc") is None) or ( + grids_level != calc.grids.level + ): + # NOTE this should leave calc almost the same + # as it started, but might make some + # minor changes if there are non-default + # settings/screenings on the grids. 
Also + # scf_summary will be overwritten + old_level = calc.grids.level + calc.grids.level = grids_level + calc.grids.build() + if hasattr(calc, "with_df") and hasattr(calc.with_df, "grids"): + calc.with_df.build() + e_tot = calc.energy_tot(analyzer.dm) + calc.grids.level = old_level + calc.grids.build() + else: + e_tot = calc.e_tot + analyzer._data["xc_orig"] = calc.xc + analyzer._data["exc_orig"] = calc.scf_summary["exc"] + analyzer._data["e_tot_orig"] = e_tot + return analyzer + + def get_rho_data(self, overwrite=False): + if "rho_data" in self.keys() and not overwrite: + return self.get("rho_data") + ni = dft.numint.NumInt() + spinpol = self.dm.ndim == 3 + if spinpol: + rho_list_a = [] + rho_list_b = [] + else: + rho_list = [] + for ao, non0, weight, coord in ni.block_loop(self.mol, self.grids, deriv=2): + if spinpol: + rho_list_a.append(ni.eval_rho(self.mol, ao, self.dm[0], xctype="MGGA")) + rho_list_b.append(ni.eval_rho(self.mol, ao, self.dm[1], xctype="MGGA")) + else: + rho_list.append(ni.eval_rho(self.mol, ao, self.dm, xctype="MGGA")) + if spinpol: + rhoa = np.concatenate(rho_list_a, axis=-1) + rhob = np.concatenate(rho_list_b, axis=-1) + self._data["rho_data"] = np.stack([rhoa, rhob], axis=0) + else: + self._data["rho_data"] = np.concatenate(rho_list, axis=-1) + return self._data["rho_data"] + + @abstractmethod + def perform_full_analysis(self): + pass + + @property + def rdm1(self): + return self.dm + + @property + def rho_data(self): + return self.get("rho_data") + + +class RHFAnalyzer(ElectronAnalyzer): + + _atype = "RHF" + + def get_xc(self, xcname): + """ + Function for accessing XC functional data from _data + """ + if xcname in self._data["xc_list"]: + return self._data["xc"][xcname] + else: + raise ValueError("Data not computed for xc functional") + + def get_rs(self, term, omega, tol=1e-9): + """ + Function for accessing range-separated data from _data + Finds match based on a tolerance, in case rounding + issues arise with floats. 
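+
+        Example (a minimal usage sketch; assumes the range-separated
+        data for omega=0.3 has already been generated):
+
+            analyzer.get_ee_energy_density_rs(0.3)
+            ex_rs = analyzer.get_rs("ex", 0.3)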
+ """ + if not (term in ["ee", "ha", "ex"]): + raise ValueError("Term must be ee, ha, or ex") + name = "{}_energy_density_rs".format(term) + for ind, omega_tst in enumerate(self._data["omega_list"]): + if abs(omega_tst - omega) < tol: + return self._data[name][ind] + else: + raise RuntimeError("RS data not found for omega={}".format(omega)) + + def get_xc_energy(self, xcname): + """ + Store and return the XC energy for the functional given by xcname + """ + if self._data.get("xc") is None: + self._data["xc_list"] = [] + self._data["xc"] = {} + ni = dft.numint.NumInt() + if isinstance(self, UHFAnalyzer): + _, exc, _ = ni.nr_uks( + self.mol, self.grids, xcname, self.dm, relativity=0, hermi=1 + ) + else: + _, exc, _ = ni.nr_rks( + self.mol, self.grids, xcname, self.dm, relativity=0, hermi=1 + ) + self._data["xc_list"].append(xcname) + self._data["xc"][xcname] = exc + return exc + + def get_ha_energy_density(self): + self.get_ee_energy_density() + return self._data["ha_energy_density"] + + def get_ex_energy_density(self): + self.get_ee_energy_density() + return self._data["ex_energy_density"] + + get_fx_energy_density = get_ex_energy_density + + def _get_ej_ek(self, omega=None): + calc_tmp = sgx_fit(scf.RHF(self.mol)) + calc_tmp.build() + calc_tmp.with_df.grids = self.grids + if omega is None: + ej, ek = get_jk_densities(calc_tmp.with_df, self.dm) + else: + with calc_tmp.mol.with_range_coulomb(omega): + ej, ek = get_jk_densities(calc_tmp.with_df, self.dm) + return ej, ek + + def _get_ee(self, omega=None): + ej, ek = self._get_ej_ek(omega) + ej = ej[0] + ek = 0.5 * ek[0] + return ej, ek, ej + ek + + def get_ee_energy_density(self): + """ + Returns the sum of E_{Ha} and E_{X}, i.e. the Coulomb repulsion + energy of the HF or KS Slater determinant. + """ + ej, ek, ee = self._get_ee() + self._data["ha_energy_density"] = ej + self._data["ex_energy_density"] = ek + self._data["ee_energy_density"] = ee + return self._data["ee_energy_density"] + + def get_ee_energy_density_rs(self, omega): + if self._data.get("omega_list") is None: + self._data["omega_list"] = [] + self._data["ee_energy_density_rs"] = [] + self._data["ha_energy_density_rs"] = [] + self._data["ex_energy_density_rs"] = [] + ej, ek, ee = self._get_ee(omega) + self._data["omega_list"].append(omega) + self._data["ha_energy_density_rs"].append(ej) + self._data["ex_energy_density_rs"].append(ek) + self._data["ee_energy_density_rs"].append(ee) + return self._data["ee_energy_density_rs"][-1] + + def perform_full_analysis(self): + self.get_rho_data() + self.get_ee_energy_density() + + @property + def atype(self): + return self._atype + + +RKSAnalyzer = RHFAnalyzer + + +class UHFAnalyzer(RHFAnalyzer): + + _atype = "UHF" + + def _get_ej_ek(self, omega=None): + calc_tmp = sgx_fit(scf.UHF(self.mol)) + calc_tmp.build() + calc_tmp.with_df.grids = self.grids + if omega is None: + ej, ek = get_jk_densities(calc_tmp.with_df, self.dm) + else: + with calc_tmp.mol.with_range_coulomb(omega): + ej, ek = get_jk_densities(calc_tmp.with_df, self.dm) + return ej, ek + + def _get_ee(self, omega=None): + ej, ek = self._get_ej_ek(omega) + return ej, ek, ej + ek + + +UKSAnalyzer = UHFAnalyzer diff --git a/ciderpress/pyscf/debug_numint.py b/ciderpress/pyscf/debug_numint.py new file mode 100644 index 0000000..d663fd1 --- /dev/null +++ b/ciderpress/pyscf/debug_numint.py @@ -0,0 +1,107 @@ +import numpy as np +from pyscf.dft.gen_grid import Grids +from pyscf.dft.numint import NumInt + +from ciderpress.dft.debug_numint import get_get_exponent, get_nonlocal_features + + 
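+# NOTE: get_nldf_numint below evaluates the NLDF features by direct
+# numerical integration over an inner quadrature grid. It is a slow
+# reference implementation meant for debugging/validating the fast
+# convolution-based NLDF evaluation rather than for production use.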
+def get_nldf_numint( + mol, grids, dms, gg_kwargs, vv_gg_kwargs, version="i", inner_grids_level=3 +): + if dms.ndim == 2: + return _get_nldf_numint( + mol, + grids, + dms, + gg_kwargs, + vv_gg_kwargs, + version=version, + inner_grids_level=inner_grids_level, + )[None, :] + else: + assert dms.shape[0] == 2 + dms = dms * 2 + res0 = _get_nldf_numint( + mol, + grids, + dms[0], + gg_kwargs, + vv_gg_kwargs, + version=version, + inner_grids_level=inner_grids_level, + ) + res1 = _get_nldf_numint( + mol, + grids, + dms[1], + gg_kwargs, + vv_gg_kwargs, + version=version, + inner_grids_level=inner_grids_level, + ) + return np.stack([res0, res1]) + + +def _get_nldf_numint( + mol, grids, dms, gg_kwargs, vv_gg_kwargs, version="i", inner_grids_level=3 +): + ni = NumInt() + make_rho, nset, nao = ni._gen_rho_evaluator(mol, dms, 1, False) + max_memory = 2000 + assert nset == 1 + + get_exponent_r = get_get_exponent(gg_kwargs) + get_exponent_rp = get_get_exponent(vv_gg_kwargs) + + assert nset == 1 + smgrids = Grids(mol) + smgrids.level = inner_grids_level + smgrids.prune = None + smgrids.build() + + nfeat = 12 if version == "i" else 4 + desc = np.empty((0, nfeat)) + ao_deriv = 1 + vvrho = np.empty([nset, 5, 0]) + vvweight = np.empty([nset, 0]) + vvcoords = np.empty([nset, 0, 3]) + for ao, mask, weight, coords in ni.block_loop( + mol, smgrids, nao, ao_deriv, max_memory + ): + rhotmp = np.empty([0, 5, weight.size]) + weighttmp = np.empty([0, weight.size]) + coordstmp = np.empty([0, weight.size, 3]) + for idm in range(nset): + rho = make_rho(idm, ao, mask, "MGGA") + rho = np.expand_dims(rho, axis=0) + rhotmp = np.concatenate((rhotmp, rho), axis=0) + weighttmp = np.concatenate( + (weighttmp, np.expand_dims(weight, axis=0)), + axis=0, + ) + coordstmp = np.concatenate( + (coordstmp, np.expand_dims(coords, axis=0)), + axis=0, + ) + vvrho = np.concatenate((vvrho, rhotmp), axis=2) + vvweight = np.concatenate((vvweight, weighttmp), axis=1) + vvcoords = np.concatenate((vvcoords, coordstmp), axis=1) + aow = None + for ao, mask, weight, coords in ni.block_loop( + mol, grids, nao, ao_deriv, max_memory + ): + aow = np.ndarray(ao[0].shape, order="F", buffer=aow) + for idm in range(nset): + rho = make_rho(idm, ao, mask, "MGGA") + feat = get_nonlocal_features( + rho, + coords, + vvrho[idm], + vvweight[idm], + vvcoords[idm], + get_exponent_r, + get_exponent_rp, + version=version, + ) + desc = np.append(desc, feat, axis=0) + return desc diff --git a/ciderpress/pyscf/descriptors.py b/ciderpress/pyscf/descriptors.py new file mode 100644 index 0000000..476a13c --- /dev/null +++ b/ciderpress/pyscf/descriptors.py @@ -0,0 +1,478 @@ +""" +This module is for fast evaluation of features and their derivatives with +respect to orbital occupations and rotations. Only for b and d features +""" + +import numpy as np +from pyscf import lib +from pyscf.dft.numint import NumInt + +from ciderpress.dft.settings import ( + FracLaplSettings, + NLDFSettings, + SDMXBaseSettings, + SemilocalSettings, + dalpha, + ds2, + get_alpha, + get_s2, +) +from ciderpress.pyscf.analyzers import UHFAnalyzer +from ciderpress.pyscf.frac_lapl import FLNumInt, FracLaplPlan +from ciderpress.pyscf.gen_cider_grid import CiderGrids +from ciderpress.pyscf.nldf_convolutions import DEFAULT_CIDER_LMAX, PyscfNLDFGenerator +from ciderpress.pyscf.xedsadm import EXXSphGenerator + +NLDF_VERSION_LIST = ["i", "j", "ij", "k"] + + +def get_descriptors(analyzer, settings, orbs=None, **kwargs): + """ + Get the descriptors for version b or d (l=0 only). 
Analyzer must + contain mo_occ, mo_energy, and mo_coeff attributes. + Can get feature derivatives with respect to orbital occupation using orbs. + Spin polarization is determined automatically from the analyzer class type. + + orbs format: + orbs is a dictionary of lists or (for a spin-polarized system) a + 2-tuple of dictionary of lists. Each key is either 'B', 'O', or 'U', + with the value being a list of orbital indexes with respect to which to + compute occupation derivatives of the features. 'B' indexes up from the + bottom, i.e. the lowest-energy core states. 'O' indexes down from the + highest occupied molecular orbital. 'U' indexes up from the lowest + unoccupied molecular orbital. 0-indexing is used in all cases. + For the spin-polarized case, if orbs is a single dictionary rather than + a 2-tuple, orbital indexing is performed over the combined set of + orbitals. + + Args: + analyzer (RHFAnalyzer or UHFAnalyzer): analyzer object containing the + desired molecule and density matrix/orbitals for which to compute + features and their derivatives. + version: 'b', 'd', or 'l', feature version to evaluate. 'l' stands + for local and computes [rho, drho_x, drho_y, drho_z, tau] + orbs: Orbital index dictionary for computing orbital derivatives. + **kwargs: Any settings one wants to change from the default for the + PyscfNLDFGenerator object. + + Returns: + desc (np.ndarray (nspin, nfeat, ngrid)): Descriptors for the system + ddesc (dict(str : dict(int : array(nfeat, ngrid)))) or (dict, dict): + Derivatives of the features with respect to the orbitals given by + orbs, stored in a nested dictionary first by indexing style + (B, O, or U) and then by an orbital index. If orbs was a 2-tuple, + ddesc is a 2-tuple. Index format is the same as orbs, so a + U-indexed orbital in orbs is U-indexed in ddesc. If calculation + was spin-polarized but orbs was not spin-separated, each value in + the ddesc dictionary is a 2-tuple (spin, deriv_array) + eigvals (dict(str : dict(int : float)) or (dict, dict): + Eigenvalues of the relevant orbitals. These are important as + baseline energy derivatives, assuming the baseline functional + is the same as the reference functional for the eigenvalues. 
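+
+    Example (a minimal sketch; assumes analyzer is an RHFAnalyzer built
+    from a converged calculation):
+
+        # local/semilocal descriptors only
+        desc = get_descriptors(analyzer, "l")
+        # also request occupation derivatives with respect to the HOMO
+        desc, ddesc, eigvals = get_descriptors(analyzer, "l", orbs={"O": [0]})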
+ """ + spinpol = isinstance(analyzer, UHFAnalyzer) + mo_occ = analyzer.mo_occ + mo_energy = analyzer.mo_energy + mo_coeff = analyzer.mo_coeff + if orbs is None: + labels, en_list, sep_spins = None, None, None + if spinpol: + coeffs = [None, None] + else: + coeffs = None + else: + labels, coeffs, en_list, sep_spins = get_labels_and_coeffs( + orbs, mo_coeff, mo_occ, mo_energy + ) + + if isinstance(settings, str): + if settings != "l": + raise ValueError("Settings must be Settings object or letter l") + my_desc_getter = _density_getter + elif isinstance(settings, SemilocalSettings): + my_desc_getter = _sl_desc_getter + elif isinstance(settings, NLDFSettings): + my_desc_getter = _nldf_desc_getter + elif isinstance(settings, FracLaplSettings): + my_desc_getter = _fl_desc_getter + elif isinstance(settings, SDMXBaseSettings): + my_desc_getter = _sadm_desc_getter + else: + raise ValueError("Unsupported settings") + + if spinpol: + desc = [] + for s in range(2): + desc.append( + my_desc_getter( + analyzer.mol, + analyzer.grids, + 2 * analyzer.rdm1[s], + settings, + coeffs=coeffs[s], + **kwargs + ) + ) + desc_a, desc_b = desc + if orbs is not None: + desc_a, deriv_arr_a = desc_a + desc_b, deriv_arr_b = desc_b + deriv_arr_a *= 2.0 + deriv_arr_b *= 2.0 + if sep_spins: + ddesc_a, eigvals_a = unpack_feature_derivs( + orbs[0], labels[0], deriv_arr_a, en_list[0] + ) + ddesc_b, eigvals_b = unpack_feature_derivs( + orbs[1], labels[1], deriv_arr_b, en_list[1] + ) + return ( + np.stack([desc_a, desc_b]), + (ddesc_a, ddesc_b), + (eigvals_a, eigvals_b), + ) + else: + deriv_arr = [deriv_arr_a, deriv_arr_b] + ddesc = {} + eigvals = {} + for k in list(orbs.keys()): + ddesc[k] = {} + eigvals[k] = {} + for s in range(2): + ddesc_tmp, eigvals_tmp = unpack_feature_derivs( + orbs, labels[s], deriv_arr[s], en_list[s] + ) + for k in list(orbs.keys()): + ddesc[k].update( + {kk: (s, vv) for kk, vv in ddesc_tmp[k].items()} + ) + eigvals[k].update( + {kk: (s, vv) for kk, vv in eigvals_tmp[k].items()} + ) + return np.stack([desc_a, desc_b]), ddesc, eigvals + else: + return np.stack([desc_a, desc_b]) + else: + desc = my_desc_getter( + analyzer.mol, + analyzer.grids, + analyzer.rdm1, + settings, + coeffs=coeffs, + **kwargs + ) + if orbs is not None: + desc, deriv_arr = desc + ddesc, eigvals = unpack_feature_derivs(orbs, labels, deriv_arr, en_list) + return desc[None, :], ddesc, eigvals + else: + return desc[None, :] + + +def unpack_feature_derivs(orbs, labels, deriv_arr, en_list): + ddesc = {} + eigvals = {} + for k in list(orbs.keys()): + ddesc[k] = {} + eigvals[k] = {} + for n, (k, iorb) in enumerate(labels): + ddesc[k][iorb] = deriv_arr[n].copy() + eigvals[k][iorb] = en_list[n] + return ddesc, eigvals + + +def unpack_eigvals(orbs, labels, en_list): + eigvals = {} + for k in list(orbs.keys()): + eigvals[k] = {} + for n, (k, iorb) in enumerate(labels): + eigvals[k][iorb] = en_list[n] + return eigvals + + +def _get_labels_and_coeffs(orbs, mo_coeff, mo_occ, mo_energy, spins=None): + index_spin = spins is not None + if index_spin: + labels = ([], []) + coeff_list = ([], []) + en_list = ([], []) + else: + labels = [] + coeff_list = [] + en_list = [] + for k, v in orbs.items(): + if k == "O": + inds = (mo_occ > 0.5).nonzero()[0][::-1] + elif k == "U": + inds = (mo_occ <= 0.5).nonzero()[0] + else: + inds = np.arange(mo_occ.size) + mo_sub = mo_coeff[:, inds] + en_sub = mo_energy[inds] + if index_spin: + spin_sub = spins[inds] + else: + spin_sub = None + for iorb in v: + l = (k, iorb) + c = mo_sub[:, iorb] + e = en_sub[iorb] + if 
index_spin: + s = spin_sub[iorb] + labels[s].append(l) + coeff_list[s].append(c) + en_list[s].append(e) + else: + labels.append(l) + coeff_list.append(c) + en_list.append(e) + return labels, coeff_list, en_list + + +def get_labels_and_coeffs(orbs, mo_coeff, mo_occ, mo_energy): + spinpol = mo_occ.ndim == 2 + norb = mo_occ.shape[-1] + sep_spins = False + if spinpol: + sep_spins = (isinstance(orbs, tuple) or isinstance(orbs, list)) and len( + orbs + ) == 2 + if sep_spins: + labels_a, coeffs_a, en_list_a = _get_labels_and_coeffs( + orbs[0], mo_coeff[0], mo_occ[0], mo_energy[0] + ) + labels_b, coeffs_b, en_list_b = _get_labels_and_coeffs( + orbs[1], mo_coeff[1], mo_occ[1], mo_energy[1] + ) + labels = (labels_a, labels_b) + coeffs = (coeffs_a, coeffs_b) + en_list = (en_list_a, en_list_b) + else: + assert isinstance(orbs, dict) + spins = np.append( + np.zeros(norb, dtype=np.int32), + np.ones(norb, dtype=np.int32), + ) + mo_occ = mo_occ.flatten() + mo_energy = mo_energy.flatten() + mo_coeff = mo_coeff.transpose(1, 0, 2).reshape(norb, 2 * norb) + inds = np.argsort(mo_energy) + spins = np.ascontiguousarray(spins[inds]) + mo_occ = np.ascontiguousarray(mo_occ[inds]) + mo_energy = np.ascontiguousarray(mo_energy[inds]) + mo_coeff = np.ascontiguousarray(mo_coeff[:, inds]) + labels, coeffs, en_list = _get_labels_and_coeffs( + orbs, mo_coeff, mo_occ, mo_energy, spins=spins + ) + else: + labels, coeffs, en_list = _get_labels_and_coeffs( + orbs, mo_coeff, 0.5 * mo_occ, mo_energy + ) + return labels, coeffs, en_list, sep_spins + + +def get_full_rho(ni, mol, dms, grids, xctype): + hermi = 1 + max_memory = 2000 + make_rho, nset, nao = ni._gen_rho_evaluator(mol, dms, hermi, False, grids) + ao_deriv = 1 + Nrhofeat = 4 if xctype == "GGA" else 5 + rho_full = np.zeros( + (nset, Nrhofeat, grids.weights.size), dtype=np.float64, order="C" + ) + ip0 = 0 + for ao, mask, weight, coords in ni.block_loop( + mol, grids, nao, ao_deriv, max_memory + ): + mask = None # TODO remove once screening is fixed + ip1 = ip0 + weight.size + for idm in range(nset): + rho = make_rho(idm, ao, mask, xctype) + rho_full[idm, :, ip0:ip1] = rho + ip0 = ip1 + return rho_full + + +def get_full_rho_with_fl(ni, mol, dms, grids): + xctype = "MGGA" + hermi = 1 + max_memory = 2000 + make_rho, nset, nao = ni._gen_rho_evaluator(mol, dms, hermi, False, grids) + ao_deriv = 1 + Nrhofeat = ni.plan.settings.nrho + 5 + rho_full = np.zeros( + (nset, Nrhofeat, grids.weights.size), dtype=np.float64, order="C" + ) + ip0 = 0 + for ao, mask, weight, coords in ni.block_loop( + mol, grids, nao, ao_deriv, max_memory + ): + mask = None # TODO remove once screening is fixed + ip1 = ip0 + weight.size + for idm in range(nset): + rho = make_rho(idm, ao, mask, xctype) + rho_full[idm, :, ip0:ip1] = rho + ip0 = ip1 + return rho_full + + +def get_mo_densities(ni, mol, mo_vecs, grids, xctype): + nmo = mo_vecs.shape[0] + mo_coeff = np.ascontiguousarray(mo_vecs[:, :, np.newaxis]) + mo_occ = np.ones((nmo, 1)) + dms = np.empty((nmo, mol.nao_nr(), 0)) + dms = lib.tag_array(dms, mo_coeff=mo_coeff, mo_occ=mo_occ) + return get_full_rho(ni, mol, dms, grids, xctype) + + +def get_mo_densities_with_fl(ni, mol, mo_vecs, grids): + nmo = mo_vecs.shape[0] + mo_coeff = np.ascontiguousarray(mo_vecs[:, :, np.newaxis]) + mo_occ = np.ones((nmo, 1)) + dms = np.empty((nmo, mol.nao_nr(), 0)) + dms = lib.tag_array(dms, mo_coeff=mo_coeff, mo_occ=mo_occ) + return get_full_rho_with_fl(ni, mol, dms, grids) + + +def _density_getter(mol, pgrids, dms, nldf_settings, coeffs=None, **kwargs): + ni = NumInt() + 
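+    # Evaluate [rho, grad_x, grad_y, grad_z, tau] on the prediction grid;
+    # index [0] selects the single density-matrix set returned by get_full_rho.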
desc = get_full_rho(ni, mol, dms, pgrids, "MGGA")[0] + if coeffs is not None: + if len(coeffs) == 0: + return desc, 0 + coeffs = np.array(coeffs) + ddesc = get_mo_densities(ni, mol, coeffs, pgrids, "MGGA") + return desc, ddesc + else: + return desc + + +def _sl_desc_getter(mol, pgrids, dms, settings, coeffs=None, **kwargs): + ni = NumInt() + xctype = "MGGA" + prho = get_full_rho(ni, mol, dms, pgrids, xctype)[0] + psigma = np.einsum("xg,xg->g", prho[1:4], prho[1:4]) + if settings.mode != "npa": + raise NotImplementedError + p = get_s2(prho[0], psigma) + dpdn, dpdsigma = ds2(prho[0], psigma) + alpha = get_alpha(prho[0], psigma, prho[4]) + dalpha_rho, dalpha_sigma, dalpha_tau = dalpha(prho[0], psigma, prho[4]) + nfeat = settings.nfeat + assert nfeat == 3 + feat_ig = np.stack([prho[0], p, alpha]) + + if coeffs is None: + return feat_ig + if len(coeffs) == 0: + return feat_ig, 0 + + coeffs = np.array(coeffs) + nmo = coeffs.shape[0] + dfeat_jig = np.empty((nmo, nfeat, prho.shape[-1])) + dprho_dphi = get_mo_densities(ni, mol, coeffs, pgrids, xctype) + dp_jg = dpdn * dprho_dphi[:, 0] + dalpha_jg = dalpha_rho * dprho_dphi[:, 0] + dalpha_tau * dprho_dphi[:, 4] + for imo in range(nmo): + dsigma_dphi = 2 * np.einsum("xg,xg->g", dprho_dphi[imo, 1:4], prho[1:4]) + dp_jg[imo] += dpdsigma * dsigma_dphi + dalpha_jg[imo] += dalpha_sigma * dsigma_dphi + dfeat_jig[:, 0, :] = dprho_dphi[:, 0] + dfeat_jig[:, 1, :] = dp_jg + dfeat_jig[:, 2, :] = dalpha_jg + return feat_ig, dfeat_jig + + +def _nldf_desc_getter( + mol, + pgrids, + dms, + nldf_settings, + coeffs=None, + inner_grids_level=3, + inner_grids=None, + **kwargs +): + kwargs["lmax"] = kwargs.get("lmax") or DEFAULT_CIDER_LMAX + kwargs["interpolator_type"] = "train_gen" + if inner_grids is None: + grids = CiderGrids(mol, lmax=kwargs["lmax"]) + grids.level = inner_grids_level + grids.build(with_non0tab=False) + else: + grids = inner_grids + nldf_generator = PyscfNLDFGenerator.from_mol_and_settings( + mol, grids.grids_indexer, 1, nldf_settings, **kwargs + ) + nldf_generator.interpolator.set_coords(pgrids.coords) + ni = NumInt() + xctype = "MGGA" + + rho = get_full_rho(ni, mol, dms, grids, xctype)[0] + prho = get_full_rho(ni, mol, dms, pgrids, xctype)[0] + if coeffs is not None and len(coeffs) > 0: + coeffs = np.array(coeffs) + drho_dphi = get_mo_densities(ni, mol, coeffs, grids, xctype) + dprho_dphi = get_mo_densities(ni, mol, coeffs, pgrids, xctype) + else: + drho_dphi = np.empty(0) + dprho_dphi = np.empty(0) + + res = nldf_generator.get_features_and_occ_derivs(rho, drho_dphi, prho, dprho_dphi) + if coeffs is None: + return res[0] + elif len(coeffs) > 0: + return res + else: + return res[0], 0 + + +def _fl_desc_getter(mol, pgrids, dms, flsettings, coeffs=None, **kwargs): + plan = FracLaplPlan(flsettings, 1) + ni = FLNumInt(plan) + desc = get_full_rho_with_fl(ni, mol, dms, pgrids) + feat = plan.get_feat(desc, make_l1_data_copy=False) + if coeffs is not None: + if len(coeffs) == 0: + return feat[0], 0 + coeffs = np.array(coeffs) + ddesc = get_mo_densities_with_fl(ni, mol, coeffs, pgrids) + dfeat = plan.get_occd(desc[0], ddesc) + return feat[0], dfeat + else: + return feat[0] + + +def _sadm_desc_getter(mol, pgrids, dm, settings, coeffs=None, **kwargs): + exxgen = EXXSphGenerator.from_settings_and_mol(settings, 1, mol, **kwargs) + ni = NumInt() + ngrids = pgrids.coords.shape[0] + nfeat = settings.nfeat + desc = np.zeros((nfeat, ngrids)) + if coeffs is None: + ddesc = 0 + else: + coeffs = np.asarray(coeffs) + ddesc = np.zeros((coeffs.shape[0], nfeat, ngrids)) + 
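+    # Evaluate the SDMX features in blocks of grid points to bound memory
+    # usage; the per-block budget below is reduced according to the number
+    # of interpolation exponents (nalpha) in the plan.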
max_memory = 2000 + nao = mol.nao_nr() + ip0 = 0 + maxmem = int(max_memory / exxgen.plan.nalpha) + 1 + if settings.n1terms > 0: + maxmem //= 4 + maxmem += 1 + for ao, mask, weight, coords in ni.block_loop(mol, pgrids, nao, 0, maxmem): + mask = None # TODO remove once screening is fixed + ip1 = ip0 + weight.size + if coeffs is None or len(coeffs) == 0: + desc[:, ip0:ip1] = exxgen.get_features(dm, mol, coords) + else: + desc[:, ip0:ip1], ddesc[:, :, ip0:ip1] = exxgen.get_feat_and_occd( + dm, coeffs, mol, coords + ) + ip0 = ip1 + if coeffs is None: + return desc + elif len(coeffs) == 0: + return desc, 0 + return desc, ddesc diff --git a/ciderpress/pyscf/dft.py b/ciderpress/pyscf/dft.py new file mode 100644 index 0000000..42edb99 --- /dev/null +++ b/ciderpress/pyscf/dft.py @@ -0,0 +1,214 @@ +import joblib +import yaml +from pyscf import lib +from pyscf.dft.gen_grid import Grids + +from ciderpress.dft.xc_evaluator import MappedXC +from ciderpress.pyscf.gen_cider_grid import CiderGrids +from ciderpress.pyscf.nldf_convolutions import PySCFNLDFInitializer +from ciderpress.pyscf.numint import CiderNumInt, NLDFNLOFNumInt, NLDFNumInt, NLOFNumInt + +# from ciderpress.pyscf.xedsadm import PySCFSDMXInitializer +from ciderpress.pyscf.sdmx import PySCFSDMXInitializer + + +def make_cider_calc( + ks, + mlfunc, + xmix=1.0, + xc=None, + xkernel=None, + ckernel=None, + mlfunc_format=None, + nlc_coeff=None, + nldf_init=None, + sdmx_init=None, + rhocut=None, +): + """ + Decorate the PySCF DFT object ks with a CIDER functional mlfunc. + If xc, xkernel, ckernel, and xmix are not specified, + the equivalent of HF with CIDER in place of EXX is performed. + The XC energy is + + If CIDER is X only: + E_xc = xmix * E_x^CIDER + (1-xmix) * xkernel + ckernel + xc + If CIDER is X + C: + E_xc = xmix * E_xc^CIDER + (1-xmix) * (xkernel + ckernel) + xc + + NOTE: Only GGA-level XC functionals can be used with GGA-level + (orbital-independent) CIDER functionals currently. + + Args: + ks (pyscf.dft.KohnShamDFT): DFT object + mlfunc (MappedXC or str): CIDER exchange functional or file name + xmix (float): Fraction of CIDER exchange used. + xc (str or None): If specified, this semi-local XC code is evaluated + and added to the total XC energy. + xkernel (str or None): Semi-local X code in libxc. Scaled by (1-xmix). + ckernel (str or None): Semi-local C code in libxc. + mlfunc_format (str or None): 'joblib' or 'yaml', specifies the format + of mlfunc if it is a string corresponding to a file name. + If unspecified, infer from file extension and raise error + if file type cannot be determined. + nlc_coeff (tuple or None): + VV10 coefficients. If None, VV10 term is not evaluated. + nldf_init (PySCFNLDFInitializer) + sdmx_init (PySCFSDMXInitializer) + rhocut (float) + + Returns: + A decorated Kohn-Sham object for performing a CIDER calculation. 
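+
+    Example (a minimal sketch; the functional file name below is a
+    placeholder for a trained CIDER functional in yaml or joblib format):
+
+        from pyscf import gto, dft
+        mol = gto.M(atom="F 0 0 0; F 0 0 1.42", basis="def2-tzvp")
+        ks = dft.RKS(mol)
+        ks = make_cider_calc(
+            ks,
+            "CIDER_FUNCTIONAL.yaml",
+            xmix=0.25,
+            xkernel="GGA_X_PBE",
+            ckernel="GGA_C_PBE",
+        )
+        e_tot = ks.kernel()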
+ """ + mlfunc = load_cider_model(mlfunc, mlfunc_format) + ks.xc = get_slxc_settings(xc, xkernel, ckernel, xmix) + new_ks = _CiderKS( + ks, + mlfunc, + xmix=xmix, + nldf_init=nldf_init, + sdmx_init=sdmx_init, + rhocut=rhocut, + nlc_coeff=nlc_coeff, + ) + return lib.set_class(new_ks, (_CiderKS, ks.__class__)) + + +def load_cider_model(mlfunc, mlfunc_format): + if isinstance(mlfunc, str): + if mlfunc_format is None: + if mlfunc.endswith(".yaml"): + mlfunc_format = "yaml" + elif mlfunc.endswith(".joblib"): + mlfunc_format = "joblib" + else: + raise ValueError("Unsupported file format") + if mlfunc_format == "yaml": + with open(mlfunc, "r") as f: + mlfunc = yaml.load(f, Loader=yaml.CLoader) + elif mlfunc_format == "joblib": + mlfunc = joblib.load(mlfunc) + else: + raise ValueError("Unsupported file format") + if not isinstance(mlfunc, MappedXC): + raise ValueError("mlfunc must be MappedXC") + return mlfunc + + +def get_slxc_settings(xc, xkernel, ckernel, xmix): + if xc is None: + # xc is another way to specify non-mixed part of kernel + xc = "" + if ckernel is not None: + xc = ckernel + " + " + xc + if xkernel is not None: + xc = "{} * {} + {}".format(1 - xmix, xkernel, xc) + if xc.endswith(" + "): + xc = xc[:-3] + return xc + + +class _CiderKS: + + grids = None + + def __init__( + self, + mf, + mlxc, + xmix=1.0, + nldf_init=None, + sdmx_init=None, + rhocut=None, + nlc_coeff=None, + ): + self.__dict__.update(mf.__dict__) + # self.mlxc = None + # self.xmix = None + # self.nldf_init = None + # self.sdmx_init = None + # self.rhocut = rhocut + self.set_mlxc( + mlxc, + xmix=xmix, + nldf_init=nldf_init, + sdmx_init=sdmx_init, + rhocut=rhocut, + nlc_coeff=nlc_coeff, + ) + + def set_mlxc( + self, + mlxc, + xmix=1.0, + nldf_init=None, + sdmx_init=None, + rhocut=None, + nlc_coeff=None, + ): + # self.mlxc = mlxc + # self.xmix = xmix + # self.nldf_init = nldf_init + # self.sdmx_init = sdmx_init + if nldf_init is None and mlxc.settings.has_nldf: + nldf_init = PySCFNLDFInitializer(mlxc.settings.nldf_settings) + if sdmx_init is None and mlxc.settings.has_sdmx: + sdmx_init = PySCFSDMXInitializer(mlxc.settings.sadm_settings, lowmem=False) + old_grids = self.grids + changed = False + if mlxc.settings.has_nldf: + if not isinstance(old_grids, CiderGrids): + changed = True + self.grids = CiderGrids(self.mol) + else: + if old_grids is None or isinstance(old_grids, CiderGrids): + changed = True + self.grids = Grids(self.mol) + if changed: + for key in ( + "atom_grid", + "atomic_radii", + "radii_adjust", + "radi_method", + "becke_scheme", + "prune", + "level", + ): + self.grids.__setattr__(key, old_grids.__getattribute__(key)) + settings = mlxc.settings + has_nldf = not settings.nldf_settings.is_empty + has_nlof = not settings.nlof_settings.is_empty + if has_nldf and has_nlof: + cls = NLDFNLOFNumInt + elif has_nldf: + cls = NLDFNumInt + elif has_nlof: + cls = NLOFNumInt + else: + cls = CiderNumInt + self._numint = cls( + mlxc, nldf_init, sdmx_init, xmix=xmix, rhocut=rhocut, nlc_coeff=nlc_coeff + ) + + def build(self, mol=None, **kwargs): + self._numint.build(mol=mol) + return super().build(mol, **kwargs) + + def reset(self, mol=None): + self._numint.reset(mol=mol) + return super().reset(mol) + + def method_not_implemented(self, *args, **kwargs): + raise NotImplementedError + + nuc_grad_method = Gradients = method_not_implemented + Hessian = method_not_implemented + NMR = method_not_implemented + NSR = method_not_implemented + Polarizability = method_not_implemented + RotationalGTensor = method_not_implemented + MP2 = 
method_not_implemented + CISD = method_not_implemented + CCSD = method_not_implemented + CASCI = method_not_implemented + CASSCF = method_not_implemented diff --git a/ciderpress/pyscf/frac_lapl.py b/ciderpress/pyscf/frac_lapl.py new file mode 100644 index 0000000..d1a5a5f --- /dev/null +++ b/ciderpress/pyscf/frac_lapl.py @@ -0,0 +1,888 @@ +import ctypes + +import numpy as np +from pyscf import lib +from pyscf.dft import numint +from pyscf.dft.gen_grid import ALIGNMENT_UNIT, BLKSIZE, CUTOFF, NBINS, make_mask +from pyscf.dft.numint import ( + OCCDROP, + _contract_rho, + _contract_rho_sparse, + _dot_ao_ao_sparse, + _dot_ao_dm, + _dot_ao_dm_sparse, + _empty_aligned, + _NumIntMixin, + _scale_ao_sparse, + _sparse_enough, +) +from pyscf.gto.eval_gto import ( + BLKSIZE, + _get_intor_and_comp, + libcgto, + make_loc, + make_screen_index, +) +from pyscf.gto.mole import ANG_OF +from scipy.special import gamma, hyp1f1 + +from ciderpress.lib import load_library as load_cider_library + +libcider = load_cider_library("libmcider") + +GLOBAL_FRAC_LAPL_DATA = {} +DEFAULT_FRAC_LAPL_POWER = 0.5 +DEFAULT_FRAC_LAPL_A = 0.25 +DEFAULT_FRAC_LAPL_D = 0.002 +DEFAULT_FRAC_LAPL_SIZE = 4000 +XCUT_1F1 = 100 + + +class FracLaplBuf: + def __init__(self, a, d, size, lmax, s): + print("INITIALIZING FLAPL", s) + self.a = a + self.d = d + self.size = size + self.lmax = lmax + self.s = s + self.buf = np.empty((lmax + 1, 4, size)) + x = a * (np.exp(d * np.arange(size)) - 1) + f = np.zeros((lmax + 1, size)) + xbig = x[x >= XCUT_1F1] + xmax = x[x < XCUT_1F1][-1] + for l in range(lmax + 1): + f[l] = hyp1f1(1.5 + s + l, 1.5 + l, -x) + f[l] *= 2 ** (2 * s) * gamma(1.5 + s + l) / gamma(1.5 + l) + fsmall = f[l, x < XCUT_1F1][-1] + f[l, x > XCUT_1F1] = fsmall * (xmax / xbig) ** (1.5 + s + l) + libcider.initialize_spline_1f1( + self.buf.ctypes.data_as(ctypes.c_void_p), + f.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(self.size), + ctypes.c_int(self.lmax), + ) + df = np.zeros((lmax + 1, size)) + for l in range(lmax + 1): + df[l] = hyp1f1(2.5 + s + l, 2.5 + l, -x) + df[l] *= 2 ** (2 * s) * gamma(2.5 + s + l) / gamma(2.5 + l) + fsmall = df[l, x < XCUT_1F1][-1] + df[l, x > XCUT_1F1] = fsmall * (xmax / xbig) ** (2.5 + s + l) + self.dbuf = np.empty((lmax + 1, 4, size)) + libcider.initialize_spline_1f1( + self.dbuf.ctypes.data_as(ctypes.c_void_p), + df.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(self.size), + ctypes.c_int(self.lmax), + ) + + def set_data(self): + # libcider.set_spline_1f1( + # self.buf.ctypes.data_as(ctypes.c_void_p), + # ctypes.c_double(self.a), + # ctypes.c_double(self.d), + # ctypes.c_int(self.size), + # ctypes.c_int(self.lmax), + # ctypes.c_double(self.s), + # ) + libcider.set_spline_1f1_with_grad( + self.buf.ctypes.data_as(ctypes.c_void_p), + self.dbuf.ctypes.data_as(ctypes.c_void_p), + ctypes.c_double(self.a), + ctypes.c_double(self.d), + ctypes.c_int(self.size), + ctypes.c_int(self.lmax), + ctypes.c_double(self.s), + ) + + +def _set_flapl(a=0.025, d=0.004, size=6000, lmax=6, s=0.5): + s_key = int(s * 100) + if s_key not in GLOBAL_FRAC_LAPL_DATA or GLOBAL_FRAC_LAPL_DATA[s_key].lmax < lmax: + GLOBAL_FRAC_LAPL_DATA[s_key] = FracLaplBuf(a, d, size, lmax, s) + GLOBAL_FRAC_LAPL_DATA[s_key].set_data() + + +def set_frac_lapl(s, lmax=6): + _set_flapl(lmax=lmax, s=s) + + +def eval_flapl_gto( + slst, + mol, + coords, + shls_slice=None, + non0tab=None, + ao_loc=None, + cutoff=None, + out=None, + debug=False, + deriv=0, +): + if non0tab is not None: + if (non0tab == 0).any(): + # TODO implement some sort of screening later + 
raise NotImplementedError + if mol.cart: + feval = "GTOval_cart_deriv%d" % deriv + else: + feval = "GTOval_sph_deriv%d" % deriv + eval_name, comp = _get_intor_and_comp(mol, feval) + comp = len(slst) + + atm = np.asarray(mol._atm, dtype=np.int32, order="C") + bas = np.asarray(mol._bas, dtype=np.int32, order="C") + env = np.asarray(mol._env, dtype=np.double, order="C") + coords = np.asarray(coords, dtype=np.double, order="F") + natm = atm.shape[0] + nbas = bas.shape[0] + ngrids = coords.shape[0] + + if eval_name == "GTOval_sph_deriv0": + deriv = 0 + elif eval_name == "GTOval_sph_deriv1": + deriv = 1 + else: + raise NotImplementedError + if debug: + assert deriv == 0 + contract_fn = getattr(libcgto, "GTOcontract_exp0") + eval_fn = getattr(libcgto, "GTOshell_eval_grid_cart") + else: + if deriv == 0: + contract_fn = getattr(libcider, "GTOcontract_flapl0") + eval_fn = getattr(libcgto, "GTOshell_eval_grid_cart") + elif deriv == 1: + contract_fn = getattr(libcider, "GTOcontract_flapl1") + eval_fn = getattr(libcgto, "GTOshell_eval_grid_cart_deriv1") + comp *= 4 + else: + raise ValueError + + if ao_loc is None: + ao_loc = make_loc(bas, eval_name) + + if shls_slice is None: + shls_slice = (0, nbas) + sh0, sh1 = shls_slice + nao = ao_loc[sh1] - ao_loc[sh0] + if "spinor" in eval_name: + ao = np.ndarray( + (2, comp, nao, ngrids), dtype=np.complex128, buffer=out + ).transpose(0, 1, 3, 2) + else: + ao = np.ndarray((comp, nao, ngrids), buffer=out).transpose(0, 2, 1) + + if non0tab is None: + if cutoff is None: + non0tab = np.ones(((ngrids + BLKSIZE - 1) // BLKSIZE, nbas), dtype=np.uint8) + else: + non0tab = make_screen_index(mol, coords, shls_slice, cutoff) + + # drv = getattr(libcgto, eval_name) + # normal call tree is GTOval_sph_deriv0 -> GTOval_sph -> GTOeval_sph_drv + drv = getattr(libcgto, "GTOeval_sph_drv") + for i, s in enumerate(slst): + set_frac_lapl(s, lmax=mol._bas[:, ANG_OF].max()) + if deriv == 0: + j = i + param = (1, 1) + else: + j = i * 4 + param = (1, 4) + drv( + eval_fn, + contract_fn, + ctypes.c_double(1), + ctypes.c_int(ngrids), + (ctypes.c_int * 2)(*param), + (ctypes.c_int * 2)(*shls_slice), + ao_loc.ctypes.data_as(ctypes.c_void_p), + ao[j].ctypes.data_as(ctypes.c_void_p), + coords.ctypes.data_as(ctypes.c_void_p), + non0tab.ctypes.data_as(ctypes.c_void_p), + atm.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(natm), + bas.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbas), + env.ctypes.data_as(ctypes.c_void_p), + ) + if comp == 1: + if "spinor" in eval_name: + ao = ao[:, 0] + else: + ao = ao[0] + return ao + + +def eval_rho( + mol, + all_ao, + dm, + flsettings, + non0tab=None, + xctype="LDA", + hermi=0, + with_lapl=True, + verbose=None, +): + ao, kao = all_ao + xctype = xctype.upper() + ngrids, nao = ao.shape[-2:] + + shls_slice = (0, mol.nbas) + ao_loc = mol.ao_loc_nr() + if with_lapl: + raise NotImplementedError + else: + rho = np.empty((5 + flsettings.nrho, ngrids)) + tau_idx = 4 + + c0 = _dot_ao_dm(mol, ao[0], dm, non0tab, shls_slice, ao_loc) + #:rho[0] = numpy.einsum('pi,pi->p', ao[0], c0) + rho[0] = _contract_rho(ao[0], c0) + nk0 = flsettings.nk0 + nk1 = flsettings.nk1 + nd1 = flsettings.nd1 + + for ik in range(nk0): + ir = ik + 1 + tau_idx + indk = flsettings.get_ik0(ik) + rho[ir] = _contract_rho(kao[indk], c0) + if not hermi: + rho[ir] += _contract_rho(c0, kao[indk]) + rho[ir] *= 0.5 + + for ik in range(nd1): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * ik) + indk = flsettings.get_ik0(ik) + for i in range(3): + rho[ir + i] = _contract_rho(c0, kao[indk + 1 + i]) + + 
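+    # Initialize tau and the dot-product descriptor rows, then accumulate the
+    # gradient-dependent contributions in the Cartesian loop below (the 1/2
+    # factor for tau is applied at the end).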
rho[tau_idx] = 0 + for ik in range(flsettings.ndd): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * nd1) + ik + rho[ir] = 0 + for i in range(1, 4): + c1 = _dot_ao_dm(mol, ao[i], dm, non0tab, shls_slice, ao_loc) + #:rho[tau_idx] += numpy.einsum('pi,pi->p', c1, ao[i]) + rho[tau_idx] += _contract_rho(ao[i], c1) + + #:rho[i] = numpy.einsum('pi,pi->p', c0, ao[i]) + rho[i] = _contract_rho(ao[i], c0) + if hermi: + rho[i] *= 2 + else: + rho[i] += _contract_rho(c1, ao[0]) + + for ik in range(flsettings.nk1): + ir = 3 * ik + i + tau_idx + nk0 + indk = flsettings.get_ik0(ik) + rho[ir] = _contract_rho(kao[indk], c1) + if not hermi: + rho[ir] += _contract_rho(c1, kao[indk]) + rho[ir] *= 0.5 + + for ik in range(flsettings.ndd): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * nd1) + ik + indk = flsettings.get_ik0(ik) + i + rho[ir] += _contract_rho(kao[indk], c1) + if not hermi: + rho[ir] += _contract_rho(c1, kao[indk]) + + # tau = 1/2 (\nabla f)^2 + rho[tau_idx] *= 0.5 + for ik in range(flsettings.ndd): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * nd1) + ik + if not hermi: + rho[ir] *= 0.5 + return rho + + +def eval_rho1( + mol, + all_ao, + dm, + flsettings, + screen_index=None, + xctype="LDA", + hermi=0, + with_lapl=True, + cutoff=None, + ao_cutoff=CUTOFF, + pair_mask=None, + verbose=None, +): + ao, kao = all_ao + if flsettings.nd1 != 0 or flsettings.ndd != 0: + raise NotImplementedError + xctype = xctype.upper() + ngrids = ao.shape[-2] + + if cutoff is None: + cutoff = CUTOFF + cutoff = min(cutoff, 0.1) + nbins = NBINS * 2 - int(NBINS * np.log(cutoff) / np.log(ao_cutoff)) + + if pair_mask is None: + ovlp_cond = mol.get_overlap_cond() + pair_mask = ovlp_cond < -np.log(cutoff) + + ao_loc = mol.ao_loc_nr() + if with_lapl: + raise NotImplementedError + else: + rho = np.empty((5 + flsettings.nrho, ngrids)) + tau_idx = 4 + c0 = _dot_ao_dm_sparse(ao[0], dm, nbins, screen_index, pair_mask, ao_loc) + rho[0] = _contract_rho_sparse(ao[0], c0, screen_index, ao_loc) + nk0 = flsettings.nk0 + nk1 = flsettings.nk1 + nd1 = flsettings.nd1 + + for ik in range(nk0): + # TODO might be able to use some kind of screening here + ir = ik + 1 + tau_idx + indk = flsettings.get_ik0(ik) + # rho[ir] = _contract_rho_sparse(kao[indk], c0, screen_index, ao_loc) + rho[ir] = _contract_rho(kao[indk], c0) + if not hermi: + # rho[ir] += _contract_rho_sparse(c0, kao[indk], screen_index, ao_loc) + rho[ir] += _contract_rho(c0, kao[indk]) + rho[ir] *= 0.5 + + for ik in range(nd1): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * ik) + indk = flsettings.get_ik0(ik) + for i in range(3): + # rho[ir + i] = _contract_rho_sparse( + # c0, kao[indk + 1 + i], screen_index, ao_loc + # ) + rho[ir + i] = _contract_rho(c0, kao[indk + 1 + i]) + + rho[tau_idx] = 0 + for ik in range(flsettings.ndd): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * nd1) + ik + rho[ir] = 0 + for i in range(1, 4): + c1 = _dot_ao_dm_sparse(ao[i], dm.T, nbins, screen_index, pair_mask, ao_loc) + rho[tau_idx] += _contract_rho_sparse(ao[i], c1, screen_index, ao_loc) + + rho[i] = _contract_rho_sparse(ao[i], c0, screen_index, ao_loc) + if hermi: + rho[i] *= 2 + else: + rho[i] += _contract_rho_sparse(c1, ao[0], screen_index, ao_loc) + + for ik in range(flsettings.nk1): + ir = 3 * ik + i + tau_idx + nk0 + indk = flsettings.get_ik0(ik) + # rho[ir] = _contract_rho_sparse(kao[indk], c1, screen_index, ao_loc) + rho[ir] = _contract_rho(kao[indk], c1) + if not hermi: + # rho[ir] += _contract_rho_sparse(c1, kao[indk], screen_index, ao_loc) + rho[ir] += _contract_rho(c1, kao[indk]) + rho[ir] *= 0.5 + + 
for ik in range(flsettings.ndd): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * nd1) + ik + indk = flsettings.get_ik0(ik) + i + # rho[ir] += _contract_rho_sparse(kao[indk], c1, screen_index, ao_loc) + rho[ir] += _contract_rho(kao[indk], c1) + if not hermi: + # rho[ir] += _contract_rho_sparse(c1, kao[indk], screen_index, ao_loc) + rho[ir] += _contract_rho(c1, kao[indk]) + + rho[tau_idx] *= 0.5 + for ik in range(flsettings.ndd): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * nd1) + ik + if not hermi: + rho[ir] *= 0.5 + return rho + + +def _odp_dot_sparse_( + all_ao, + wv, + nbins, + mask, + pair_mask, + ao_loc, + flsettings, + aow1=None, + aow2=None, + vmat=None, + v1=None, +): + ao, kao = all_ao + wv[0] *= 0.5 # *.5 for v+v.conj().T + wv[4] *= 0.5 # *.5 for 1/2 in tau + if flsettings.nrho > 0: + wv[5:] *= 0.5 + + aow1 = _scale_ao_sparse(ao[:4], wv[:4], mask, ao_loc, out=aow1) + nk0 = flsettings.nk0 + nk1 = flsettings.nk1 + nd1 = flsettings.nd1 + ndd = flsettings.ndd + tau_idx = 4 + if nd1 > 0 or ndd > 0: + wvtmp = np.zeros(kao.shape[:2]) + for ik in range(nk0): + ir = ik + 1 + tau_idx + indk = flsettings.get_ik0(ik) + wvtmp[indk] = wv[ir] + # rho[ir] = _contract_rho(kao[indk], c0) + # if not hermi: + # rho[ir] += _contract_rho(c0, kao[indk]) + # rho[ir] *= 0.5 + for ik in range(nd1): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * ik) + indk = flsettings.get_ik0(ik) + wvtmp[indk + 1 : indk + 4] = wv[ir : ir + 3] + aow2 = _scale_ao_sparse(kao, wvtmp, None, ao_loc, out=aow2) + aow1 += aow2 + vmat = _dot_ao_ao_sparse( + ao[0], aow1, None, nbins, None, pair_mask, ao_loc, hermi=0, out=vmat + ) + elif nk0 > 0: + # TODO might be able to use some kind of screening here + # aow2 = _scale_ao_sparse(kao, wv[5:5 + nk0], mask, ao_loc, out=aow2) + # aow1 += aow2 + # vmat = _dot_ao_ao_sparse(ao[0], aow1, None, nbins, mask, pair_mask, + # ao_loc, hermi=0, out=vmat) + aow2 = _scale_ao_sparse(kao, wv[5 : 5 + nk0], None, ao_loc, out=aow2) + print(np.sum(aow2), np.sum(aow1)) + aow1 += aow2 + vmat = _dot_ao_ao_sparse( + ao[0], aow1, None, nbins, None, pair_mask, ao_loc, hermi=0, out=vmat + ) + else: + vmat = _dot_ao_ao_sparse( + ao[0], aow1, None, nbins, mask, pair_mask, ao_loc, hermi=0, out=vmat + ) + + if ndd > 0 or nd1 > 0: + for j in range(3): + i = j + 1 + # TODO might be able to use some kind of screening here + aow1 = _scale_ao_sparse(ao[i], wv[4], mask, ao_loc, out=aow1) + for ik in range(nk1): + ir = 3 * ik + i + tau_idx + nk0 + indk = flsettings.get_ik0(ik) + aow2 = _scale_ao_sparse(kao[indk], wv[ir], None, ao_loc, out=aow2) + aow1 += aow2 + for ik in range(ndd): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * nd1) + ik + indk = flsettings.get_ik0(ik) + i + aow2 = _scale_ao_sparse(kao[indk], wv[ir], None, ao_loc, out=aow2) + aow1 += aow2 + vmat = _dot_ao_ao_sparse( + ao[i], aow1, None, nbins, None, pair_mask, ao_loc, hermi=0, out=vmat + ) + elif nk1 > 0: + kwv = np.empty((nk1, wv.shape[-1])) + gkao = kao[:nk1] + for j in range(3): + i = j + 1 + # TODO might be able to use some kind of screening here + kwv[:] = wv[5 + nk0 + j : 5 + nk0 + 3 * nk1 : 3] + aow1 = _scale_ao_sparse(ao[i], wv[4], mask, ao_loc, out=aow1) + # aow2 = _scale_ao_sparse(gkao, kwv, mask, ao_loc, out=aow2) + # aow1 += aow2 + # vmat = _dot_ao_ao_sparse(ao[i], aow1, None, nbins, mask, pair_mask, + # ao_loc, hermi=0, out=vmat) + aow2 = _scale_ao_sparse(gkao, kwv, None, ao_loc, out=aow2) + aow1 += aow2 + vmat = _dot_ao_ao_sparse( + ao[i], aow1, None, nbins, None, pair_mask, ao_loc, hermi=0, out=vmat + ) + else: + for j in range(3): + i = j + 1 
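+            # No fractional-Laplacian terms couple to the AO gradients here;
+            # this is the ordinary meta-GGA tau contraction.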
+ aow1 = _scale_ao_sparse(ao[i], wv[4], mask, ao_loc, out=aow1) + v1 = _dot_ao_ao_sparse( + ao[i], aow1, None, nbins, mask, pair_mask, ao_loc, hermi=1, out=v1 + ) + + return aow1, aow2, vmat, v1 + + +def eval_rho2( + mol, + all_ao, + mo_coeff, + mo_occ, + flsettings, + non0tab=None, + xctype="LDA", + with_lapl=True, + verbose=None, +): + ao, kao = all_ao + xctype = xctype.upper() + ngrids, nao = ao.shape[-2:] + + shls_slice = (0, mol.nbas) + ao_loc = mol.ao_loc_nr() + pos = mo_occ > OCCDROP + if np.any(pos): + cpos = np.einsum("ij,j->ij", mo_coeff[:, pos], np.sqrt(mo_occ[pos])) + if with_lapl: + raise NotImplementedError + else: + rho = np.empty((5 + flsettings.nrho, ngrids)) + tau_idx = 4 + c0 = _dot_ao_dm(mol, ao[0], cpos, non0tab, shls_slice, ao_loc) + #:rho[0] = numpy.einsum('pi,pi->p', c0, c0) + rho[0] = _contract_rho(c0, c0) + cklst = [] + for i in range(kao.shape[0]): + # TODO might be able to use some kind of screening here + cklst.append( + _dot_ao_dm(mol, kao[i], cpos, None, shls_slice, ao_loc) + # _dot_ao_dm(mol, kao[i], cpos, non0tab, shls_slice, ao_loc) + ) + nk0 = flsettings.nk0 + nk1 = flsettings.nk1 + nd1 = flsettings.nd1 + for ik in range(nk0): + ir = ik + 1 + tau_idx + indk = flsettings.get_ik0(ik) + rho[ir] = _contract_rho(cklst[indk], c0) + + for ik in range(nd1): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * ik) + indk = flsettings.get_ik0(ik) + for i in range(3): + rho[ir + i] = _contract_rho(c0, cklst[indk + 1 + i]) + + rho[tau_idx] = 0 + for ik in range(flsettings.ndd): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * nd1) + ik + rho[ir] = 0 + for i in range(1, 4): + c1 = _dot_ao_dm(mol, ao[i], cpos, non0tab, shls_slice, ao_loc) + rho[i] = _contract_rho(c0, c1) * 2 + rho[tau_idx] += _contract_rho(c1, c1) + + for ik in range(flsettings.nk1): + ir = 3 * ik + i + tau_idx + nk0 + indk = flsettings.get_ik0(ik) + rho[ir] = _contract_rho(cklst[indk], c1) + + for ik in range(flsettings.ndd): + ir = (3 * nk1 + nk0) + (1 + tau_idx) + (3 * nd1) + ik + indk = flsettings.get_ik0(ik) + i + rho[ir] += _contract_rho(cklst[indk], c1) + + rho[tau_idx] *= 0.5 + return rho + else: + rho = np.zeros((5 + flsettings.nrho, ngrids)) + return rho + + +def _block_loop( + ni, + mol, + grids, + nao=None, + deriv=0, + max_memory=2000, + non0tab=None, + blksize=None, + buf=None, +): + """Define this macro to loop over grids by blocks.""" + if grids.coords is None: + grids.build(with_non0tab=True) + if nao is None: + nao = mol.nao + ngrids = grids.coords.shape[0] + comp = (deriv + 1) * (deriv + 2) * (deriv + 3) // 6 + flsettings = ni.plan.settings + kcomp = flsettings.npow + 3 * flsettings.nd1 + # NOTE to index grids.non0tab, the blksize needs to be an integer + # multiplier of BLKSIZE + if blksize is None: + blksize = int(max_memory * 1e6 / ((comp + kcomp + 1) * nao * 8 * BLKSIZE)) + blksize = max(4, min(blksize, ngrids // BLKSIZE + 1, 1200)) * BLKSIZE + assert blksize % BLKSIZE == 0 + + if non0tab is None and mol is grids.mol: + non0tab = grids.non0tab + if non0tab is None: + non0tab = np.empty( + ((ngrids + BLKSIZE - 1) // BLKSIZE, mol.nbas), dtype=np.uint8 + ) + non0tab[:] = NBINS + 1 # Corresponding to AO value ~= 1 + screen_index = non0tab + + # the xxx_sparse() functions require ngrids 8-byte aligned + allow_sparse = ngrids % ALIGNMENT_UNIT == 0 + + if buf is None: + buf = _empty_aligned((comp + kcomp) * blksize * nao) + for ip0, ip1 in lib.prange(0, ngrids, blksize): + coords = grids.coords[ip0:ip1] + weight = grids.weights[ip0:ip1] + mask = screen_index[ip0 // BLKSIZE :] + # TODO: pass 
grids.cutoff to eval_ao + ao_buf = np.ndarray((comp + kcomp, weight.size * nao), buffer=buf) + ao = ni.eval_ao( + mol, coords, deriv=deriv, non0tab=mask, cutoff=grids.cutoff, out=ao_buf + ) + kao = ni.eval_kao( + flsettings.slist, + mol, + coords, + deriv=0, + non0tab=None, + cutoff=grids.cutoff, + out=ao_buf[comp:], + n1=flsettings.nd1, + ) + if not allow_sparse and not _sparse_enough(mask): + # Unset mask for dense AO tensor. It determines which eval_rho + # to be called in make_rho + mask = None + yield (ao, kao), mask, weight, coords + + +def eval_kao( + slst, + mol, + coords, + deriv=0, + shls_slice=None, + non0tab=None, + cutoff=None, + out=None, + n1=0, + verbose=None, +): + assert deriv == 0 + comp = len(slst) + if mol.cart: + "GTOval_cart_deriv%d" % deriv + else: + "GTOval_sph_deriv%d" % deriv + if n1 == 0: + return eval_flapl_gto( + slst, mol, coords, shls_slice, non0tab, cutoff=cutoff, out=out + ) + else: + comp += 3 * n1 + nao = mol.nao_nr() + ngrids = coords.shape[0] + if out is None: + out = np.ndarray((comp, nao, ngrids), buffer=out) + else: + assert out.shape[0] == comp + assert out.flags.c_contiguous + out = out.reshape((comp, nao, ngrids)) + eval_flapl_gto( + slst[:n1], + mol, + coords, + shls_slice, + non0tab, + cutoff=cutoff, + out=out[: 4 * n1], + deriv=1, + ) + eval_flapl_gto( + slst[n1:], + mol, + coords, + shls_slice, + non0tab, + cutoff=cutoff, + out=out[4 * n1 :], + ) + return out.transpose(0, 2, 1) + + +def nr_rks( + ni, mol, grids, xc_code, dms, relativity=0, hermi=1, max_memory=2000, verbose=None +): + if relativity != 0: + raise NotImplementedError + + xctype = "MGGA" + + make_rho, nset, nao = ni._gen_rho_evaluator(mol, dms, hermi, False, grids) + ao_loc = mol.ao_loc_nr() + cutoff = grids.cutoff * 1e2 + nbins = NBINS * 2 - int(NBINS * np.log(cutoff) / np.log(grids.cutoff)) + + nelec = np.zeros(nset) + excsum = np.zeros(nset) + vmat = np.zeros((nset, nao, nao)) + + def block_loop(ao_deriv): + for ao, mask, weight, coords in ni.block_loop( + mol, grids, nao, ao_deriv, max_memory=max_memory + ): + for i in range(nset): + rho = make_rho(i, ao, mask, xctype) + exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] + den = rho[0] * weight + nelec[i] += den.sum() + excsum[i] += np.dot(den, exc) + wv = weight * vxc + yield i, ao, mask, wv + + aow1 = None + aow2 = None + pair_mask = mol.get_overlap_cond() < -np.log(ni.cutoff) + if any(x in xc_code.upper() for x in ("CC06", "CS", "BR89", "MK00")): + raise NotImplementedError("laplacian in meta-GGA method") + ao_deriv = 1 + v1 = np.zeros_like(vmat) + for i, ao, mask, wv in block_loop(ao_deriv): + _odp_dot_sparse_( + ao, + wv, + nbins, + mask, + pair_mask, + ao_loc, + ni.flsettings, + aow1=aow1, + aow2=aow2, + vmat=vmat[i], + v1=v1[i], + ) + vmat = lib.hermi_sum(vmat, axes=(0, 2, 1)) + vmat += v1 + + +class FLNumInt(_NumIntMixin): + """Numerical integration methods for non-relativistic RKS and UKS + with fractional Laplacian density descriptors.""" + + cutoff = CUTOFF * 1e2 # cutoff for small AO product + + def __init__(self, plan): + """ + Initialize FLNumInt object. 
+ + Args: + plan (FracLaplPlan): Plan object for how to evaluate fractional + Laplacian features + """ + super(FLNumInt, self).__init__() + self.plan = plan + + def nr_vxc( + self, + mol, + grids, + xc_code, + dms, + spin=0, + relativity=0, + hermi=0, + max_memory=2000, + verbose=None, + ): + if spin == 0: + return self.nr_rks( + mol, grids, xc_code, dms, relativity, hermi, max_memory, verbose + ) + else: + return self.nr_uks( + mol, grids, xc_code, dms, relativity, hermi, max_memory, verbose + ) + + get_vxc = nr_vxc + + nr_rks = nr_rks + nr_uks = None # TODO write nr_uks + + # nr_nlc_vxc = numint.nr_nlc_vxc + nr_sap = nr_sap_vxc = numint.nr_sap_vxc + + make_mask = staticmethod(make_mask) + eval_ao = staticmethod(numint.eval_ao) + eval_kao = staticmethod(eval_kao) + eval_rho = staticmethod(eval_rho) + eval_rho1 = lib.module_method(eval_rho1, absences=["cutoff"]) + eval_rho2 = staticmethod(eval_rho2) + get_rho = numint.get_rho + + block_loop = _block_loop + + def _gen_rho_evaluator(self, mol, dms, hermi=0, with_lapl=True, grids=None): + if getattr(dms, "mo_coeff", None) is not None: + # TODO: test whether dm.mo_coeff matching dm + mo_coeff = dms.mo_coeff + mo_occ = dms.mo_occ + if isinstance(dms, np.ndarray) and dms.ndim == 2: + mo_coeff = [mo_coeff] + mo_occ = [mo_occ] + else: + mo_coeff = mo_occ = None + + if isinstance(dms, np.ndarray) and dms.ndim == 2: + dms = dms[np.newaxis] + + if hermi != 1 and dms[0].dtype == np.double: + # (D + D.T)/2 because eval_rho computes 2*(|\nabla i> D_ij D_ij D_ij <\nabla j| for efficiency when dm is real + dms = lib.hermi_sum(np.asarray(dms, order="C"), axes=(0, 2, 1)) * 0.5 + hermi = 1 + + nao = dms[0].shape[0] + ndms = len(dms) + + ovlp_cond = mol.get_overlap_cond() + if dms[0].dtype == np.double: + dm_cond = np.max( + [mol.condense_to_shell(dm, "absmax") for dm in dms], axis=0 + ) + pair_mask = (np.exp(-ovlp_cond) * dm_cond) > self.cutoff + else: + pair_mask = ovlp_cond < -np.log(self.cutoff) + + def make_rho(idm, ao, sindex, xctype): + if sindex is not None and grids is not None: + return self.eval_rho1( + mol, + ao, + dms[idm], + self.plan.settings, + sindex, + xctype, + hermi, + with_lapl, + cutoff=self.cutoff, + ao_cutoff=grids.cutoff, + pair_mask=pair_mask, + ) + elif mo_coeff is not None: + return self.eval_rho2( + mol, + ao, + mo_coeff[idm], + mo_occ[idm], + self.plan.settings, + sindex, + xctype, + with_lapl, + ) + else: + return self.eval_rho( + mol, + ao, + dms[idm], + self.plan.settings, + sindex, + xctype, + hermi, + with_lapl, + ) + + return make_rho, ndms, nao diff --git a/ciderpress/pyscf/gen_cider_grid.py b/ciderpress/pyscf/gen_cider_grid.py new file mode 100644 index 0000000..da80139 --- /dev/null +++ b/ciderpress/pyscf/gen_cider_grid.py @@ -0,0 +1,266 @@ +import ctypes + +import numpy as np +from pyscf import gto +from pyscf.dft.gen_grid import ( + LEBEDEV_NGRID, + LEBEDEV_ORDER, + NELEC_ERROR_TOL, + Grids, + _default_ang, + _default_rad, + _padding_size, + arg_group_grids, + libdft, + logger, + nwchem_prune, + radi, +) + +from ciderpress.dft.grids_indexer import AtomicGridsIndexer, libcider + +CIDER_DEFAULT_LMAX = 10 + + +class CiderGrids(Grids): + """ + Attributes (in addition to those in pyscf Grids object): + lmax (int): Maximum l value for integration of CIDER features + nlm (int): (lmax + 1)**2 + grids_indexer (AtomicGridsIndexer): object to sort and index + grids for CIDER features. 
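+
+    A minimal usage sketch (assumes mol is a built pyscf Mole object):
+
+        grids = CiderGrids(mol, lmax=10)
+        grids.level = 3
+        grids.build(with_non0tab=False)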
+ """ + + grids_indexer: AtomicGridsIndexer + + _keys = {"grids_indexer", "nlm", "lmax"} + + def __init__(self, mol, lmax=CIDER_DEFAULT_LMAX): + super(CiderGrids, self).__init__(mol) + # self.becke_scheme = becke_lko + self.lmax = lmax + self.nlm = (lmax + 1) * (lmax + 1) + self.grids_indexer = None + + def gen_atomic_grids( + self, mol, atom_grid=None, radi_method=None, level=None, prune=None, **kwargs + ): + """ + Same as gen_atomic_grids for PySCF grids, except it also + uses the outputs of gen_atomic_grids_cider to initialize the + grids_indexer object for CIDER feature routines. + """ + if atom_grid is None: + atom_grid = self.atom_grid + if radi_method is None: + radi_method = self.radi_method + if level is None: + level = self.level + if prune is None: + prune = self.prune + ( + atom_grids_tab, + lmax_tab, + rad_loc_tab, + ylm_tab, + ylm_loc_tab, + rad_tab, + dr_tab, + ) = gen_atomic_grids_cider( + mol, atom_grid, self.radi_method, level, prune, **kwargs + ) + self.grids_indexer = AtomicGridsIndexer.from_tabs( + mol, self.lmax, rad_loc_tab, ylm_loc_tab, rad_tab, ylm_tab + ) + return atom_grids_tab + + def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs): + """ + Build the grids. Same as with PySCF grids, but also stores an idx_map + in the grids_indexer object so that one can map between the atomic grids + and sorted grids. + """ + if mol is None: + mol = self.mol + if self.verbose >= logger.WARN: + self.check_sanity() + atom_grids_tab = self.gen_atomic_grids( + mol, self.atom_grid, self.radi_method, self.level, self.prune, **kwargs + ) + # TODO cleaner version of this way of calling VXCgen_grid_lko + # tmp = libdft.VXCgen_grid + # libdft.VXCgen_grid = libcider.VXCgen_grid_lko + self.coords, self.weights = self.get_partition( + mol, atom_grids_tab, self.radii_adjust, self.atomic_radii, self.becke_scheme + ) + # libdft.VXCgen_grid = tmp + self.grids_indexer.set_weights(self.weights) + + if sort_grids: + idx = arg_group_grids(mol, self.coords) + self.coords = self.coords[idx] + self.weights = self.weights[idx] + self.grids_indexer.set_idx(idx) + else: + self.grids_indexer.set_idx(np.arange(self.weights.size)) + + if self.alignment > 1: + padding = _padding_size(self.size, self.alignment) + logger.debug(self, "Padding %d grids", padding) + if padding > 0: + self.coords = np.vstack( + [self.coords, np.repeat([[1e-4] * 3], padding, axis=0)] + ) + self.weights = np.hstack([self.weights, np.zeros(padding)]) + self.grids_indexer.set_padding(padding) + + self.coords = np.asarray(self.coords, order="C") + self.weights = np.asarray(self.weights, order="C") + + if with_non0tab: + self.non0tab = self.make_mask(mol, self.coords) + self.screen_index = self.non0tab + else: + self.screen_index = self.non0tab = None + logger.info(self, "tot grids = %d", len(self.weights)) + return self + + def reset(self, mol=None): + self.grids_indexer = None + return super().reset(mol=mol) + + def prune_by_density_(self, rho, threshold=0): + """ + Prune grids if the electron density on the grid is small. + Only difference from PySCF version is that grids_indexer.set_idx + is called to make sure the map between unsorted and sorted/screened + grids is correct. 
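+
+        A sketch of the intended call pattern (assumes rho has been
+        evaluated on the current grid points):
+
+            grids.prune_by_density_(rho, threshold=1e-10)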
+ """ + if threshold == 0: + return self + + mol = self.mol + n = np.dot(rho, self.weights) + if abs(n - mol.nelectron) < NELEC_ERROR_TOL * n: + rho *= self.weights + idx = abs(rho) > threshold / self.weights.size + logger.debug( + self, "Drop grids %d", self.weights.size - np.count_nonzero(idx) + ) + self.coords = np.asarray(self.coords[idx], order="C") + self.weights = np.asarray(self.weights[idx], order="C") + old_idx = self.grids_indexer.get_idx() + old_idx_size = old_idx.size + self.grids_indexer.set_idx(old_idx[idx[:old_idx_size]]) + if self.alignment > 1: + padding = _padding_size(self.size, self.alignment) + logger.debug(self, "prune_by_density_: %d padding grids", padding) + if padding > 0: + self.coords = np.vstack( + [self.coords, np.repeat([[1e-4] * 3], padding, axis=0)] + ) + self.weights = np.hstack([self.weights, np.zeros(padding)]) + self.grids_indexer.set_padding(padding) + self.non0tab = self.make_mask(mol, self.coords) + self.screen_index = self.non0tab + self.coords = np.asarray(self.coords, order="C") + self.weights = np.asarray(self.weights, order="C") + return self + + +LMAX_DICT = {v: k // 2 for k, v in LEBEDEV_ORDER.items()} + + +def gen_atomic_grids_cider( + mol, + atom_grid=None, + radi_method=radi.gauss_chebyshev, + level=3, + prune=nwchem_prune, + full_lmax=CIDER_DEFAULT_LMAX, + **kwargs +): + if atom_grid is None: + atom_grid = {} + if isinstance(atom_grid, (list, tuple)): + atom_grid = dict([(mol.atom_symbol(ia), atom_grid) for ia in range(mol.natm)]) + atom_grids_tab = {} + lmax_tab = {} + rad_loc_tab = {} + ylm_tab = {} + ylm_loc_tab = {} + rad_tab = {} + dr_tab = {} + for ia in range(mol.natm): + symb = mol.atom_symbol(ia) + + if symb not in atom_grids_tab: + chg = gto.charge(symb) + if symb in atom_grid: + n_rad, n_ang = atom_grid[symb] + if n_ang not in LEBEDEV_NGRID: + raise ValueError("Unsupported angular grids %d" % n_ang) + else: + n_rad = _default_rad(chg, level) + n_ang = _default_ang(chg, level) + rad, dr = radi_method(n_rad, chg, ia, **kwargs) + + rad_weight = 4 * np.pi * rad**2 * dr + + if callable(prune): + angs = prune(chg, rad, n_ang) + else: + angs = [n_ang] * n_rad + logger.debug( + mol, "atom %s rad-grids = %d, ang-grids = %s", symb, n_rad, angs + ) + + angs = np.array(angs) + coords = [] + vol = [] + rad_loc = np.array([0], dtype=np.int32) + lmaxs = np.array([LMAX_DICT[ang] for ang in angs]).astype(np.int32) + lmaxs = np.minimum(lmaxs, full_lmax) + nlm = (full_lmax + 1) * (full_lmax + 1) + ylm_full = np.empty((0, nlm), order="C") + ylm_loc = [] + rads = [] + drs = [] + for n in sorted(set(angs)): + grid = np.empty((n, 4)) + libdft.MakeAngularGrid( + grid.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(n) + ) + idx = np.where(angs == n)[0] + yloc_curr = ylm_full.shape[0] + ylm = np.zeros((n, nlm), order="C") + sphgd = np.ascontiguousarray(grid[:, :3]) + libcider.recursive_sph_harm_vec( + ctypes.c_int(nlm), + ctypes.c_int(n), + sphgd.ctypes.data_as(ctypes.c_void_p), + ylm.ctypes.data_as(ctypes.c_void_p), + ) + lmax_shl = LMAX_DICT[n] + nlm_shl = (lmax_shl + 1) * (lmax_shl + 1) + ylm[:, nlm_shl:] = 0.0 + ylm_full = np.append(ylm_full, ylm, axis=0) + coords.append( + np.einsum("i,jk->ijk", rad[idx], grid[:, :3]).reshape(-1, 3) + ) + vol.append(np.einsum("i,j->ij", rad_weight[idx], grid[:, 3]).ravel()) + rads.append(rad[idx]) + drs.append(dr[idx]) + rad_loc = np.append( + rad_loc, rad_loc[-1] + n * np.arange(1, len(idx) + 1) + ) + ylm_loc.append(yloc_curr * np.ones(idx.size, dtype=np.int32)) + atom_grids_tab[symb] = (np.vstack(coords), 
np.hstack(vol)) + lmax_tab[symb] = lmaxs + rad_loc_tab[symb] = rad_loc + ylm_tab[symb] = ylm_full + ylm_loc_tab[symb] = np.concatenate(ylm_loc).astype(np.int32) + rad_tab[symb] = np.concatenate(rads).astype(np.float64) + dr_tab[symb] = np.concatenate(drs).astype(np.float64) + + return atom_grids_tab, lmax_tab, rad_loc_tab, ylm_tab, ylm_loc_tab, rad_tab, dr_tab diff --git a/ciderpress/pyscf/nldf_convolutions.py b/ciderpress/pyscf/nldf_convolutions.py new file mode 100644 index 0000000..9318162 --- /dev/null +++ b/ciderpress/pyscf/nldf_convolutions.py @@ -0,0 +1,249 @@ +import numpy as np +from pyscf import gto +from pyscf.data.radii import COVALENT as COVALENT_RADII + +from ciderpress.dft.lcao_convolutions import ( + DEFAULT_AUX_BETA, + DEFAULT_CIDER_LMAX, + ATCBasis, + ConvolutionCollection, + ConvolutionCollectionK, + get_convolution_expnts_from_expnts, + get_gamma_lists_from_bas_and_env, +) +from ciderpress.dft.lcao_interpolation import LCAOInterpolator, LCAOInterpolatorDirect +from ciderpress.dft.lcao_nldf_generator import LCAONLDFGenerator +from ciderpress.dft.plans import VI_ID_MAP, NLDFGaussianPlan, NLDFSplinePlan + + +def aug_etb_for_cider( + mol, + beta=DEFAULT_AUX_BETA, + upper_fac=1.0, + lower_fac=1.0, + lmax=DEFAULT_CIDER_LMAX, + def_afac_max=4.0, + def_afac_min=0.5, +): + """ + augment weigend basis with even-tempered gaussian basis + exps = alpha*beta^i for i = 1..N + For this implementation, exponent values should be on the + same 'grid' ..., 1/beta, 1, beta, beta^2, ... + + Args: + mol (pyscf.gto.Mole): molecule object + beta (float): exponent parameter for ETB basis + upper_fac (float): Max exponent should be at least + upper_fac * max exponent in original basis + for each l value. + lower_fac (float): Min exponent should be at most + lower_fac * min exponent in original basis + for each l value. + lmax (int): Maximum l value for expansion. + def_afac_max (float): Max exponent relative to Bohr radius + if no orbital products with a particular l exist + in the original basis. + def_afac_min (float): Min exponent relative to Bohr radius + if not orbital products with a particular l exist + in the original basis + + Returns: + New basis set for expanding CIDER theta contributions. 
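+
+    Example (an illustrative sketch; the molecule, beta, and lmax values
+    are arbitrary). Only the first return value (the basis dict) is
+    needed to build the Mole object used for the convolutions:
+
+        mol = gto.M(atom="H 0 0 0; F 0 0 0.92", basis="def2-tzvp")
+        etb_basis = aug_etb_for_cider(mol, beta=1.6, lmax=10)[0]
+        conv_mol = gto.M(atom=mol.atom, basis=etb_basis, spin=mol.spin,
+                         charge=mol.charge, unit=mol.unit)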
+ """ + uniq_atoms = set([a[0] for a in mol._atom]) + + newbasis = {} + for symb in uniq_atoms: + nuc_charge = gto.charge(symb) + max_shells = lmax + emin_by_l = [1e99] * 8 + emax_by_l = [0] * 8 + l_max = 7 + for b in mol._basis[symb]: + l = b[0] + if l >= max_shells + 1: + continue + + if isinstance(b[1], int): + e_c = np.array(b[2:]) + else: + e_c = np.array(b[1:]) + es = e_c[:, 0] + cs = e_c[:, 1:] + es = es[abs(cs).max(axis=1) > 1e-3] + emax_by_l[l] = max(es.max(), emax_by_l[l]) + emin_by_l[l] = min(es.min(), emin_by_l[l]) + + l_max1 = l_max + 1 + emin_by_l = np.array(emin_by_l) + emax_by_l = np.array(emax_by_l) + + # Estimate the exponents ranges by geometric average + emax = np.sqrt(np.einsum("i,j->ij", emax_by_l, emax_by_l)) + emin = np.sqrt(np.einsum("i,j->ij", emin_by_l, emin_by_l)) + liljsum = np.arange(l_max1)[:, None] + np.arange(l_max1) + emax_by_l = [emax[liljsum == ll].max() for ll in range(l_max1 * 2 - 1)] + emin_by_l = [emin[liljsum == ll].min() for ll in range(l_max1 * 2 - 1)] + # Tune emin and emax + emin_by_l = np.array(emin_by_l) * lower_fac # *2 for alpha+alpha on same center + emax_by_l = np.array(emax_by_l) * upper_fac # / (np.arange(l_max1*2-1)*.5+1) + + def_amax = def_afac_max / COVALENT_RADII[nuc_charge] ** 2 + def_amin = def_afac_min / COVALENT_RADII[nuc_charge] ** 2 + + cond = emax_by_l == 0 + emax_by_l[cond] = def_amax + emin_by_l[cond] = def_amin + emax_by_l = np.maximum(def_amax, emax_by_l[: lmax + 1]) + emin_by_l = np.minimum(def_amin, emin_by_l[: lmax + 1]) + + nmaxs = np.ceil(np.log(emax_by_l) / np.log(beta)) + nmins = np.floor(np.log(emin_by_l) / np.log(beta)) + emin_by_l = beta**nmins + ns = nmaxs - nmins + 1 + etb = [] + for l, n in enumerate(np.ceil(ns).astype(int)): + # print(l, n, emin_by_l[l], emax_by_l[l], beta) + if n > 0 and l <= lmax: + etb.append((l, n, emin_by_l[l], beta)) + newbasis[symb] = gto.expand_etbs(etb) + + return newbasis, ns, np.max(nmaxs) - nmaxs + + +def get_gamma_lists_from_mol(mol): + return get_gamma_lists_from_bas_and_env(mol._bas, mol._env) + + +class PySCFNLDFInitializer: + def __init__(self, settings, **kwargs): + """ + Args: + settings (NLDFSettings): settings for features + **kwargs: Any of the optional arguments in the + PyscfNLDFGenerator.from_mol_and_settings + function. + """ + self.settings = settings + self.kwargs = kwargs + + def initialize_nldf_generator(self, mol, grids_indexer, nspin): + return PyscfNLDFGenerator.from_mol_and_settings( + mol, grids_indexer, nspin, self.settings, **self.kwargs + ) + + +class PyscfNLDFGenerator(LCAONLDFGenerator): + """ + A PySCF-specific wrapper for the NLDF feature generator. 
+ """ + + @classmethod + def from_mol_and_settings( + cls, + mol, + grids_indexer, + nspin, + nldf_settings, + plan_type="gaussian", + lmax=10, + aux_lambd=1.6, + aug_beta=1.6, + alpha_max=3000, + alpha_min=1.43702907e-03, + alpha_formula="etb", + rhocut=1e-9, + expcut=1e-9, + gbuf=2.0, + interpolator_type="onsite_direct", + aparam=0.03, + dparam=0.04, + nrad=200, + ): + if lmax is None: + lmax = grids_indexer.lmax + if lmax > grids_indexer.lmax: + raise ValueError("lmax cannot be larger than grids_indexer.lmax") + if interpolator_type not in ["onsite_direct", "onsite_spline", "train_gen"]: + raise ValueError + if aug_beta is None: + aug_beta = aux_lambd + if alpha_min is None: + alpha_min = nldf_settings.theta_params[0] / 128 # sensible default + if plan_type not in ["gaussian", "spline"]: + raise ValueError("plan_type must be gaussian or spline") + if alpha_formula is None: + alpha_formula = "etb" if plan_type == "gaussian" else "zexp" + nalpha = int(np.ceil(np.log(alpha_max / alpha_min) / np.log(aux_lambd))) + 1 + plan_class = NLDFGaussianPlan if plan_type == "gaussian" else NLDFSplinePlan + plan = plan_class( + nldf_settings, + nspin, + alpha_min, + aux_lambd, + nalpha, + coef_order="gq", + alpha_formula=alpha_formula, + proc_inds=None, + rhocut=rhocut, + expcut=expcut, + ) + basis = aug_etb_for_cider(mol, lmax=lmax, beta=aug_beta)[0] + mol = gto.M( + atom=mol.atom, + basis=basis, + spin=mol.spin, + charge=mol.charge, + unit=mol.unit, + ) + dat = get_gamma_lists_from_mol(mol) + atco_inp = ATCBasis(*dat) + dat = get_convolution_expnts_from_expnts( + plan.alphas, dat[0], dat[1], dat[2], dat[4], gbuf=gbuf + ) + atco_out = ATCBasis(*dat) + if plan.nldf_settings.nldf_type == "k": + ccl = ConvolutionCollectionK( + atco_inp, atco_out, plan.alphas, plan.alpha_norms + ) + else: + has_vj = "j" in plan.nldf_settings.nldf_type + ifeat_ids = [] + for spec in plan.nldf_settings.l0_feat_specs: + ifeat_ids.append(VI_ID_MAP[spec]) + for spec in plan.nldf_settings.l1_feat_specs: + ifeat_ids.append(VI_ID_MAP[spec]) + ccl = ConvolutionCollection( + atco_inp, + atco_out, + plan.alphas, + plan.alpha_norms, + has_vj=has_vj, + ifeat_ids=ifeat_ids, + ) + if interpolator_type == "train_gen": + interpolator = LCAOInterpolator.from_ccl( + mol.atom_coords(unit="Bohr"), + ccl, + aparam=aparam, + dparam=dparam, + nrad=nrad, + ) + else: + onsite_direct = interpolator_type == "onsite_direct" + interpolator = LCAOInterpolatorDirect.from_ccl( + grids_indexer, + mol.atom_coords(unit="Bohr"), + ccl, + aparam=aparam, + dparam=dparam, + nrad=nrad, + onsite_direct=onsite_direct, + ) + ccl.compute_integrals_() + ccl.solve_projection_coefficients() + return cls(plan, ccl, interpolator, grids_indexer) + + def get_extra_ao(self): + return (self.interpolator.nlm + 4) * self.natm diff --git a/ciderpress/pyscf/numint.py b/ciderpress/pyscf/numint.py new file mode 100644 index 0000000..84e20f0 --- /dev/null +++ b/ciderpress/pyscf/numint.py @@ -0,0 +1,1129 @@ +import numpy as np +from ase.utils.timing import Timer +from pyscf import lib +from pyscf.dft import numint +from pyscf.dft.gen_grid import ALIGNMENT_UNIT, BLKSIZE, NBINS +from pyscf.dft.numint import _dot_ao_ao_sparse, _scale_ao_sparse, eval_ao + +import ciderpress.pyscf.frac_lapl as nlof +from ciderpress.dft.plans import FracLaplPlan, SemilocalPlan +from ciderpress.dft.settings import FeatureSettings +from ciderpress.pyscf.nldf_convolutions import PyscfNLDFGenerator +from ciderpress.pyscf.sdmx import EXXSphGenerator + +DEFAULT_RHOCUT = 1e-9 + + +def _tau_dot_sparse( + bra, 
ket, wv, nbins, screen_index, pair_mask, ao_loc, out=None, aow=None +): + """Similar to _tau_dot, while sparsity is explicitly considered. Note the + return may have ~1e-13 difference to _tau_dot. + """ + nao = bra.shape[1] + if out is None: + out = np.zeros((nao, nao), dtype=bra.dtype) + hermi = 1 + aow = _scale_ao_sparse(ket[1], wv, screen_index, ao_loc, out=aow) + _dot_ao_ao_sparse( + bra[1], aow, None, nbins, screen_index, pair_mask, ao_loc, hermi, out + ) + aow = _scale_ao_sparse(ket[2], wv, screen_index, ao_loc, out=aow) + _dot_ao_ao_sparse( + bra[2], aow, None, nbins, screen_index, pair_mask, ao_loc, hermi, out + ) + aow = _scale_ao_sparse(ket[3], wv, screen_index, ao_loc, out=aow) + _dot_ao_ao_sparse( + bra[3], aow, None, nbins, screen_index, pair_mask, ao_loc, hermi, out + ) + return out + + +def _get_sdmx_orbs(ni, mol, coords, ao): + ni.timer.start("sdmx ao") + if ni.has_sdmx and ni.sdmxgen.fast: + if ao is None: + sdmx_ao = eval_ao(mol, coords, deriv=0) + else: + sdmx_ao = ao[0] if ao.ndim == 3 else ao + sdmx_cao = ni.sdmxgen.get_cao(mol, coords, save_buf=True) + elif ni.has_sdmx: + sdmx_ao, sdmx_cao = ni.sdmxgen.get_orb_vals(mol, coords, save_buf=True) + else: + sdmx_ao, sdmx_cao = None, None + ni.timer.stop("sdmx ao") + return sdmx_ao, sdmx_cao + + +def nr_rks( + ni, mol, grids, xc_code, dms, relativity=0, hermi=1, max_memory=2000, verbose=None +): + """ + + Args: + ni (CiderNumInt): + mol (pyscf.gto.Mole): + grids (pyscf.dft.gen_grid.Grids): + xc_code: + dms: + relativity: + hermi: + max_memory: + verbose: + + Returns: + + """ + max_memory = 2000 + ni.timer.start("nr_rks") + if relativity != 0: + raise NotImplementedError + + xctype = "MGGA" + ni.initialize_feature_generators(mol, grids, 1) + + make_rho, nset, nao = ni._gen_rho_evaluator(mol, dms, hermi, False, grids) + ao_loc = mol.ao_loc_nr() + cutoff = grids.cutoff * 1e2 + nbins = NBINS * 2 - int(NBINS * np.log(cutoff) / np.log(grids.cutoff)) + + if dms.ndim == 2: + dms = dms[None, :, :] + + nelec = np.zeros(nset) + excsum = np.zeros(nset) + vmat = np.zeros((nset, nao, nao)) + + def block_loop(ao_deriv): + if ni.has_sdmx: + extra_ao = ni.sdmxgen.get_extra_ao(mol) + else: + extra_ao = 0 + for ao, mask, weight, coords in ni.block_loop( + mol, grids, nao, ao_deriv, max_memory=max_memory, extra_ao=extra_ao + ): + sdmx_ao, sdmx_cao = _get_sdmx_orbs(ni, mol, coords, ao) + for i in range(nset): + rho = make_rho(i, ao, mask, xctype) + ni.timer.start("sdmx fwd") + if ni.has_sdmx: + sdmx_feat = ni.sdmxgen.get_features( + dms[i], + mol, + coords, + ao=sdmx_ao, + cao=sdmx_cao, + ) + else: + sdmx_feat = None + ni.timer.stop("sdmx fwd") + ni.timer.start("xc cider") + exc, (vxc, vxc_nldf, vxc_sdmx) = ni.eval_xc_cider( + xc_code, rho, None, sdmx_feat, deriv=1, xctype=xctype + )[:2] + ni.timer.stop("xc cider") + assert vxc_nldf is None + ni.timer.start("sdmx bwd") + if ni.has_sdmx: + ni.sdmxgen.get_vxc_(vmat[i], vxc_sdmx[0] * weight) + ni.timer.stop("sdmx bwd") + den = rho[0] * weight + nelec[i] += den.sum() + excsum[i] += np.dot(den, exc) + wv = weight * vxc + yield i, ao, mask, wv + # if ni.has_sdmx: + # ni.sdmxgen.reset_buffers() + + buffers = None + pair_mask = mol.get_overlap_cond() < -np.log(ni.cutoff) + if any(x in xc_code.upper() for x in ("CC06", "CS", "BR89", "MK00")): + raise NotImplementedError("laplacian in meta-GGA method") + ao_deriv = 1 + v1 = np.zeros_like(vmat) + for i, ao, mask, wv in block_loop(ao_deriv): + vmats, buffers = ni.contract_wv( + ao, + wv, + nbins, + mask, + pair_mask, + ao_loc, + vmats=(vmat[i], v1[i]), + 
buffers=buffers, + ) + vmat = lib.hermi_sum(vmat, axes=(0, 2, 1)) + vmat += v1 + + if ni.has_sdmx: + ni.sdmxgen._cached_ao_data = None + + if nset == 1: + nelec = nelec[0] + excsum = excsum[0] + vmat = vmat[0] + if isinstance(dms, np.ndarray): + dtype = dms.dtype + else: + dtype = np.result_type(*dms) + if vmat.dtype != dtype: + vmat = np.asarray(vmat, dtype=dtype) + # if ni.has_sdmx: + # ni.sdmxgen.reset_buffers() + ni.timer.stop("nr_rks") + return nelec, excsum, vmat + + +def nr_uks( + ni, mol, grids, xc_code, dms, relativity=0, hermi=1, max_memory=2000, verbose=None +): + max_memory = 2000 + ni.timer.start("nr_uks") + if relativity != 0: + raise NotImplementedError + + xctype = "MGGA" + ni.initialize_feature_generators(mol, grids, 2) + + dma, dmb = numint._format_uks_dm(dms) + if dma.ndim == 2: + dma = dma[None, :, :] + dmb = dmb[None, :, :] + is_2d = True + else: + is_2d = False + nao = dma.shape[-1] + make_rhoa, nset = ni._gen_rho_evaluator(mol, dma, hermi, False, grids)[:2] + make_rhob = ni._gen_rho_evaluator(mol, dmb, hermi, False, grids)[0] + ao_loc = mol.ao_loc_nr() + cutoff = grids.cutoff * 1e2 + nbins = NBINS * 2 - int(NBINS * np.log(cutoff) / np.log(grids.cutoff)) + if ni.has_sdmx: + dm_for_sdmx = [np.stack([dma[i], dmb[i]]) for i in range(nset)] + else: + dm_for_sdmx = None + + nelec = np.zeros((2, nset)) + excsum = np.zeros(nset) + vmat = np.zeros((2, nset, nao, nao)) + + def block_loop(ao_deriv): + if ni.has_sdmx: + extra_ao = ni.sdmxgen.get_extra_ao(mol) + else: + extra_ao = 0 + for ao, mask, weight, coords in ni.block_loop( + mol, grids, nao, ao_deriv, max_memory=max_memory, extra_ao=extra_ao + ): + sdmx_ao, sdmx_cao = _get_sdmx_orbs(ni, mol, coords, ao) + for i in range(nset): + rho_a = make_rhoa(i, ao, mask, xctype) + rho_b = make_rhob(i, ao, mask, xctype) + rho = (rho_a, rho_b) + ni.timer.start("sdmx fwd") + if ni.has_sdmx: + sdmx_feat = ni.sdmxgen.get_features( + dm_for_sdmx[i], + mol, + coords, + ao=sdmx_ao, + cao=sdmx_cao, + ) + else: + sdmx_feat = None + ni.timer.stop("sdmx fwd") + ni.timer.start("xc cider") + exc, (vxc, vxc_nldf, vxc_sdmx) = ni.eval_xc_cider( + xc_code, rho, None, sdmx_feat, deriv=1, xctype=xctype + )[:2] + ni.timer.stop("xc cider") + assert vxc_nldf is None + ni.timer.start("sdmx bwd") + if ni.has_sdmx: + ni.sdmxgen.get_vxc_(vmat[:, i], vxc_sdmx * weight) + ni.timer.stop("sdmx bwd") + den_a = rho_a[0] * weight + den_b = rho_b[0] * weight + nelec[0, i] += den_a.sum() + nelec[1, i] += den_b.sum() + excsum[i] += np.dot(den_a, exc) + excsum[i] += np.dot(den_b, exc) + wv = weight * vxc + yield i, ao, mask, wv + + buffers = None + pair_mask = mol.get_overlap_cond() < -np.log(ni.cutoff) + if any(x in xc_code.upper() for x in ("CC06", "CS", "BR89", "MK00")): + raise NotImplementedError("laplacian in meta-GGA method") + ao_deriv = 1 + v1 = np.zeros_like(vmat) + for i, ao, mask, wv in block_loop(ao_deriv): + buffers = ni.contract_wv( + ao, + wv[0], + nbins, + mask, + pair_mask, + ao_loc, + vmats=(vmat[0, i], v1[0, i]), + buffers=buffers, + )[1] + buffers = ni.contract_wv( + ao, + wv[1], + nbins, + mask, + pair_mask, + ao_loc, + vmats=(vmat[1, i], v1[1, i]), + buffers=buffers, + )[1] + vmat = lib.hermi_sum(vmat.reshape(-1, nao, nao), axes=(0, 2, 1)).reshape( + 2, nset, nao, nao + ) + vmat += v1 + + if ni.has_sdmx: + ni.sdmxgen._cached_ao_data = None + + if isinstance(dma, np.ndarray) and is_2d: + vmat = vmat[:, 0] + nelec = nelec.reshape(2) + excsum = excsum[0] + + dtype = np.result_type(dma, dmb) + if vmat.dtype != dtype: + vmat = np.asarray(vmat, 
dtype=dtype) + ni.timer.stop("nr_uks") + return nelec, excsum, vmat + + +def nr_rks_nldf( + ni, mol, grids, xc_code, dms, relativity=0, hermi=1, max_memory=2000, verbose=None +): + """ + + Args: + ni (NLDFNumInt): + mol (pyscf.gto.Mole): + grids (pyscf.dft.gen_grid.Grids): + xc_code (str): + dms (numpy.ndarray): + relativity: + hermi: + max_memory: + verbose: + + Returns: + + """ + if not hasattr(grids, "grids_indexer"): + raise ValueError("Grids object must have indexer for NLDF evaluation") + + if relativity != 0: + raise NotImplementedError + + xctype = "MGGA" + ni.initialize_feature_generators(mol, grids, 1) + assert ni.has_nldf + + make_rho, nset, nao = ni._gen_rho_evaluator(mol, dms, hermi, False, grids) + ao_loc = mol.ao_loc_nr() + cutoff = grids.cutoff * 1e2 + nbins = NBINS * 2 - int(NBINS * np.log(cutoff) / np.log(grids.cutoff)) + + if dms.ndim == 2: + dms = dms[None, :, :] + + nelec = np.zeros(nset) + excsum = np.zeros(nset) + vmat = np.zeros((nset, nao, nao)) + nrho = 5 + if not ni.settings.nlof_settings.is_empty: + nrho += ni.settings.nlof_settings.nrho + rho_full = np.zeros((nset, nrho, grids.weights.size), dtype=np.float64, order="C") + wv_full = np.zeros((nset, nrho, grids.weights.size), dtype=np.float64, order="C") + + num_nldf = ni.settings.nldf_settings.nfeat + vxc_nldf_full = np.zeros( + (nset, num_nldf, grids.weights.size), dtype=np.float64, order="C" + ) + + def block_loop(ao_deriv): + ip0 = 0 + for ao, mask, weight, coords in ni.block_loop( + mol, grids, nao, ao_deriv, max_memory=max_memory + ): + ip1 = ip0 + weight.size + for i in range(nset): + yield i, ip0, ip1, ao, mask, weight, coords + ip0 = ip1 + + if any(x in xc_code.upper() for x in ("CC06", "CS", "BR89", "MK00")): + raise NotImplementedError("laplacian in meta-GGA method") + ao_deriv = 1 + for i, ip0, ip1, ao, mask, weight, coords in block_loop(ao_deriv): + rho_full[i, :, ip0:ip1] = make_rho(i, ao, mask, xctype) + + par_atom = False + extra_ao = ni.nldfgen.get_extra_ao() + if ni.has_sdmx: + extra_ao += ni.sdmxgen.get_extra_ao(mol) + if par_atom: + raise NotImplementedError + else: + nldf_feat = [] + for idm in range(nset): + nldf_feat.append(ni.nldfgen.get_features(rho_full[idm])) + ip0 = 0 + for mask, weight, coords in ni.extra_block_loop( + mol, grids, max_memory=max_memory, extra_ao=extra_ao + ): + ip1 = ip0 + weight.size + sdmx_ao, sdmx_cao = _get_sdmx_orbs(ni, mol, coords, None) + for idm in range(nset): + rho = rho_full[idm, :, ip0:ip1] + if ni.has_sdmx: + sdmx_feat = ni.sdmxgen.get_features( + dms[idm], mol, coords, ao=sdmx_ao, cao=sdmx_cao + ) + else: + sdmx_feat = None + print((nldf_feat[idm] * weight).sum(axis=-1)) + exc, (vxc, vxc_nldf, vxc_sdmx) = ni.eval_xc_cider( + xc_code, + rho, + nldf_feat[idm][:, ip0:ip1], + sdmx_feat, + deriv=1, + xctype=xctype, + )[:2] + if ni.has_sdmx: + ni.sdmxgen.get_vxc_(vmat[idm], vxc_sdmx[0] * weight) + vxc_nldf_full[idm, :, ip0:ip1] = vxc_nldf * weight + den = rho[0] * weight + nelec[idm] += den.sum() + excsum[idm] += np.dot(den, exc) + wv_full[idm, :, ip0:ip1] = weight * vxc + ip0 = ip1 + for idm in range(nset): + wv_full[idm, :, :] += ni.nldfgen.get_potential(vxc_nldf_full[idm]) + + buffers = None + pair_mask = mol.get_overlap_cond() < -np.log(ni.cutoff) + v1 = np.zeros_like(vmat) + for i, ip0, ip1, ao, mask, weight, coords in block_loop(ao_deriv): + wv = wv_full[i, :, ip0:ip1] + vmats, buffers = ni.contract_wv( + ao, + wv, + nbins, + mask, + pair_mask, + ao_loc, + vmats=(vmat[i], v1[i]), + buffers=buffers, + ) + vmat = lib.hermi_sum(vmat, axes=(0, 2, 1)) + vmat 
+= v1 + + if nset == 1: + nelec = nelec[0] + excsum = excsum[0] + vmat = vmat[0] + if isinstance(dms, np.ndarray): + dtype = dms.dtype + else: + dtype = np.result_type(*dms) + if vmat.dtype != dtype: + vmat = np.asarray(vmat, dtype=dtype) + return nelec, excsum, vmat + + +def nr_uks_nldf( + ni, mol, grids, xc_code, dms, relativity=0, hermi=1, max_memory=2000, verbose=None +): + max_memory = 2000 + ni.timer.start("nr_uks") + if relativity != 0: + raise NotImplementedError + + xctype = "MGGA" + ni.initialize_feature_generators(mol, grids, 2) + + dma, dmb = numint._format_uks_dm(dms) + if dma.ndim == 2: + dma = dma[None, :, :] + dmb = dmb[None, :, :] + is_2d = True + else: + is_2d = False + nao = dma.shape[-1] + make_rhoa, nset = ni._gen_rho_evaluator(mol, dma, hermi, False, grids)[:2] + make_rhob = ni._gen_rho_evaluator(mol, dmb, hermi, False, grids)[0] + ao_loc = mol.ao_loc_nr() + cutoff = grids.cutoff * 1e2 + nbins = NBINS * 2 - int(NBINS * np.log(cutoff) / np.log(grids.cutoff)) + if ni.has_sdmx: + dm_for_sdmx = [np.stack([dma[i], dmb[i]]) for i in range(nset)] + else: + dm_for_sdmx = None + + nelec = np.zeros((2, nset)) + excsum = np.zeros(nset) + vmat = np.zeros((2, nset, nao, nao)) + nrho = 5 + if not ni.settings.nlof_settings.is_empty: + nrho += ni.settings.nlof_settings.nrho + rhoa_full = np.zeros((nset, nrho, grids.weights.size), dtype=np.float64, order="C") + rhob_full = np.zeros((nset, nrho, grids.weights.size), dtype=np.float64, order="C") + wva_full = np.zeros((nset, nrho, grids.weights.size), dtype=np.float64, order="C") + wvb_full = np.zeros((nset, nrho, grids.weights.size), dtype=np.float64, order="C") + + num_nldf = ni.settings.nldf_settings.nfeat + vxc_nldf_full = np.zeros( + (nset, 2, num_nldf, grids.weights.size), dtype=np.float64, order="C" + ) + + def block_loop(ao_deriv): + ip0 = 0 + for ao, mask, weight, coords in ni.block_loop( + mol, grids, nao, ao_deriv, max_memory=max_memory + ): + ip1 = ip0 + weight.size + for i in range(nset): + yield i, ip0, ip1, ao, mask, weight, coords + ip0 = ip1 + + if any(x in xc_code.upper() for x in ("CC06", "CS", "BR89", "MK00")): + raise NotImplementedError("laplacian in meta-GGA method") + ao_deriv = 1 + for i, ip0, ip1, ao, mask, weight, coords in block_loop(ao_deriv): + rhoa_full[i, :, ip0:ip1] = make_rhoa(i, ao, mask, xctype) + rhob_full[i, :, ip0:ip1] = make_rhob(i, ao, mask, xctype) + + par_atom = False + extra_ao = ni.nldfgen.get_extra_ao() + if ni.has_sdmx: + extra_ao += ni.sdmxgen.get_extra_ao(mol) + if par_atom: + raise NotImplementedError + else: + nldf_feat = [] + for idm in range(nset): + nldf_feat.append( + np.stack( + [ + ni.nldfgen.get_features(rhoa_full[idm]), + ni.nldfgen.get_features(rhob_full[idm]), + ] + ) + ) + ip0 = 0 + for mask, weight, coords in ni.extra_block_loop( + mol, grids, max_memory=max_memory, extra_ao=extra_ao + ): + ip1 = ip0 + weight.size + sdmx_ao, sdmx_cao = _get_sdmx_orbs(ni, mol, coords, None) + for idm in range(nset): + rho_a = rhoa_full[idm, :, ip0:ip1] + rho_b = rhob_full[idm, :, ip0:ip1] + rho = (rho_a, rho_b) + if ni.has_sdmx: + sdmx_feat = ni.sdmxgen.get_features( + dm_for_sdmx[i], + mol, + coords, + ao=sdmx_ao, + cao=sdmx_cao, + ) + else: + sdmx_feat = None + exc, (vxc, vxc_nldf, vxc_sdmx) = ni.eval_xc_cider( + xc_code, + rho, + nldf_feat[idm][..., ip0:ip1], + sdmx_feat, + deriv=1, + xctype=xctype, + )[:2] + if ni.has_sdmx: + ni.sdmxgen.get_vxc_(vmat[:, i], vxc_sdmx * weight) + vxc_nldf_full[idm, ..., ip0:ip1] = vxc_nldf * weight + den_a = rho_a[0] * weight + den_b = rho_b[0] * weight + 
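+                # accumulate the per-spin electron counts and the XC energy
+                # contribution from this block of grid points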
nelec[0, i] += den_a.sum() + nelec[1, i] += den_b.sum() + excsum[i] += np.dot(den_a, exc) + excsum[i] += np.dot(den_b, exc) + wva_full[idm, :, ip0:ip1] = weight * vxc[0] + wvb_full[idm, :, ip0:ip1] = weight * vxc[1] + ip0 = ip1 + for idm in range(nset): + wva_full[idm, :, :] += ni.nldfgen.get_potential(vxc_nldf_full[idm, 0]) + wvb_full[idm, :, :] += ni.nldfgen.get_potential(vxc_nldf_full[idm, 1]) + + buffers = None + pair_mask = mol.get_overlap_cond() < -np.log(ni.cutoff) + if any(x in xc_code.upper() for x in ("CC06", "CS", "BR89", "MK00")): + raise NotImplementedError("laplacian in meta-GGA method") + ao_deriv = 1 + v1 = np.zeros_like(vmat) + for i, ip0, ip1, ao, mask, weight, coords in block_loop(ao_deriv): + wva = wva_full[i, :, ip0:ip1] + wvb = wvb_full[i, :, ip0:ip1] + buffers = ni.contract_wv( + ao, + wva, + nbins, + mask, + pair_mask, + ao_loc, + vmats=(vmat[0, i], v1[0, i]), + buffers=buffers, + )[1] + buffers = ni.contract_wv( + ao, + wvb, + nbins, + mask, + pair_mask, + ao_loc, + vmats=(vmat[1, i], v1[1, i]), + buffers=buffers, + )[1] + + vmat = lib.hermi_sum(vmat.reshape(-1, nao, nao), axes=(0, 2, 1)).reshape( + 2, nset, nao, nao + ) + vmat += v1 + + if ni.has_sdmx: + ni.sdmxgen._cached_ao_data = None + + if isinstance(dma, np.ndarray) and is_2d: + vmat = vmat[:, 0] + nelec = nelec.reshape(2) + excsum = excsum[0] + + dtype = np.result_type(dma, dmb) + if vmat.dtype != dtype: + vmat = np.asarray(vmat, dtype=dtype) + ni.timer.stop("nr_uks") + return nelec, excsum, vmat + + +class CiderNumIntMixin: + + sl_plan: SemilocalPlan + fl_plan: FracLaplPlan + sdmxgen: EXXSphGenerator + nldfgen: PyscfNLDFGenerator + settings: FeatureSettings + xmix: float + rhocut: float + + def nlc_coeff(self, xc_code): + if self._nlc_coeff is not None: + return self._nlc_coeff + else: + return super().nlc_coeff(xc_code) + + @property + def settings(self): + return self.mlxc.settings + + @property + def has_sdmx(self): + return self.settings.has_sdmx + + @property + def has_nldf(self): + return self.settings.has_nldf + + def build(self, mol=None): + self.mol = mol + self.timer = Timer() + self.sl_plan = None + self.fl_plan = None + self.sdmxgen = None + self.nldfgen = None + + def reset(self, mol=None): + self.mol = mol + self.sl_plan = None + self.fl_plan = None + self.sdmxgen = None + self.nldfgen = None + + def initialize_feature_generators(self, mol, grids, nspin): + self.sl_plan = SemilocalPlan(self.settings.sl_settings, nspin) + self.fl_plan = FracLaplPlan(self.settings.nlof_settings, nspin) + cond = self.sdmxgen is None + cond = cond or self.mol != mol + cond = cond or self.sdmxgen.plan.nspin != nspin + cond = cond and self.settings.has_sdmx + if cond: + self.sdmxgen = self.sdmx_init.initialize_sdmx_generator(mol, nspin) + self.mol = mol + + def eval_xc_cider( + self, + xc_code, + rho, + nldf_feat, + sdmx_feat, + deriv=1, + omega=None, + xctype=None, + verbose=None, + ): + rho = np.asarray(rho, order="C", dtype=np.double) + if deriv > 1: + raise NotImplementedError + if rho.ndim == 2: + closed = True + rho = rho[None, :] + else: + closed = False + nfeat = self.settings.nfeat + ngrids = rho.shape[-1] + nspin = rho.shape[0] + vxc = np.zeros_like(rho) + xctype = self._xc_type(xc_code) + if xctype == "LDA": + nvar = 1 + elif xctype == "GGA": + nvar = 4 + elif xctype == "MGGA": + nvar = 5 + else: + exc = np.zeros(ngrids) + if xctype in ["LDA", "GGA", "MGGA"]: + if nspin == 1: + rhotmp = np.ascontiguousarray(rho[0, :nvar]) + exc, vxc[0, :nvar] = self.eval_xc_eff( + xc_code, + rhotmp, + deriv=deriv, + 
omega=omega, + xctype=xctype, + verbose=verbose, + )[:2] + else: + exc, vxc[:, :nvar] = self.eval_xc_eff( + xc_code, + rho[:, :nvar], + deriv=deriv, + omega=omega, + xctype=xctype, + verbose=verbose, + )[:2] + + start = 0 + X0T = np.empty((nspin, nfeat, ngrids)) + has_sl = not self.settings.sl_settings.is_empty + has_nldf = not self.settings.nldf_settings.is_empty + has_nlof = not self.settings.nlof_settings.is_empty + has_sdmx = not self.settings.sadm_settings.is_empty + if has_sl: + nfeat_tmp = self.settings.sl_settings.nfeat + X0T[:, start : start + nfeat_tmp] = self.sl_plan.get_feat(rho[:, :5]) + start += nfeat_tmp + if has_nldf: + if nldf_feat.ndim == 2: + nldf_feat = nldf_feat[None, :] + assert nldf_feat.shape[0] == nspin + assert nldf_feat.ndim == 3 + nfeat_tmp = self.settings.nldf_settings.nfeat + X0T[:, start : start + nfeat_tmp] = nldf_feat + start += nfeat_tmp + if has_nlof: + nfeat_tmp = self.settings.nlof_settings.nfeat + X0T[:, start : start + nfeat_tmp] = self.fl_plan.get_feat(rho) + start += nfeat_tmp + if has_sdmx: + if sdmx_feat.ndim == 2: + sdmx_feat = sdmx_feat[None, :] + assert sdmx_feat.shape[0] == nspin + assert sdmx_feat.ndim == 3 + nfeat_tmp = self.settings.sadm_settings.nfeat + X0T[:, start : start + nfeat_tmp] = sdmx_feat + start += nfeat_tmp + if start != nfeat: + raise RuntimeError("nfeat mismatch, this should not happen!") + + X0TN = self.settings.normalizers.get_normalized_feature_vector(X0T) + exc_ml, dexcdX0TN_ml = self.mlxc(X0TN, rhocut=self.rhocut) + xmix = self.xmix # / rho.shape[0] + exc_ml *= xmix + dexcdX0TN_ml *= xmix + vxc_ml = self.settings.normalizers.get_derivative_wrt_unnormed_features( + X0T, dexcdX0TN_ml + ) + exc[:] += exc_ml / (rho[:, 0].sum(axis=0) + 1e-16) + + start = 0 + if has_sl: + nfeat_tmp = self.settings.sl_settings.nfeat + self.sl_plan.get_vxc( + rho[:, :5], vxc_ml[:, start : start + nfeat_tmp], vxc=vxc[:, :5] + ) + start += nfeat_tmp + if has_nldf: + nfeat_tmp = self.settings.nldf_settings.nfeat + vxc_nldf = vxc_ml[:, start : start + nfeat_tmp] + start += nfeat_tmp + else: + vxc_nldf = None + if has_nlof: + nfeat_tmp = self.settings.nlof_settings.nfeat + self.fl_plan.get_vxc(vxc_ml[:, start : start + nfeat_tmp], vxc=vxc) + start += nfeat_tmp + if has_sdmx: + nfeat_tmp = self.settings.sadm_settings.nfeat + vxc_sdmx = vxc_ml[:, start : start + nfeat_tmp] + start += nfeat_tmp + else: + vxc_sdmx = None + if start != nfeat: + raise RuntimeError("nfeat mismatch, this should not happen!") + + if closed: + vxc = vxc[0] + + return exc, (vxc, vxc_nldf, vxc_sdmx), None, None + + +class CiderNumInt(CiderNumIntMixin, numint.NumInt): + def __init__( + self, mlxc, nldf_init, sdmx_init, xmix=1.0, rhocut=None, nlc_coeff=None + ): + """ + + Args: + mlxc (MappedXC): Model for XC energy + nldf_init (PySCFNLDFInitializer) + sdmx_init (PySCFSDMXInitializer) + xmix (float): Mixing fraction of ML functional + rhocut (float): Low density cutoff for numerical stability + """ + self.mlxc = mlxc + self.xmix = xmix + self.rhocut = DEFAULT_RHOCUT if rhocut is None else rhocut + self.mol = None + self.nldf_init = nldf_init + self.sdmx_init = sdmx_init + self.sdmxgen = None + self.nldfgen = None + self._nlc_coeff = nlc_coeff + super(CiderNumInt, self).__init__() + + def method_not_implemented(self, *args, **kwargs): + raise NotImplementedError + + nr_rks = nr_rks + nr_uks = nr_uks + + nr_rks_fxc = method_not_implemented + nr_uks_fxc = method_not_implemented + nr_rks_fxc_st = method_not_implemented + cache_xc_kernel = method_not_implemented + cache_xc_kernel1 = 
method_not_implemented + + def contract_wv( + self, + ao, + wv, + nbins, + mask, + pair_mask, + ao_loc, + buffers=None, + vmats=None, + ): + if vmats is None: + vmats = (None, None) + if not wv.flags.c_contiguous: + wv = np.ascontiguousarray(wv) + vmat, v1 = vmats + aow = buffers + wv[0] *= 0.5 + wv[4] *= 0.5 + aow = _scale_ao_sparse(ao[:4], wv[:4], mask, ao_loc, out=aow) + vmat = _dot_ao_ao_sparse( + ao[0], aow, None, nbins, mask, pair_mask, ao_loc, hermi=0, out=vmat + ) + v1 = _tau_dot_sparse( + ao, ao, wv[4], nbins, mask, pair_mask, ao_loc, out=v1, aow=aow + ) + return (vmat, v1), aow + + def block_loop( + self, + mol, + grids, + nao=None, + deriv=0, + max_memory=2000, + non0tab=None, + blksize=None, + buf=None, + extra_ao=0, + ): + """Define this macro to loop over grids by blocks. + Same as pyscf block_loop except extra_ao allows accounting for + extra ao terms in blksize calculation. + """ + if grids.coords is None: + grids.build(with_non0tab=True) + if nao is None: + nao = mol.nao + ngrids = grids.coords.shape[0] + comp = (deriv + 1) * (deriv + 2) * (deriv + 3) // 6 + # NOTE to index grids.non0tab, the blksize needs to be an integer + # multiplier of BLKSIZE + if blksize is None: + blksize = int( + max_memory * 1e6 / (((comp + 1) * nao + extra_ao) * 8 * BLKSIZE) + ) + blksize = max(4, min(blksize, ngrids // BLKSIZE + 1, 1200)) * BLKSIZE + assert blksize % BLKSIZE == 0 + + if non0tab is None and mol is grids.mol: + non0tab = grids.non0tab + if non0tab is None: + non0tab = np.empty( + ((ngrids + BLKSIZE - 1) // BLKSIZE, mol.nbas), dtype=np.uint8 + ) + non0tab[:] = NBINS + 1 # Corresponding to AO value ~= 1 + screen_index = non0tab + + # the xxx_sparse() functions require ngrids 8-byte aligned + allow_sparse = ngrids % ALIGNMENT_UNIT == 0 and nao > numint.SWITCH_SIZE + + if buf is None: + buf = numint._empty_aligned(comp * blksize * nao) + for ip0, ip1 in lib.prange(0, ngrids, blksize): + coords = grids.coords[ip0:ip1] + weight = grids.weights[ip0:ip1] + mask = screen_index[ip0 // BLKSIZE :] + # TODO: pass grids.cutoff to eval_ao + ao = self.eval_ao( + mol, coords, deriv=deriv, non0tab=mask, cutoff=grids.cutoff, out=buf + ) + if not allow_sparse and not numint._sparse_enough(mask): + # Unset mask for dense AO tensor. It determines which eval_rho + # to be called in make_rho + mask = None + yield ao, mask, weight, coords + + +class _FLNumIntMixin: + + feat_plan = None + settings: FeatureSettings = None + + nr_rks = nr_rks + nr_uks = nr_uks + + nr_nlc_vxc = numint.nr_nlc_vxc + nr_sap = nr_sap_vxc = numint.nr_sap_vxc + + make_mask = staticmethod(nlof.make_mask) + eval_ao = staticmethod(numint.eval_ao) + eval_kao = staticmethod(nlof.eval_kao) + eval_rho = staticmethod(nlof.eval_rho) + eval_rho1 = lib.module_method(nlof.eval_rho1, absences=["cutoff"]) + eval_rho2 = staticmethod(nlof.eval_rho2) + get_rho = numint.get_rho + + def block_loop( + ni, + mol, + grids, + nao=None, + deriv=0, + max_memory=2000, + non0tab=None, + blksize=None, + buf=None, + extra_ao=None, + ): + """Define this macro to loop over grids by blocks. 
Expands on + CiderNumInt.block_loop by also returning the fractional laplacian- + like operators on the orbitals, as set up by the FracLaplSettings + """ + if grids.coords is None: + grids.build(with_non0tab=True) + if nao is None: + nao = mol.nao + ngrids = grids.coords.shape[0] + comp = (deriv + 1) * (deriv + 2) * (deriv + 3) // 6 + flsettings = ni.settings.nlof_settings + kcomp = flsettings.npow + 3 * flsettings.nd1 + # NOTE to index grids.non0tab, the blksize needs to be an integer + # multiplier of BLKSIZE + if blksize is None: + blksize = int( + max_memory * 1e6 / (((comp + kcomp + 1) * nao + extra_ao) * 8 * BLKSIZE) + ) + blksize = max(4, min(blksize, ngrids // BLKSIZE + 1, 1200)) * BLKSIZE + assert blksize % BLKSIZE == 0 + + if non0tab is None and mol is grids.mol: + non0tab = grids.non0tab + if non0tab is None: + non0tab = np.empty( + ((ngrids + BLKSIZE - 1) // BLKSIZE, mol.nbas), dtype=np.uint8 + ) + non0tab[:] = NBINS + 1 # Corresponding to AO value ~= 1 + screen_index = non0tab + + # the xxx_sparse() functions require ngrids 8-byte aligned + allow_sparse = ngrids % ALIGNMENT_UNIT == 0 + + if buf is None: + buf = nlof._empty_aligned((comp + kcomp) * blksize * nao) + for ip0, ip1 in lib.prange(0, ngrids, blksize): + coords = grids.coords[ip0:ip1] + weight = grids.weights[ip0:ip1] + mask = screen_index[ip0 // BLKSIZE :] + # TODO: pass grids.cutoff to eval_ao + ao_buf = np.ndarray((comp + kcomp, weight.size * nao), buffer=buf) + ao = ni.eval_ao( + mol, coords, deriv=deriv, non0tab=mask, cutoff=grids.cutoff, out=ao_buf + ) + kao = ni.eval_kao( + flsettings.slist, + mol, + coords, + deriv=0, + non0tab=None, + cutoff=grids.cutoff, + out=ao_buf[comp:], + n1=flsettings.nd1, + ) + if not allow_sparse and not nlof._sparse_enough(mask): + # Unset mask for dense AO tensor. 
It determines which eval_rho + # to be called in make_rho + mask = None + yield (ao, kao), mask, weight, coords + + def contract_wv( + self, + ao, + wv, + nbins, + mask, + pair_mask, + ao_loc, + buffers=None, + vmats=None, + ): + if buffers is None: + buffers = (None, None) + if vmats is None: + vmats = (None, None) + aow1, aow2, vmat, v1 = nlof._odp_dot_sparse_( + ao, + wv, + nbins, + mask, + pair_mask, + ao_loc, + self.settings.nlof_settings, + aow1=buffers[0], + aow2=buffers[1], + vmat=vmats[0], + v1=vmats[1], + ) + return (vmat, v1), (aow1, aow2) + + _gen_rho_evaluator = nlof.FLNumInt._gen_rho_evaluator + + +class _NLDFMixin: + + nr_rks = nr_rks_nldf + nr_uks = nr_uks_nldf + + def extra_block_loop( + self, + mol, + grids, + max_memory=2000, + non0tab=None, + blksize=None, + make_mask=False, + extra_ao=None, + ): + if grids.coords is None: + grids.build(with_non0tab=True) + ngrids = grids.coords.shape[0] + # NOTE to index grids.non0tab, the blksize needs to be an integer + # multiplier of BLKSIZE + if blksize is None: + blksize = int(max_memory * 1e6 / ((1 + extra_ao) * 8 * BLKSIZE)) + blksize = max(4, min(blksize, ngrids // BLKSIZE + 1, 1200)) * BLKSIZE + assert blksize % BLKSIZE == 0 + + if make_mask: + if non0tab is None and mol is grids.mol: + non0tab = grids.non0tab + if non0tab is None: + non0tab = np.empty( + ((ngrids + BLKSIZE - 1) // BLKSIZE, mol.nbas), dtype=np.uint8 + ) + non0tab[:] = NBINS + 1 # Corresponding to AO value ~= 1 + screen_index = non0tab + else: + screen_index = None + + for ip0, ip1 in lib.prange(0, ngrids, blksize): + coords = grids.coords[ip0:ip1] + weight = grids.weights[ip0:ip1] + if make_mask: + mask = screen_index[ip0 // BLKSIZE :] + else: + mask = None + yield mask, weight, coords + + +class NLOFNumInt(_FLNumIntMixin, CiderNumInt): + pass + + +class NLDFNumInt(_NLDFMixin, CiderNumInt): + + grids = None + + def initialize_feature_generators(self, mol, grids, nspin): + cond = self.nldfgen is None + cond = cond or self.grids != grids + cond = cond or self.mol != mol + cond = cond or self.nldfgen.plan.nspin != nspin + if cond: + self.nldfgen = self.nldf_init.initialize_nldf_generator( + mol, grids.grids_indexer, nspin + ) + self.nldfgen.interpolator.set_coords(grids.coords) + super().initialize_feature_generators(mol, grids, nspin) + self.grids = grids + + +class NLDFNLOFNumInt(_NLDFMixin, _FLNumIntMixin, CiderNumInt): + + grids = None + + def initialize_feature_generators(self, mol, grids, nspin): + cond = self.nldfgen is None + cond = cond or self.grids != grids + cond = cond or self.mol != mol + cond = cond or self.nldfgen.plan.nspin != nspin + if cond: + self.nldfgen = self.nldf_init.initialize_nldf_generator( + mol, grids.grids_indexer, nspin + ) + super().initialize_feature_generators(mol, grids, nspin) + self.grids = grids diff --git a/ciderpress/pyscf/sdmx.py b/ciderpress/pyscf/sdmx.py new file mode 100644 index 0000000..fb485de --- /dev/null +++ b/ciderpress/pyscf/sdmx.py @@ -0,0 +1,590 @@ +import ctypes +import time + +import numpy as np +from pyscf import lib +from pyscf.dft.numint import _dot_ao_ao, _dot_ao_dm, _scale_ao, eval_ao +from pyscf.gto.eval_gto import BLKSIZE, _get_intor_and_comp, make_screen_index +from pyscf.gto.mole import ANG_OF, ATOM_OF, PTR_EXP + +from ciderpress.dft.plans import SADMPlan, SDMXFullPlan, SDMXIntPlan, SDMXPlan +from ciderpress.dft.settings import SADMSettings, SDMXFullSettings, SDMXSettings +from ciderpress.dft.sph_harm_coeff import get_deriv_ylm_coeff +from ciderpress.lib import load_library as load_cider_library + 
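+# The compiled CIDER extension library; the SDMX evaluation kernels used
+# below (SDMXeval_rad_loop, SDMXylm_loop, SDMXcontract_ao_to_bas, etc.)
+# are resolved from it as attributes.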
+libcider = load_cider_library("libmcider") + + +def _get_ylm_atom_loc(mol): + ylm_atom_loc = np.zeros(mol.natm + 1, dtype=np.int32) + for ia in range(mol.natm): + lmax = np.max(mol._bas[mol._bas[:, ATOM_OF] == ia, ANG_OF]) + 1 + ylm_atom_loc[ia + 1] = ylm_atom_loc[ia] + lmax * lmax + return ylm_atom_loc + + +def eval_conv_shells( + feval, + plan, + mol, + coords, + shls_slice=None, + non0tab=None, + ao_loc=None, + cutoff=None, + out=None, +): + if isinstance(plan, tuple): + alphas, alpha_norms, itype = plan + else: + alphas = plan.alphas + alpha_norms = plan.alpha_norms + itype = plan.settings.integral_type + if non0tab is not None: + if (non0tab == 0).any(): + # TODO implement some sort of screening later + raise NotImplementedError + eval_name, comp = _get_intor_and_comp(mol, feval) + if comp not in [1, 4]: + raise NotImplementedError + comp = len(alphas) + if eval_name == "GTOval_sph_deriv0": + ncpa = 1 + if itype == "gauss_diff": + contract_fn = getattr(libcider, "SDMXcontract_smooth0") + elif itype == "gauss_r2": + contract_fn = getattr(libcider, "SDMXcontract_rsq0") + else: + raise ValueError + eval_fn = getattr(libcider, "SDMXrad_eval_grid") + elif eval_name == "GTOval_sph_deriv1": + ncpa = 2 + if itype == "gauss_diff": + contract_fn = getattr(libcider, "SDMXcontract_smooth1") + elif itype == "gauss_r2": + contract_fn = getattr(libcider, "SDMXcontract_rsq1") + else: + raise ValueError + eval_fn = getattr(libcider, "SDMXrad_eval_grid_deriv1") + else: + raise NotImplementedError + comp *= ncpa + drv = getattr(libcider, "SDMXeval_rad_loop") + atm = np.asarray(mol._atm, dtype=np.int32, order="C") + bas = np.asarray(mol._bas, dtype=np.int32, order="C") + env = np.asarray(mol._env, dtype=np.double, order="C") + coords = np.asarray(coords, dtype=np.double, order="F") + natm = atm.shape[0] + nbas = bas.shape[0] + ngrids = coords.shape[0] + + if shls_slice is None: + shls_slice = (0, nbas) + if "spinor" in eval_name: + raise NotImplementedError + # ao = np.ndarray((2, comp, nao, ngrids), dtype=np.complex128, + # buffer=out).transpose(0, 1, 3, 2) + else: + ao = np.ndarray((comp, nbas, ngrids), buffer=out).transpose(0, 2, 1) + + if non0tab is None: + if cutoff is None: + non0tab = np.ones(((ngrids + BLKSIZE - 1) // BLKSIZE, nbas), dtype=np.uint8) + else: + non0tab = make_screen_index(mol, coords, shls_slice, cutoff) + time.monotonic() + drv( + eval_fn, + contract_fn, + ctypes.c_double(1), + ctypes.c_int(ngrids), + (ctypes.c_int * 2)(1, ncpa), + (ctypes.c_int * 2)(*shls_slice), + ao.ctypes.data_as(ctypes.c_void_p), + coords.ctypes.data_as(ctypes.c_void_p), + non0tab.ctypes.data_as(ctypes.c_void_p), + atm.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(natm), + bas.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbas), + env.ctypes.data_as(ctypes.c_void_p), + alphas.ctypes.data_as(ctypes.c_void_p), + alpha_norms.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(alphas.size), + ) + time.monotonic() + # print("Get shells", t1 - t0) + if comp == 1: + if "spinor" in eval_name: + ao = ao[:, 0] + else: + ao = ao[0] + return ao + + +def eval_conv_sh( + plan, + mol, + coords, + deriv=0, + shls_slice=None, + non0tab=None, + cutoff=None, + out=None, + verbose=None, +): + assert deriv in [0, 1] + if mol.cart: + feval = "GTOval_cart_deriv%d" % deriv + else: + feval = "GTOval_sph_deriv%d" % deriv + return eval_conv_shells( + feval, plan, mol, coords, shls_slice, non0tab, cutoff=cutoff, out=out + ) + + +class PySCFSDMXInitializer: + def __init__(self, settings, **kwargs): + self.settings = settings + 
self.kwargs = kwargs + + def initialize_sdmx_generator(self, mol, nspin): + return EXXSphGenerator.from_settings_and_mol( + self.settings, nspin, mol, **self.kwargs + ) + + +class EXXSphGenerator: + fast = True + + def __init__(self, plan): + if plan.fit_metric != "ovlp": + raise NotImplementedError + self.plan = plan + self._ao_buf = None + self._cao_buf = None + self._ylm_buf = None + self._shl_c0 = None + self._cached_ao_data = None + + @property + def has_l1(self): + return self.plan.settings.n1terms > 0 + + def reset_buffers(self): + self._ao_buf = None + self._cao_buf = None + self._ylm_buf = None + self._shl_c0 = None + self._cached_ao_data = None + + @classmethod + def from_settings_and_mol( + cls, settings, nspin, mol, alpha0=None, lambd=None, nalpha=None, lowmem=False + ): + if alpha0 is None: + min_exp = np.min(mol._env[mol._bas[:, PTR_EXP]]) + max_width = 1.0 / np.sqrt(2 * min_exp) + coords = mol.atom_coords(unit="Bohr") + dist = np.linalg.norm(coords[:, None, :] - coords[None, :, :], axis=2) + max_dist = np.max(dist) + max_wpd = 4 * max_dist + 4 * max_width + # max_wpd = 2 * max_dist + 2 * max_width + # max_wpd *= 2 + alpha0 = 1.0 / (2 * max_wpd**2) + if lambd is None: + lambd = 1.8 + if nalpha is None: + max_exp = np.max(mol._env[mol._bas[:, PTR_EXP]]) + # max_exp = min(max_exp, 300) + nalpha = 1 + int(1 + np.log(max_exp / alpha0) / np.log(lambd)) + if isinstance(settings, SDMXFullSettings): + plan = SDMXFullPlan(settings, nspin, alpha0, lambd, nalpha) + # plan = SDMXIntPlan(settings, nspin, alpha0, lambd, nalpha) + elif isinstance(settings, SDMXSettings): + plan = SDMXPlan(settings, nspin, alpha0, lambd, nalpha) + elif isinstance(settings, SADMSettings): + plan = SADMPlan(settings, nspin, alpha0, lambd, nalpha, fit_metric="ovlp") + else: + raise ValueError("Invalid type {}".format(type(settings))) + return cls(plan) + + def __call__(self, *args, **kwargs): + """ + Evaluate the features. See get_features. 
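+        Calling the generator object is equivalent to calling get_features
+        with the same arguments; see get_features for the full argument
+        list and the shape of the returned feature array.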
+ """ + return self.get_features(*args, **kwargs) + + def _get_fit_mats(self): + if self.plan.has_coul_list: + fit_mats = self.plan.fit_matrices + else: + fit_mats = [self.plan.fit_matrix] + return fit_mats + + @property + def deriv(self): + return 1 if self.has_l1 else 0 + + def get_extra_ao(self, mol): + ymem = _get_ylm_atom_loc(mol)[-1] + cmem = (1 + 6 * self.deriv) * mol.nbas + bmem = (1 + self.deriv) * mol.nbas * self.plan.nalpha + return ymem + cmem + bmem + + def get_ao(self, mol, coords, non0tab=None, cutoff=None, save_buf=True): + # TODO use buffers + return eval_ao(mol, coords, deriv=0, non0tab=non0tab, cutoff=cutoff, out=None) + + def get_cao(self, mol, coords, non0tab=None, cutoff=None, save_buf=True): + caobuf_size = mol.nbas * coords.shape[0] * self.plan.nalpha + if self.deriv > 0: + caobuf_size *= 2 + if self._cao_buf is not None and self._cao_buf.size >= caobuf_size: + buf = self._cao_buf + else: + buf = np.empty(caobuf_size) + res = eval_conv_sh( + self.plan, + mol, + coords, + deriv=self.deriv, + non0tab=non0tab, + cutoff=cutoff, + out=buf, + ) + if save_buf: + self._cao_buf = buf + return res + + def _contract_ao_to_bas_single_( + self, + mol, + b0, + ylm, + c0, + shls_slice, + ao_loc, + ylm_atom_loc, + coords=None, + atom_coords=None, + bwd=False, + ): + args = [ + ctypes.c_int(b0.shape[-1]), + b0.ctypes.data_as(ctypes.c_void_p), + ylm.ctypes.data_as(ctypes.c_void_p), + c0.ctypes.data_as(ctypes.c_void_p), + (ctypes.c_int * 2)(*shls_slice), + ao_loc.ctypes.data_as(ctypes.c_void_p), + ylm_atom_loc.ctypes.data_as(ctypes.c_void_p), + mol._atm.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(mol.natm), + mol._bas.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(mol.nbas), + mol._env.ctypes.data_as(ctypes.c_void_p), + ] + if coords is not None: + assert coords.ndim == 1 and coords.flags.c_contiguous + assert atom_coords.ndim == 1 and atom_coords.flags.c_contiguous + args.append(coords.ctypes.data_as(ctypes.c_void_p)) + args.append(atom_coords.ctypes.data_as(ctypes.c_void_p)) + if bwd: + libcider.SDMXcontract_ao_to_bas_grid_bwd(*args) + else: + libcider.SDMXcontract_ao_to_bas_grid(*args) + else: + if bwd: + libcider.SDMXcontract_ao_to_bas_bwd(*args) + else: + libcider.SDMXcontract_ao_to_bas(*args) + + def _get_ylm(self, mol, coords, ylm_atom_loc=None, savebuf=True): + ngrids = coords.shape[0] + ncpa = 1 + 3 * self.deriv + gaunt_lmax = np.max(mol._bas[:, ANG_OF]) + if ylm_atom_loc is None: + ylm_atom_loc = _get_ylm_atom_loc(mol) + ybufsize = ncpa * ylm_atom_loc[-1] * ngrids + if self._ylm_buf is not None and self._ylm_buf.size >= ybufsize: + buf = self._ylm_buf + else: + buf = np.empty(ybufsize) + ylm = np.ndarray((ncpa, ylm_atom_loc[-1], ngrids), buffer=buf, order="C") + atom_coords = np.ascontiguousarray(mol.atom_coords(unit="Bohr")) + libcider.SDMXylm_loop( + ctypes.c_int(coords.shape[0]), + ylm.ctypes.data_as(ctypes.c_void_p), + coords.ctypes.data_as(ctypes.c_void_p), + ylm_atom_loc.ctypes.data_as(ctypes.c_void_p), + atom_coords.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(mol.natm), + ) + if ncpa > 1: + gaunt_coeff = get_deriv_ylm_coeff(gaunt_lmax) + libcider.SDMXylm_grad( + ctypes.c_int(coords.shape[0]), + ylm.ctypes.data_as(ctypes.c_void_p), + gaunt_coeff.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(gaunt_coeff.shape[1]), + ylm_atom_loc.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(mol.natm), + ) + libcider.SDMXylm_yzx2xyz( + ctypes.c_int(coords.shape[0]), + ctypes.c_int(ncpa), + ylm.ctypes.data_as(ctypes.c_void_p), + ylm_atom_loc.ctypes.data_as(ctypes.c_void_p), + 
ctypes.c_int(mol.natm), + ) + if savebuf: + self._ylm_buf = buf + else: + self._ylm_buf = None + return ylm + + def _contract_ao_to_bas_helper( + self, mol, b0, c0, shls_slice, ao_loc, coords, ylm=None, bwd=False + ): + coords = np.asfortranarray(coords) + if ylm is None: + ylm = self._get_ylm(mol, coords) + ylm_atom_loc = _get_ylm_atom_loc(mol) + if self.deriv == 1: + atom_coords = np.asfortranarray(mol.atom_coords(unit="Bohr")) + args = [ + ctypes.c_int(b0.shape[-1]), + b0.ctypes.data_as(ctypes.c_void_p), + ylm.ctypes.data_as(ctypes.c_void_p), + c0.ctypes.data_as(ctypes.c_void_p), + (ctypes.c_int * 2)(*shls_slice), + ao_loc.ctypes.data_as(ctypes.c_void_p), + ylm_atom_loc.ctypes.data_as(ctypes.c_void_p), + mol._atm.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(mol.natm), + mol._bas.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(mol.nbas), + mol._env.ctypes.data_as(ctypes.c_void_p), + coords.ctypes.data_as(ctypes.c_void_p), + atom_coords.ctypes.data_as(ctypes.c_void_p), + ] + if bwd: + fn = libcider.SDMXcontract_ao_to_bas_l1_bwd + else: + fn = libcider.SDMXcontract_ao_to_bas_l1 + fn(*args) + else: + self._contract_ao_to_bas_single_( + mol, + b0[0], + ylm[0], + c0, + shls_slice, + ao_loc, + ylm_atom_loc, + coords=None, + bwd=bwd, + ) + """ + if self.deriv > 0: + atom_coords = np.asfortranarray(mol.atom_coords(unit='Bohr')) + self._contract_ao_to_bas_single_( + mol, b0[1], ylm[1], c0, shls_slice, ao_loc, + ylm_atom_loc, coords=None, bwd=bwd + ) + self._contract_ao_to_bas_single_( + mol, b0[2], ylm[2], c0, shls_slice, ao_loc, + ylm_atom_loc, coords=None, bwd=bwd + ) + self._contract_ao_to_bas_single_( + mol, b0[3], ylm[3], c0, shls_slice, ao_loc, + ylm_atom_loc, coords=None, bwd=bwd + ) + self._contract_ao_to_bas_single_( + mol, b0[4], ylm[0], c0, shls_slice, ao_loc, + ylm_atom_loc, coords=coords[:, 0], + atom_coords=atom_coords[:, 0], bwd=bwd + ) + self._contract_ao_to_bas_single_( + mol, b0[5], ylm[0], c0, shls_slice, ao_loc, + ylm_atom_loc, coords=coords[:, 1], + atom_coords=atom_coords[:, 1], bwd=bwd + ) + self._contract_ao_to_bas_single_( + mol, b0[6], ylm[0], c0, shls_slice, ao_loc, + ylm_atom_loc, coords=coords[:, 2], + atom_coords=atom_coords[:, 2], bwd=bwd + ) + """ + + def _contract_ao_to_bas(self, mol, c0, shls_slice, ao_loc, coords, ylm=None): + ngrids = coords.shape[0] + b0 = np.empty((1 + 6 * self.deriv, mol.nbas, ngrids)) + self._contract_ao_to_bas_helper( + mol, + b0, + c0, + shls_slice, + ao_loc, + coords, + ylm=ylm, + bwd=False, + ) + return b0 + + def _contract_ao_to_bas_bwd(self, mol, b0, shls_slice, ao_loc, coords, ylm=None): + ngrids = coords.shape[0] + c0 = np.zeros((mol.nao_nr(), ngrids)) + self._contract_ao_to_bas_helper( + mol, + b0, + c0, + shls_slice, + ao_loc, + coords, + ylm=ylm, + bwd=True, + ) + return c0.T + + def _eval_crho_potential( + self, mol, coords, cao, tmp2, shls_slice, ao_loc, ylm=None + ): + ngrids = coords.shape[0] + nalpha = self.plan.nalpha + b0 = np.empty((1 + 6 * self.deriv, mol.nbas, ngrids)).transpose(0, 2, 1) + if self.deriv == 0: + b0[0] = _scale_ao(cao[:nalpha], tmp2[0]) + else: + libcider.contract_shl_to_alpha_l1_bwd( + ctypes.c_int(coords.shape[0]), + ctypes.c_int(nalpha), + ctypes.c_int(cao.shape[-1]), + tmp2.ctypes.data_as(ctypes.c_void_p), + b0.ctypes.data_as(ctypes.c_void_p), + cao.ctypes.data_as(ctypes.c_void_p), + ) + # if self.deriv > 0: + # b0[1] = _scale_ao(cao[:nalpha], tmp2[1]) + # b0[2] = _scale_ao(cao[:nalpha], tmp2[2]) + # b0[3] = _scale_ao(cao[:nalpha], tmp2[3]) + # b0[4] = _scale_ao(cao[nalpha:2*nalpha], tmp2[1]) + # 
b0[5] = _scale_ao(cao[nalpha:2*nalpha], tmp2[2]) + # b0[6] = _scale_ao(cao[nalpha:2*nalpha], tmp2[3]) + return self._contract_ao_to_bas_bwd( + mol, b0.transpose(0, 2, 1), shls_slice, ao_loc, coords, ylm=ylm + ) + + def get_features( + self, + dms, + mol, + coords, + non0tab=None, + cutoff=None, + save_buf=False, + ao=None, + cao=None, + ): + coords = np.asfortranarray(coords) + ndim = dms.ndim + if ndim == 2: + dms = dms[None, :, :] + n_out = 1 + elif ndim == 3: + n_out = dms.shape[0] + else: + raise ValueError + has_l1 = self.has_l1 + ngrids = coords.shape[0] + nalpha = self.plan.nalpha + ncpa = 4 if has_l1 else 1 + n0 = self.plan.num_l0_feat + nfeat = self.plan.settings.nfeat + # TODO might want to account for these when computing + # memory footprint, though they are linear scaling in + # size and therefore less of an issue. + tmp = np.empty((ncpa, nalpha, ngrids)) + if isinstance(self.plan, SDMXIntPlan): + l0tmp2 = np.empty((n_out, nalpha, ngrids)) + l1tmp2 = np.empty((n_out, 3, nalpha, ngrids)) + else: + l0tmp2 = np.empty((n_out, n0, nalpha, ngrids)) + l1tmp2 = np.empty((n_out, nfeat - n0, 3, nalpha, ngrids)) + out = np.empty((n_out, nfeat, ngrids)) + time.monotonic() + if ao is None: + ao = self.get_ao( + mol, coords, non0tab=non0tab, cutoff=cutoff, save_buf=save_buf + ) + if cao is None: + cao = self.get_cao( + mol, coords, non0tab=non0tab, cutoff=cutoff, save_buf=save_buf + ) + time.monotonic() + shls_slice = (0, mol.nbas) + ao_loc = mol.ao_loc_nr() + ylm = self._get_ylm(mol, coords, savebuf=True) + for idm, dm in enumerate(dms): + c0 = _dot_ao_dm(mol, ao, dm, None, shls_slice, ao_loc) + time.monotonic() + b0 = self._contract_ao_to_bas(mol, c0, shls_slice, ao_loc, coords, ylm=ylm) + time.monotonic() + if has_l1: + assert tmp.flags.c_contiguous + assert b0.flags.c_contiguous + libcider.contract_shl_to_alpha_l1( + ctypes.c_int(coords.shape[0]), + ctypes.c_int(nalpha), + ctypes.c_int(cao.shape[-1]), + tmp.ctypes.data_as(ctypes.c_void_p), + b0.ctypes.data_as(ctypes.c_void_p), + cao.ctypes.data_as(ctypes.c_void_p), + ) + else: + for ialpha in range(nalpha): + tmp[0, ialpha] = lib.einsum( + "bg,bg->g", b0[0], cao[0 * nalpha + ialpha].T + ) + time.monotonic() + self.plan.get_features( + tmp, out=out[idm], l0tmp=l0tmp2[idm], l1tmp=l1tmp2[idm] + ) + time.monotonic() + # print('times', t1 - t0, t2 - t1, t3 - t2, torb2 - torb) + # print(out.mean(axis=-1)) + # exit() + if ndim == 2: + out = out[0] + self._cached_ao_data = (mol, ao, cao, l0tmp2, l1tmp2, coords, ylm) + return out + + def get_vxc_(self, vxc_mat, vxc_grid): + if self._cached_ao_data is None: + raise RuntimeError("Must call get_features first") + ndim = vxc_mat.ndim + # nalpha = self.plan.nalpha + # ngrids = vxc_grid.shape[-1] + if ndim == 2: + vxc_mat = vxc_mat[None, :, :] + vxc_grid = vxc_grid[None, :] + n_out = 1 + elif ndim == 3: + n_out = vxc_mat.shape[0] + else: + raise ValueError + # has_l1 = self.has_l1 + mol, ao, cao, alpha_terms, l1_alpha_terms, coords, ylm = self._cached_ao_data + # ncpa = 1 + 3 * self.deriv + # n0 = self.plan.num_l0_feat + shls_slice = (0, mol.nbas) + ao_loc = mol.ao_loc_nr() + for idm in range(n_out): + tmp2 = self.plan.get_vxc( + vxc_grid[idm], alpha_terms[idm], l1_alpha_terms[idm] + ) + tmp3 = self._eval_crho_potential( + mol, coords, cao, tmp2, shls_slice, ao_loc, ylm=ylm + ) + vxc_mat[idm] += _dot_ao_ao(mol, ao, tmp3, None, shls_slice, ao_loc, hermi=0) + return vxc_mat diff --git a/examples/pyscf/compute_ae.py b/examples/pyscf/compute_ae.py new file mode 100644 index 0000000..21bcdb4 --- /dev/null 
+++ b/examples/pyscf/compute_ae.py @@ -0,0 +1,136 @@ +import sys +from collections import Counter + +import ase +from ase import Atoms +from ase.data import chemical_symbols, ground_state_magnetic_moments +from pyscf import dft, gto, scf +from pyscf.pbc.tools.pyscf_ase import atoms_from_ase + +from ciderpress.pyscf.dft import make_cider_calc + +""" +This script demonstrates running a CIDER calculation with accelerated +nonlocal feature evaluation. Example commands: + + python examples/pyscf/fast_cider.py + python examples/pyscf/fast_cider.py H2 0 0 PBE + python examples/pyscf/fast_cider.py O2 0 2 CIDER_NL_MGGA + + is a chemical formula string like CH4, H2, etc. It must be included +in the list of molecules supported by ase.build.molecule() + + is the integer charge of the system. + + is the integer spin of the system 2S. + + is the functional name. It can be semilocal, or it can be +CIDER_SL_GGA, CIDER_SL_MGGA, CIDER_NL_GGA, or CIDER_NL_MGGA, in which case +the corresponding example CIDER functional is run with the PBE0/CIDER +surrogate hybrid functional form. + +At the end, prints out the total energy of the molecule and its atomization energy +in Ha and eV, then saves the atomization energy in eV to aeresult.txt. + +NOTE that the ri_cider module is used to initialize the fast CIDER calculation. +""" + +name, charge, spin, functional = sys.argv[1:5] +charge = int(charge) +spin = int(spin) + +spinpol = True if spin > 0 else False +BAS = "def2-qzvppd" +if name == "HF_stretch": + BAS = "def2-svp" + atoms = Atoms(symbols=["H", "F"], positions=[[0, 0, 0], [0, 0, 1.1]]) +elif name.startswith("el-"): + el = name[3:] + atoms = Atoms(el) +elif name.endswith(".xyz"): + ismol = True + atoms = ase.io.read(name) + atoms.center(vacuum=4) +else: + ismol = True + from ase.build import molecule + + atoms = molecule(name) + atoms.center(vacuum=4) + +if functional.startswith("CIDER"): + functional = "functionals/{}.yaml".format(functional) + is_cider = True + mlfunc = functional +else: + is_cider = False +formula = Counter(atoms.get_atomic_numbers()) + +mol = gto.M( + atom=atoms_from_ase(atoms), basis=BAS, ecp=BAS, spin=spin, charge=charge, verbose=4 +) + +# various CIDER settings, as explained in the ri_cider.setup_cider_calc docstring. 
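+# NOTE: the dict below is illustrative and is not read by this script;
+# run_calc() further down passes the equivalent xkernel/ckernel/xmix
+# options to make_cider_calc() directly.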
+settings = { + "xkernel": "GGA_X_PBE", + "ckernel": "GGA_C_PBE", + "xmix": 0.25, + "grid_level": 3, + "debug": False, + "amax": 3000.0, + "cider_lmax": 10, + "lambd": 1.6, + "aux_beta": 1.6, + "onsite_direct": True, +} + + +def run_calc(mol, spinpol): + if spinpol: + ks = dft.UKS(mol) + else: + ks = dft.RKS(mol) + ks = ks.density_fit() + ks.with_df.auxbasis = "def2-universal-jfit" + ks = ks.apply(scf.addons.remove_linear_dep_) + if is_cider: + ks = make_cider_calc( + ks, + functional, + xmix=0.25, + xkernel="GGA_X_PBE", + ckernel="GGA_C_PBE", + ) + ks.grids.level = 3 + etot = ks.kernel() + return etot + + +if spin == 0: + spinpol = False +else: + spinpol = True + +etot_mol = run_calc(mol, spinpol) +etot_ae = -1 * etot_mol + +exit() + +for Z, count in formula.items(): + atom = gto.M( + atom=chemical_symbols[Z], + basis=BAS, + ecp=BAS, + spin=int(ground_state_magnetic_moments[Z]), + verbose=4, + ) + etot_atom = run_calc(atom, True) + etot_ae += count * etot_atom + +print("Total and Atomization Energies, Ha") +print(etot_mol, etot_ae) +eh2ev = 27.211399 +print("Total and Atomization Energies, eV") +print(etot_mol * eh2ev, etot_ae * eh2ev) +with open("aeresult.txt", "w") as f: + f.write(str(etot_ae * eh2ev)) diff --git a/scripts/download_functionals.py b/scripts/download_functionals.py index 604e607..254ac9b 100644 --- a/scripts/download_functionals.py +++ b/scripts/download_functionals.py @@ -25,19 +25,20 @@ savedir = os.path.basename(os.path.join(basedir, "../functionals")) os.makedirs(savedir, exist_ok=True) - links = { - "NL_GGA": "1UNxxvYpxc8RkywxIynbenkLEMGviIce1", - "NL_MGGA_DTR": "1-233LBivti-UDfbSuOMdPZuyC_Gibto0", - "NL_MGGA_PBE": "15x8s_Pf3_iYoAHl_sWChB2q1HFif1jFA", - "NL_MGGA": "1wwYFSJsZX_jhBSlsXVeggQutODXC2KP0", - "SL_GGA": "1qD_IpXQNYUbT8Y7nLt4e4pA1E1dNntu7", - "SL_MGGA": "18OFNglYpcPXHzXwFcRXeJn9VCLFtC7Zd", + "CIDER23X_NL_GGA": "16IF81oS3Snqp96KdS43orSmpmNyRKFEs", + "CIDER23X_NL_MGGA_DTR": "1njP58rnZL0L523hEiwXdQ9YoBTlp0Uev", + "CIDER23X_NL_MGGA_PBE": "1sVofPzXu4eN2N0cDSC0wawnvNjqCoXhi", + "CIDER23X_NL_MGGA": "1Y_pGWGnuzH7PiTT1UntnCga4NPIXBh0m", + "CIDER23X_SL_GGA": "1laNkInAIq1EfHACfOf5hUpqL7eWFpoA1", + "CIDER23X_SL_MGGA": "1kmYuG50OhTCNB8QVIEHLm-cEQ8mMjwT8", + "CIDER24Xe": "1tFKAMvfYJcUZEz11uFSl-DXGW-47l9SV", + "CIDER24Xne": "1OUcWv18e2vM0eSDyiSl1dm9g3A7Wu5_3", } os.chdir(savedir) cmd = "wget --no-check-certificate 'https://docs.google.com/uc?export=download&id={}' -O {}" for name, linkid in links.items(): - fname = "CIDER23_{}.yaml".format(name) + fname = "{}.yaml".format(name) mycmd = cmd.format(linkid, fname) subprocess.call(mycmd, shell=True)