diff --git a/src/tpsci_outer.jl b/src/tpsci_outer.jl
index b42c1bf..21e5f37 100644
--- a/src/tpsci_outer.jl
+++ b/src/tpsci_outer.jl
@@ -1,6 +1,7 @@
 using TimerOutputs
 using BlockDavidson
 using BenchmarkTools
+using Printf
 
 """
     build_full_H(ci_vector::TPSCIstate, cluster_ops, clustered_ham::ClusteredOperator)
@@ -1051,3 +1052,374 @@ end
 #    return eval.(a)
 #end
 
+function do_fois_ci(ref::TPSCIstate{T,N,R}, cluster_ops, clustered_ham;
+                    H0          = "Hcmf",
+                    max_iter    = 50,
+                    nbody       = 4,
+                    thresh_foi  = 1e-6,
+                    tol         = 1e-5,
+                    thresh_clip = 1e-6,
+                    threaded    =false,
+                    prescreen   = false,
+                    compress    = false,
+                    pt          =false,
+                    verbose     = true) where {T,N,R}
+    @printf("\n-------------------------------------------------------\n")
+    @printf(" Do CI in FOIS\n")
+    @printf("   H0                      = %-s\n", H0)
+    @printf("   thresh_foi              = %-8.1e\n", thresh_foi)
+    @printf("   nbody                   = %-i\n", nbody)
+    @printf("\n")
+    @printf("   Length of Reference     = %-i\n", length(ref))
+    @printf("\n-------------------------------------------------------\n")
+
+# 
+    # Solve variationally in reference space
+    ref_vec = deepcopy(ref)
+    @printf(" Solve zeroth-order problem. Dimension = %10i\n", length(ref_vec))
+    @time e0, ref_vec = tps_ci_direct(ref_vec, cluster_ops, clustered_ham, conv_thresh=tol)
+    
+
+    #
+    # Get First order wavefunction
+    println()
+    println(" Compute FOIS. Reference space dim = ", length(ref_vec))
+    # pt1_vec= deepcopy(ref_vec)
+    # pt1_vec=matvec(pt1_vec)
+    if threaded == true
+        pt1_vec = open_matvec_thread(ref_vec, cluster_ops, clustered_ham, nbody=nbody, thresh=thresh_foi, prescreen=prescreen)
+    else
+        pt1_vec = open_matvec_serial(ref_vec, cluster_ops, clustered_ham, nbody=nbody, thresh=thresh_foi, prescreen=prescreen)
+    end
+    for i in 1:R
+        @printf("Arnab: %12.8f\n", sqrt.(orth_dot(pt1_vec,pt1_vec))[i])
+    end
+    project_out!(pt1_vec, ref)
+    # Compress FOIS
+    if compress==true
+        norm1 = sqrt.(orth_dot(pt1_vec, pt1_vec))
+        dim1 = length(pt1_vec)
+        clip!(pt1_vec, thresh=thresh_clip) #does clip! function do the compression? or have to write a compress function.
+        norm2 = sqrt.(orth_dot(pt1_vec, pt1_vec))
+        dim2 = length(pt1_vec)
+        @printf(" %-50s%10i → %-10i (thresh = %8.1e)\n", "FOIS Compressed from: ", dim1, dim2, thresh_foi)
+        for i in 1:R
+            @printf(" %-50s%10.2e → %-10.2e (thresh = %8.1e)\n", "Norm of |1>: ",norm1[i], norm2[i], thresh_foi)
+        end
+    end
+    for i in 1:R
+        @printf(" %-50s%10.6f\n", "Overlap between <1|0>: ", overlap(pt1_vec, ref_vec)[i])
+    end
+
+    add!(ref_vec, pt1_vec)
+    # Solve for first order wavefunction 
+    println(" Compute CI energy in the space = ", length(ref_vec))
+   
+    eci, ref_vec = tps_ci_direct(ref_vec, cluster_ops, clustered_ham;)
+    for i in 1:R
+        @printf(" E(Ref)   for %ith state   = %12.8f\n",i, e0[i])
+        @printf(" E(CI) tot for %ith state = %12.8f\n",i, eci[i])
+    end
+    if pt==true
+        e_pt2,pt1_vec= compute_pt1_wavefunction(ref_vec, cluster_ops, clustered_ham;  H0=H0,verbose=verbose)  
+        for i in 1:R
+            @printf(" E(PT2)  for %ith state   = %12.8f\n",i, e_pt2[i])
+        end
+    end
+    return eci, ref_vec 
+    # println("debugging")
+    # error()
+end
+
+"""
+    tucker_cepa_solve(ref_vector::TPSCIstate, cepa_vector::TPSCIstate, cluster_ops, clustered_ham; tol=1e-5, cache=true)
+
+# Arguments
+- `ref_vector`: Input reference state. 
+- `cepa_vector`: TPSCIstate which defines the configurational space defining {X}. This 
+should be the first-order interacting space (or some compressed version of it).
+- `cluster_ops`
+- `clustered_ham`
+- `tol`: haven't yet set this up (NYI)
+- `cache`: Should we cache the compressed H operators? Speeds up drastically, but uses lots of memory
+
+Compute compressed CEPA.
+Since there can be non-zero overlap with a multireference state, we need to generalize.
+
+    HC = SCe
+
+    |Haa + Hax| |1 | = |I   + Sax| |1 | E
+    |Hxa + Hxx| |Cx|   |Sxa + I  | |Cx|
+
+    Haa + Hax*Cx = (1 + Sax*Cx)E
+    Hxa + HxxCx = SxaE + CxE
+
+The idea for CEPA is to approximate E in the amplitude equation.
+CEPA(0): E = Eref
+
+    (Hxx-Eref)*Cx = Sxa*Eref - Hxa
+
+Ax=b
+
+After solving, the Energy can be obtained as:
+    
+    E = (Eref + Hax*Cx) / (1 + Sax*Cx)
+"""
+function tpsci_cepa_solve(ref_vector::TPSCIstate{T,N,R}, cepa_vector::TPSCIstate, cluster_ops, clustered_ham, 
+                           cepa_shift="cepa", 
+                           cepa_mit = 50; 
+                           tol=1e-5, 
+                           cache=true, 
+                           max_iter=30, 
+                           verbose=false) where {T,N,R}
+    # sig=deepcopy(ci_vector)
+    # zero!(sig)
+    sig=open_matvec_thread(ref_vector, cluster_ops, clustered_ham)
+    e0=overlap(sig, ref_vector)
+    length(e0) == 1 || error("Only one state at a time please", e0)
+    e0 = e0[1]
+    @printf("Reference Energy: %12.8f\n", e0) 
+    n_clusters = length(cepa_vector.clusters)
+    x_vector = deepcopy(cepa_vector)
+    a_vector = deepcopy(ref_vector)
+    # b=deepcopy(x_vector)
+    # zero!(b)
+
+    b=open_matvec_thread(ref_vector, cluster_ops, clustered_ham)
+    
+    @printf(" Overlap between <0|0>:          %18.12e\n", orth_dot(ref_vector, ref_vector)[1])
+    @printf(" Overlap between <1|0>:          %18.12e\n", overlap(x_vector, ref_vector)[1])
+    @printf(" Overlap between <1|1>:          %18.12e\n", orth_dot(x_vector, x_vector)[1])
+    
+
+    #
+    # Get Overlap <X|A>C(A)
+    Sx=deepcopy(x_vector)
+    zero!(Sx)
+    for (fock,configs) in x_vector.data 
+        for (config, coeffs) in configs
+            if haskey(ref_vector, fock)
+                if haskey(ref_vector[fock], config)
+                    ref_coeffs = ref_vector[fock][config]
+                    overlaps=Vector{Float64}(undef, R)
+                    for i in 1:length(Sx.clusters)
+                        push!(overlaps, ref_coeffs[i]'.*coeffs[i])
+                    end
+                    display(overlaps)
+                    Sx[fock][config]=overlaps
+                    
+                end
+            end
+        end
+    end
+    b_correct=deepcopy(Sx)
+    zero!(b_correct)
+    for (fock,configs) in b.data 
+        for (config, coeffs) in configs
+            if haskey(Sx, fock)
+                if haskey(Sx[fock], config)
+                    b_coeffs = b[fock][config]
+                    b_correct[fock][config]=b_coeffs
+                end
+            end
+        end
+    end
+    # display(length(ref_vector))
+    # display(length(x_vector))
+    # display(length(b))
+    # display(length(b_correct))
+    # display(length(Sx))
+    # display(length(bv))
+    # display(Sx.data)
+    bv=-get_vector(b_correct)
+    @printf(" Norm of Sx overlap: %18.12f\n", overlap(Sx,Sx)[1])
+    @printf(" Norm of b         : %18.12f\n", sum(bv.*bv))
+
+    Ec = 0
+    Ecepa = 0
+    
+
+    for it in 1:cepa_mit 
+    	bv = -get_vector(b_correct)
+    	if cepa_shift == "cepa"
+            cepa_mit = 1
+	        shift = 0.0
+	    elseif cepa_shift == "acpf"
+
+	        shift = Ec * 2.0 / n_clusters
+	    elseif cepa_shift == "aqcc"
+	        shift = (1.0 - (n_clusters-3.0)*(n_clusters - 2.0)/(n_clusters * ( n_clusters-1.0) )) * Ec
+	    elseif cepa_shift == "cisd"
+	        shift = Ec
+	    else
+            println()
+            println("NYI: cepa_shift is not available:",cepa_shift)
+            println()
+            exit()
+	    end
+	    eshift = e0+shift
+        display(eshift)
+        bv .= bv .+ get_vector(Sx)* (eshift)
+        function mymatvec(v)
+            xr = TPSCIstate(x_vector, R=1)
+            xl = TPSCIstate(x_vector, R=1)
+            length(xr) .== length(v) || throw(DimensionMismatch)
+            set_vector!(xr, Vector(v), root=1)
+            zero!(xl)
+            xr=open_matvec_thread(xr, cluster_ops, clustered_ham)
+
+            tmp = deepcopy(xr)
+            scale!(tmp, -eshift)
+            add!(xl, tmp)
+            return get_vector(xl)
+        end
+        @printf(" %-50s%10.6f\n", "Norm of b: ", sum(bv.*bv))
+        dim = length(x_vector)
+        Axx = LinearMap(mymatvec, dim, dim)
+        flush_cache(clustered_ham)
+        # if cache
+        #     @printf(" %-50s", "Cache zeroth-order Hamiltonian: ")
+        #     @time cache_hamiltonian(x_vector, x_vector, cluster_ops, clustered_ham)
+        # end
+
+
+        for r in 1:R
+                
+            println(" Start CEPA iterations with dimension = ", length(x_vector))
+            xv = get_vector(x_vector,r)
+            time = @elapsed x, solver = cg!(xv, Axx, bv[:,r],
+                                            log=true, maxiter=max_iter, verbose=verbose, abstol=tol)
+            @printf(" %-50s%10.6f seconds\n", "Time to solve for CEPA with conjugate gradient: ", time)
+
+            set_vector!(x_vector, xv[:,1], root=r)
+        end
+
+        flush_cache(clustered_ham)
+
+        SxC = overlap(Sx,x_vector)
+        @printf(" %-50s%10.2f\n", "<A|X>C(X): ", SxC[1])
+
+        # sig = deepcopy(ref_vector)
+        # zero!(sig)
+        sig=open_matvec_thread(ref_vector, cluster_ops, clustered_ham)
+        ecorr = overlap(sig, x_vector)
+        @printf(" Cepa: %18.12f\n", ecorr[1])
+
+        # sig = deepcopy(x_vector)
+        # zero!(sig)
+        sig=open_matvec_thread(x_vector, cluster_ops, clustered_ham)
+        ecorr = overlap(sig, ref_vector)
+        @printf(" Cepa: %18.12f\n", ecorr[1])
+        length(ecorr) == 1 || error(" Dimension Error", ecorr)
+        ecorr = ecorr[1]
+        @printf(" <1|1> = %18.12f\n", overlap(x_vector,x_vector))
+        @printf(" <1|1> = %18.12f\n", sum(x.*x))
+
+        @printf(" E(CEPA) = %18.12f\n", (e0[1] + ecorr[1])/(1+SxC[1]))
+        Ecepa = (e0[1] + ecorr[1])/(1+SxC[1])
+        #@printf(" %s %18.12f\n",cepa_shift, (e0 + ecorr)/(1+SxC))
+        @printf("Iter: %4d        %18.12f %18.12f \n",it,Ec ,Ecepa-e0)
+	    if abs(Ec - (Ecepa-e0)) < 1e-6
+            @printf(" Converged %s %18.12f\n",cepa_shift, (e0[1] + ecorr[1])/(1+SxC[1]))
+	        break
+	    end
+	    Ec = Ecepa - e0
+    end
+
+    #x, info = linsolve(Hmap,zeros(size(v0)))
+    return Ecepa, x_vector 
+end#=}}}=#
+
+function do_fois_cepa(ref::TPSCIstate{T,N,R}, cluster_ops, clustered_ham;
+                        max_iter=20,
+                        cepa_shift="cepa",
+                        cepa_mit=30,
+                        nbody=4,
+                        thresh_foi=1e-6,
+                        thresh_clip=1e-5,
+                        tol=1e-5,
+                        compress=false,
+                        compress_type="matvec",
+                        verbose=true) where {T,N,R}
+    @printf("\n-------------------------------------------------------\n")
+    @printf(" Do CEPA\n")
+    @printf("   thresh_foi              = %-8.1e\n", thresh_foi)
+    @printf("   nbody                   = %-i\n", nbody)
+    @printf("\n")
+    @printf("   Length of Reference     = %-i\n", length(ref))
+    @printf("   Calculation type        = %s\n", cepa_shift)
+    @printf("   Compression type        = %s\n", compress_type)
+    @printf("\n-------------------------------------------------------\n")
+
+    # 
+    # Solve variationally in reference space
+    println()
+    ref_vec = deepcopy(ref)
+    @printf(" Solve zeroth-order problem. Dimension = %10i\n", length(ref_vec))
+    @time e0, ref_vec = tps_ci_direct(ref_vec, cluster_ops, clustered_ham, conv_thresh=tol)
+
+    #
+     # Get First order wavefunction
+     println()
+     println(" Compute FOIS. Reference space dim = ", length(ref_vec))
+     pt1_vec = deepcopy(ref_vec)
+     pt1_vec=open_matvec_thread(pt1_vec, cluster_ops, clustered_ham, nbody=nbody, thresh=thresh_foi)
+     for i in 1:R
+         @printf("Arnab: %12.8f\n", sqrt.(orth_dot(pt1_vec, pt1_vec))[i])
+     end
+    project_out!(pt1_vec, ref)
+    # display(pt1_vec)
+
+    # Compress FOIS
+    if compress==true
+        norm1 = sqrt.(orth_dot(pt1_vec, pt1_vec))
+        dim1 = length(pt1_vec)
+        clip!(pt1_vec, thresh=thresh_clip)
+        norm2 = sqrt.(orth_dot(pt1_vec, pt1_vec))
+        dim2 = length(pt1_vec)
+        @printf(" %-50s%10i → %-10i (thresh = %8.1e)\n", "FOIS Compressed from: ", dim1, dim2, thresh_foi)
+        for i in 1:R
+            @printf(" %-50s%10.2e → %-10.2e (thresh = %8.1e)\n", "Norm of |1>: ",norm1[i], norm2[i], thresh_foi)
+        end
+    end
+    for i in 1:R
+        @printf(" %-50s%10.6f\n", "Overlap between <1|0>: ", overlap(pt1_vec, ref_vec)[i])
+    end
+    # 
+    
+    # Solve CEPA 
+    println()
+    cepa_vec = deepcopy(pt1_vec)
+    e_cepa_r=[]
+    println("Do CEPA: Dim = ", length(cepa_vec))
+    
+    # x_cepa=TPSCIstate(ref.clusters,T=T,R=R)
+    x_cepa = deepcopy(ref)
+    zero!(x_cepa)
+    for i in 1:R
+        ref_vec_i=extract_chosen_root(ref_vec, i)
+        cepa_vec_i=extract_chosen_root(cepa_vec, i)
+        zero!(cepa_vec)
+        display(cepa_vec)
+        display(ref_vec)
+        println(" Do CEPA: Dim = ", length(cepa_vec_i))
+        println("debugging")
+        # error()
+        @time e_cepa, x_cepa_i = tpsci_cepa_solve(ref_vec_i, cepa_vec_i, cluster_ops, clustered_ham, cepa_shift, cepa_mit, tol=tol, max_iter=max_iter, verbose=verbose)
+        
+        @printf(" E(cepa) corr =                 %12.8f\n", e_cepa[1])
+        @printf(" X(cepa) norm =                 %12.8f\n", sqrt(orth_dot(x_cepa, x_cepa)[1]))
+        push!(e_cepa_r, e_cepa[1])
+        for (fock, configs) in x_cepa_i.data
+            for (config, coeffs) in configs
+                if i==1
+                    x_cepa[fock][config] =MVector{R, T}(undef)
+                end
+                x_cepa[fock][config][i, :] .= coeffs
+            end
+        end
+    end
+    add!(x_cepa, ref_vec)
+    orthonormalize!(x_cepa)
+    return e_cepa, x_cepa
+end
\ No newline at end of file
diff --git a/src/tucker_outer.jl b/src/tucker_outer.jl
index e1cd316..014c798 100644
--- a/src/tucker_outer.jl
+++ b/src/tucker_outer.jl
@@ -257,11 +257,11 @@ After solving, the Energy can be obtained as:
 """
 function tucker_cepa_solve(ref_vector::BSTstate{T,N,R}, cepa_vector::BSTstate, cluster_ops, clustered_ham, 
                            cepa_shift="cepa", 
-                           cepa_mit = 50; 
-                           tol=1e-5, 
-                           cache=true, 
-                           max_iter=30, 
-                           verbose=false) where {T,N,R}
+                           cepa_mit  = 50; 
+                           tol       =1e-5, 
+                           cache     =true, 
+                           max_iter  =30, 
+                           verbose   =false) where {T,N,R}
 #={{{=#
 
     sig = deepcopy(ref_vector)
@@ -270,7 +270,7 @@ function tucker_cepa_solve(ref_vector::BSTstate{T,N,R}, cepa_vector::BSTstate, c
     e0 = nonorth_dot(ref_vector, sig)
     length(e0) == 1 || error("Only one state at a time please", e0)
     e0 = e0[1]
-    @printf(" Reference Energy: %12.8f\n",e0[1])
+    @printf(" Reference Energy: %12.8f\n",e0)
 
     n_clusters = length(cepa_vector.clusters)
 
@@ -332,29 +332,33 @@ function tucker_cepa_solve(ref_vector::BSTstate{T,N,R}, cepa_vector::BSTstate, c
 
     Ec = 0
     Ecepa = 0
-    if cepa_shift == "cepa"
-        cepa_mit = 1
-    end
+    # if cepa_shift == "cepa"
+        
+    # end
     for it in 1:cepa_mit 
-
+        println("CEPA cycle: ", it)
     	bv = -get_vector(b)
         #n_clusters = 8
     	if cepa_shift == "cepa"
-	    shift = 0.0
-	elseif cepa_shift == "acpf"
-
-	    shift = Ec * 2.0 / n_clusters
-	elseif cepa_shift == "aqcc"
-	    shift = (1.0 - (n_clusters-3.0)*(n_clusters - 2.0)/(n_clusters * ( n_clusters-1.0) )) * Ec
-	elseif cepa_shift == "cisd"
-	    shift = Ec
-	else
-	    println()
-	    println("NYI: cepa_shift is not available:",cepa_shift)
-	    println()
-	    exit()
-	end
-	eshift = e0+shift
+            cepa_mit = 1
+	        shift = 0.0
+	    elseif cepa_shift == "acpf"
+
+	        shift = Ec * 2.0 / n_clusters
+	    elseif cepa_shift == "aqcc"
+	        shift = (1.0 - (n_clusters-3.0)*(n_clusters - 2.0)/(n_clusters * ( n_clusters-1.0) )) * Ec
+	    elseif cepa_shift == "cisd"
+	        shift = Ec
+	    else
+            println()
+            println("NYI: cepa_shift is not available:",cepa_shift)
+            println()
+            exit()
+        end
+	
+	    eshift = e0+shift
+        # display(shift)
+        # display(eshift)
         bv .= bv .+ get_vector(Sx)* (eshift)
 
         function mymatvec(v)
@@ -428,10 +432,10 @@ function tucker_cepa_solve(ref_vector::BSTstate{T,N,R}, cepa_vector::BSTstate, c
         Ecepa = (e0[1] + ecorr[1])/(1+SxC[1])
         #@printf(" %s %18.12f\n",cepa_shift, (e0 + ecorr)/(1+SxC))
         @printf("Iter: %4d        %18.12f %18.12f \n",it,Ec ,Ecepa-e0)
-	    if abs(Ec - (Ecepa-e0)) < 1e-6
+	    if abs(Ec - (Ecepa-e0)) < 1e-10
             @printf(" Converged %s %18.12f\n",cepa_shift, (e0[1] + ecorr[1])/(1+SxC[1]))
-	    break
-	end
+	        break
+	    end
 	Ec = Ecepa - e0
     end
 
@@ -1379,7 +1383,7 @@ function do_fois_cepa(ref::BSTstate{T,N,R}, cluster_ops, clustered_ham;
     println(" Do CEPA: Dim = ", length(cepa_vec))
     for i in 1:R
         ref_vec_i=FermiCG.BSTstate(ref_vec,i) 
-        display(ref_vec_i)
+        # display(ref_vec_i)
         cepa_vec_i=FermiCG.BSTstate(cepa_vec,i)
         zero!(cepa_vec_i) 
         println(" Do CEPA: Dim = ", length(cepa_vec_i))
diff --git a/src/type_TPSCIstate.jl b/src/type_TPSCIstate.jl
index b76b342..16d2cbe 100644
--- a/src/type_TPSCIstate.jl
+++ b/src/type_TPSCIstate.jl
@@ -581,7 +581,27 @@ function orthonormalize!(s::TPSCIstate{T,N,R}) where {T,N,R}
     set_vector!(s,v0)
 end
 #=}}}=#
+"""
+    orth_dot(ts1::FermiCG.TPSCIstate, ts2::FermiCG.TPSCIstate)
+
+Dot product between `ts2` and `ts1`
 
+"""
+function orth_dot(ts1::TPSCIstate{T,N,R}, ts2::TPSCIstate{T,N,R}) where {T,N,R}
+    #={{{=#
+    overlap = zeros(T,R)
+    for (fock,configs) in ts2.data
+        haskey(ts1, fock) || continue
+        for (config,coeffs) in configs
+            haskey(ts1[fock], config) || continue
+            for r in 1:R
+                overlap[r] += sum(ts1[fock][config][r] .* ts2[fock][config][r])
+            end
+        end
+    end
+    return overlap
+    #=}}}=#
+end
 
 """
     clip!(s::TPSCIstate; thresh=1e-5)
@@ -650,6 +670,27 @@ function extract_roots(v::TPSCIstate{T,N,R}, roots) where {T,N,R}
     return out
 end
 
+"""
+    function extract_chosen_root(v::TPSCIstate{T,N,R}, root::Int)
+
+Extracts the chosen root to give a new `TPSCIstate` with only that root.
+"""
+function extract_chosen_root(v::TPSCIstate{T,N,R}, root::Int) where {T,N,R}
+    if root < 1 || root > R
+        throw(ArgumentError("Root index $root is out of bounds. It should be between 1 and $R."))
+    end
+    out = TPSCIstate(v.clusters, T=T, R=1)
+    for (fock, configs) in v.data
+        add_fockconfig!(out, fock)
+        # display(configs)
+        for (config, coeffs) in configs
+            # display(coeffs)
+            out[fock][config] =MVector{1, T}(coeffs[root])
+        end
+    end
+    
+    return out
+end
 
 
 """