diff --git a/mesh/infinite_rectangular_2x8.x b/mesh/infinite_rectangular_2x8.x
new file mode 100644
index 0000000..c7e459c
--- /dev/null
+++ b/mesh/infinite_rectangular_2x8.x
@@ -0,0 +1,23 @@
+       1
+       3        9        1
+     0.0000000000000000     0.50000000000000000      1.0000000000000000      0.0000000000000000
+    0.50000000000000000      1.0000000000000000      0.0000000000000000     0.50000000000000000
+     1.0000000000000000      0.0000000000000000     0.50000000000000000      1.0000000000000000
+     0.0000000000000000     0.50000000000000000      1.0000000000000000      0.0000000000000000
+    0.50000000000000000      1.0000000000000000      0.0000000000000000     0.50000000000000000
+     1.0000000000000000      0.0000000000000000     0.50000000000000000      1.0000000000000000
+     0.0000000000000000     0.50000000000000000      1.0000000000000000
+     0.0000000000000000      0.0000000000000000      0.0000000000000000      3750.0000000000000
+     3750.0000000000005      3750.0000000000000      7500.0000000000000      7500.0000000000009
+     7500.0000000000000      11250.000000000000      11250.000000000000      11250.000000000000
+     15000.000000000000      15000.000000000002      15000.000000000000      18750.000000000000
+     18750.000000000000      18750.000000000000      17500.000000000000      17500.000000000000
+     17500.000000000000      13750.000000000000      13750.000000000000      13750.000000000000
+     10000.000000000000      10000.000000000000      10000.000000000000
+     0.0000000000000000      0.0000000000000000      0.0000000000000000      0.0000000000000000
+     0.0000000000000000      0.0000000000000000      0.0000000000000000      0.0000000000000000
+     0.0000000000000000      0.0000000000000000      0.0000000000000000      0.0000000000000000
+     0.0000000000000000      0.0000000000000000      0.0000000000000000      0.0000000000000000
+     0.0000000000000000      0.0000000000000000      0.0000000000000000      0.0000000000000000
+     0.0000000000000000      0.0000000000000000      0.0000000000000000      0.0000000000000000
+     0.0000000000000000      0.0000000000000000      0.0000000000000000
diff --git a/tests/test_non_linear.cpp b/tests/test_non_linear.cpp
index 6348fdb..240ccc6 100644
--- a/tests/test_non_linear.cpp
+++ b/tests/test_non_linear.cpp
@@ -64,8 +64,8 @@ int main(int argc, char** argv) {
     const std::vector<std::string> backends = {"cpu"};
     std::vector<std::pair<std::string, std::unique_ptr<LiftCurveFunctor>>> lift_curves;
     lift_curves.emplace_back(std::make_pair("spallart1", std::make_unique<SpallartLiftCurve>(1.2f, 0.28f, 0.02f, 2.f*PI_f, 2.f*PI_f)));
-    lift_curves.emplace_back(std::make_pair("spallart2", std::make_unique<SpallartLiftCurve>(0.72f, 0.28f, 0.04f, 2.f*PI_f, 1.5f*PI_f)));
-    lift_curves.emplace_back(std::make_pair("polar", std::make_unique<ThinAirfoilPolarLiftCurve>()));
+    // lift_curves.emplace_back(std::make_pair("spallart2", std::make_unique<SpallartLiftCurve>(0.72f, 0.28f, 0.04f, 2.f*PI_f, 1.5f*PI_f)));
+    // lift_curves.emplace_back(std::make_pair("polar", std::make_unique<ThinAirfoilPolarLiftCurve>()));
     
     std::vector<f32> test_alphas = {0, 5, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
     std::transform(test_alphas.begin(), test_alphas.end(), test_alphas.begin(), to_radians);
diff --git a/vlm/backends/cpu/include/vlm_backend_cpu.hpp b/vlm/backends/cpu/include/vlm_backend_cpu.hpp
index 81b0227..28d9ca2 100644
--- a/vlm/backends/cpu/include/vlm_backend_cpu.hpp
+++ b/vlm/backends/cpu/include/vlm_backend_cpu.hpp
@@ -5,16 +5,17 @@
 
 namespace vlm {
 
-class BackendGeneric : public Backend {
+class BackendCPU : public Backend {
     public:
         std::vector<f32> lhs;
         std::vector<f32> rhs;
         std::vector<i32> ipiv;
         std::vector<f32> gamma;
         std::vector<f32> delta_gamma;
+        std::vector<f32> trefftz_buffer;
 
-        BackendGeneric(Mesh& mesh);
-        ~BackendGeneric();
+        BackendCPU(Mesh& mesh);
+        ~BackendCPU();
         void reset() override;
         void compute_lhs(const FlowData& flow) override;
         void compute_rhs(const FlowData& flow) override;
diff --git a/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp b/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp
index 2b8be98..7ff4093 100644
--- a/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp
+++ b/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp
@@ -9,4 +9,5 @@ void kernel_influence(
     f32 normalx[], f32 normaly[], f32 normalz[],
     u32 ia, u32 lidx, f32 sigma_p4
     );
-}
\ No newline at end of file
+}
+
diff --git a/vlm/backends/cpu/src/vlm_backend_cpu.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp
index a9d8444..33aea8c 100644
--- a/vlm/backends/cpu/src/vlm_backend_cpu.cpp
+++ b/vlm/backends/cpu/src/vlm_backend_cpu.cpp
@@ -3,7 +3,7 @@
 #include "vlm_backend_cpu_kernels_ispc.h"
 
 #include "linalg.h"
-#include "simpletimer.hpp"
+#include "tinytimer.hpp"
 #include "vlm_mesh.hpp"
 #include "vlm_data.hpp"
 #include "vlm_utils.hpp"
@@ -22,23 +22,24 @@
 
 using namespace vlm;
 
-BackendGeneric::~BackendGeneric() = default; // Destructor definition
+BackendCPU::~BackendCPU() = default; // Destructor definition
 
-BackendGeneric::BackendGeneric(Mesh& mesh) : Backend(mesh) {
+BackendCPU::BackendCPU(Mesh& mesh) : Backend(mesh) {
     lhs.resize((u64)mesh.nb_panels_wing() * (u64)mesh.nb_panels_wing());
     rhs.resize(mesh.nb_panels_wing());
     ipiv.resize(mesh.nb_panels_wing());
     gamma.resize(mesh.nb_panels_wing());
     delta_gamma.resize(mesh.nb_panels_wing());
+    trefftz_buffer.resize(mesh.ns);
 }
 
-void BackendGeneric::reset() {
+void BackendCPU::reset() {
     std::fill(gamma.begin(), gamma.end(), 0.0f);
     std::fill(lhs.begin(), lhs.end(), 0.0f);
     std::fill(rhs.begin(), rhs.end(), 0.0f);
 }
 
-void BackendGeneric::compute_delta_gamma() {
+void BackendCPU::compute_delta_gamma() {
     std::copy(gamma.begin(), gamma.begin()+mesh.ns, delta_gamma.begin());
 
     // note: this is efficient as the memory is contiguous
@@ -49,290 +50,16 @@ void BackendGeneric::compute_delta_gamma() {
     }
 }
 
-inline void kernel_biosavart(f32& vx, f32& vy, f32& vz, f32& x, f32& y, f32& z, f32& x1, f32& y1, f32& z1, f32& x2, f32& y2, f32& z2) {
-    static const f32 rcut = 1.0e-10f;
-    vx = 0.0f;
-    vy = 0.0f;
-    vz = 0.0f;
-
-    // Katz Plotkin, Low speed Aero | Eq 10.115
-    const f32 r1r2x =   (y-y1)*(z-z2) - (z-z1)*(y-y2);
-    const f32 r1r2y = -((x-x1)*(z-z2) - (z-z1)*(x-x2));
-    const f32 r1r2z =   (x-x1)*(y-y2) - (y-y1)*(x-x2);
-
-    const f32 r1 = std::sqrt(pow<2>(x-x1)+pow<2>(y-y1)+pow<2>(z-z1));
-    const f32 r2 = std::sqrt(pow<2>(x-x2)+pow<2>(y-y2)+pow<2>(z-z2));
-    const f32 square = pow<2>(r1r2x) + pow<2>(r1r2y) + pow<2>(r1r2z);
-    if ((r1<rcut) || (r2<rcut) || (square<rcut)) return;
-
-    const f32 r0r1 = (x2-x1)*(x-x1)+(y2-y1)*(y-y1)+(z2-z1)*(z-z1);
-    const f32 r0r2 = (x2-x1)*(x-x2)+(y2-y1)*(y-y2)+(z2-z1)*(z-z2);
-    const f32 coeff = 1.0f/(4.0f*PI_f*square) * (r0r1/r1 - r0r2/r2);
-
-    vx = coeff * r1r2x;
-    vy = coeff * r1r2y;
-    vz = coeff * r1r2z;
-}
-
-inline void kernel_symmetry(f32& inf_x, f32& inf_y, f32& inf_z, f32 x, f32 y, f32 z, f32 x1, f32 y1, f32 z1, f32 x2, f32 y2, f32 z2) {
-    f32 vx, vy, vz;
-    kernel_biosavart(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2);
-    inf_x += vx;
-    inf_y += vy;
-    inf_z += vz;
-    y = -y; // wing symmetry
-    kernel_biosavart(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2);
-    inf_x += vx;
-    inf_y -= vy;
-    inf_z += vz;
-}
-
-template<bool Overwrite>
-inline void macro_kernel_remainder_scalar(Mesh& m, std::vector<f32>& lhs, u32 ia, u32 lidx) {
-    // quick return 
-    const u32 remainder =  m.nb_panels_wing() % 8;
-    if (remainder == 0) return;
-
-    const u32 v0 = lidx + lidx / m.ns;
-    const u32 v1 = v0 + 1;
-    const u32 v3 = v0 + m.ns+1;
-    const u32 v2 = v3 + 1;
-    
-    f32 v0x = m.v.x[v0];
-    f32 v0y = m.v.y[v0];
-    f32 v0z = m.v.z[v0];
-    f32 v1x = m.v.x[v1];
-    f32 v1y = m.v.y[v1];
-    f32 v1z = m.v.z[v1];
-    f32 v2x = m.v.x[v2];
-    f32 v2y = m.v.y[v2];
-    f32 v2z = m.v.z[v2];
-    f32 v3x = m.v.x[v3];
-    f32 v3y = m.v.y[v3];
-    f32 v3z = m.v.z[v3];
-
-    for (u32 ia2 = m.nb_panels_wing() - remainder; ia2 < m.nb_panels_wing(); ia2++) {
-        const f32 colloc_x = m.colloc.x[ia2];
-        const f32 colloc_y = m.colloc.y[ia2];
-        const f32 colloc_z = m.colloc.z[ia2];
-
-        // 3 regs to store induced velocity 
-        f32 inf_x = 0.0f;
-        f32 inf_y = 0.0f;
-        f32 inf_z = 0.0f;
-        kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v0x, v0y, v0z, v1x, v1y, v1z);
-        kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v1x, v1y, v1z, v2x, v2y, v2z);
-        kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v2x, v2y, v2z, v3x, v3y, v3z);
-        kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v3x, v3y, v3z, v0x, v0y, v0z);
-        f32 nx = m.normal.x[ia2];
-        f32 ny = m.normal.y[ia2];
-        f32 nz = m.normal.z[ia2];
-        f32 ring_inf = inf_x * nx + inf_y * ny + inf_z * nz;
-        // store in col major order
-        if (Overwrite) {
-            lhs[ia * m.nb_panels_wing() + ia2] = ring_inf;
-        } else {
-            lhs[ia * m.nb_panels_wing() + ia2] += ring_inf;
-        }
-    }
-}
-
-template<bool Overwrite>
-inline void macro_kernel_scalar(Mesh& m, std::vector<f32>& lhs, u32 ia, u32 lidx) {
-    const u32 v0 = lidx + lidx / m.ns;
-    const u32 v1 = v0 + 1;
-    const u32 v3 = v0 + m.ns+1;
-    const u32 v2 = v3 + 1;
-    
-    f32 v0x = m.v.x[v0];
-    f32 v0y = m.v.y[v0];
-    f32 v0z = m.v.z[v0];
-    f32 v1x = m.v.x[v1];
-    f32 v1y = m.v.y[v1];
-    f32 v1z = m.v.z[v1];
-    f32 v2x = m.v.x[v2];
-    f32 v2y = m.v.y[v2];
-    f32 v2z = m.v.z[v2];
-    f32 v3x = m.v.x[v3];
-    f32 v3y = m.v.y[v3];
-    f32 v3z = m.v.z[v3];
-
-    for (u32 ia2 = 0; ia2 < m.nb_panels_wing(); ia2++) {
-        const f32 colloc_x = m.colloc.x[ia2];
-        const f32 colloc_y = m.colloc.y[ia2];
-        const f32 colloc_z = m.colloc.z[ia2];
-
-        // 3 regs to store induced velocity 
-        f32 inf_x = 0.0f;
-        f32 inf_y = 0.0f;
-        f32 inf_z = 0.0f;
-        kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v0x, v0y, v0z, v1x, v1y, v1z);
-        kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v1x, v1y, v1z, v2x, v2y, v2z);
-        kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v2x, v2y, v2z, v3x, v3y, v3z);
-        kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v3x, v3y, v3z, v0x, v0y, v0z);
-        f32 nx = m.normal.x[ia2];
-        f32 ny = m.normal.y[ia2];
-        f32 nz = m.normal.z[ia2];
-        f32 ring_inf = inf_x * nx + inf_y * ny + inf_z * nz;
-        // store in col major order
-        if (Overwrite) {
-            lhs[ia * m.nb_panels_wing() + ia2] = ring_inf;
-        } else {
-            lhs[ia * m.nb_panels_wing() + ia2] += ring_inf;
-        }
-    }
-}
-
-inline void micro_kernel_influence_avx2(__m256& vx, __m256& vy, __m256& vz, __m256& x, __m256& y, __m256& z, __m256& x1, __m256& y1, __m256& z1, __m256& x2, __m256& y2, __m256& z2, f32 sigma_p4) {
-    static const __m256 threshold = _mm256_set1_ps(1e-10f);
-    static const __m256 four_pi = _mm256_set1_ps(4.0f * PI_f);
-    static const __m256 zero = _mm256_set1_ps(0.0f);
-
-    vx = zero;
-    vy = zero;
-    vz = zero;
-    // define vectors
-    __m256 r1x = _mm256_sub_ps(x, x1);
-    __m256 r1y = _mm256_sub_ps(y, y1);
-    __m256 r1z = _mm256_sub_ps(z, z1);
-    __m256 r2x = _mm256_sub_ps(x, x2);
-    __m256 r2y = _mm256_sub_ps(y, y2);
-    __m256 r2z = _mm256_sub_ps(z, z2);
-
-    // crossproduct
-    // (v0y*v1z - v0z*v1y);
-    // (v0z*v1x - v0x*v1z);
-    // (v0x*v1y - v0y*v1x);
-    __m256 r1_x_r2_x = _mm256_fmsub_ps(r1y, r2z, _mm256_mul_ps(r1z, r2y));
-    __m256 r1_x_r2_y = _mm256_fmsub_ps(r1z, r2x, _mm256_mul_ps(r1x, r2z));
-    __m256 r1_x_r2_z = _mm256_fmsub_ps(r1x, r2y, _mm256_mul_ps(r1y, r2x));
-
-    // magnitude & mag squared of crossproduct
-    __m256 r1_x_r2_mag_p2 = _mm256_fmadd_ps(r1_x_r2_x, r1_x_r2_x, _mm256_fmadd_ps(r1_x_r2_y, r1_x_r2_y, _mm256_mul_ps(r1_x_r2_z, r1_x_r2_z)));
-    __m256 r1_mag = _mm256_sqrt_ps(_mm256_fmadd_ps(r1x, r1x, _mm256_fmadd_ps(r1y, r1y, _mm256_mul_ps(r1z, r1z))));
-    __m256 r2_mag = _mm256_sqrt_ps(_mm256_fmadd_ps(r2x, r2x, _mm256_fmadd_ps(r2y, r2y, _mm256_mul_ps(r2z, r2z))));
-
-    // vector from point 1 to point 2 of segment
-    __m256 r0x = _mm256_sub_ps(x2, x1);
-    __m256 r0y = _mm256_sub_ps(y2, y1);
-    __m256 r0z = _mm256_sub_ps(z2, z1);
-
-    // dot product r0.r1 and r0.r2
-    __m256 r0_dot_r1 = _mm256_fmadd_ps(r0x, r1x, _mm256_fmadd_ps(r0y, r1y, _mm256_mul_ps(r0z, r1z)));
-    __m256 r0_dot_r2 = _mm256_fmadd_ps(r0x, r2x, _mm256_fmadd_ps(r0y, r2y, _mm256_mul_ps(r0z, r2z)));
-
-    __m256 numerator = _mm256_fmsub_ps(r0_dot_r1, r2_mag, _mm256_mul_ps(r0_dot_r2, r1_mag));
-
-    __m256 four_pi_r1_mag_r2_mag = _mm256_mul_ps(four_pi, _mm256_mul_ps(r1_mag, r2_mag)); // 4*pi*|r1|*|r2|
-    __m256 denominator;
-
-    // if (sigma_p4 < 1e-6f) {
-    //     // Singular Bio-Savart
-    //     denominator = _mm256_mul_ps(four_pi_r1_mag_r2_mag, r1_x_r2_mag_p2);
-    // } else {
-        // Vatistas smoothing kernel (n=2) (https://doi.org/10.3390/fluids7020081)
-        __m256 r1_x_r2_mag_p4 = _mm256_mul_ps(r1_x_r2_mag_p2, r1_x_r2_mag_p2); // ^2n
-        __m256 r0_mag_p2 = _mm256_fmadd_ps(r0x, r0x, _mm256_fmadd_ps(r0y, r0y, _mm256_mul_ps(r0z, r0z)));
-        __m256 r0_mag_p4 = _mm256_mul_ps(r0_mag_p2, r0_mag_p2); // ^2n
-        denominator = _mm256_mul_ps(four_pi_r1_mag_r2_mag, _mm256_sqrt_ps(_mm256_fmadd_ps(_mm256_set1_ps(sigma_p4), r0_mag_p4, r1_x_r2_mag_p4)));
-    //}
-    
-    __m256 coeff = _mm256_div_ps(numerator, denominator);
-
-    // add the influence and blend with mask
-    // the masks should be done independently for optimal ILP but if compiler smart he can do it
-    __m256 mask = _mm256_cmp_ps(r1_mag, threshold, _CMP_LT_OS);
-    mask = _mm256_or_ps(mask, _mm256_cmp_ps(r2_mag, threshold, _CMP_LT_OS));
-    mask = _mm256_or_ps(mask, _mm256_cmp_ps(r1_x_r2_mag_p2, threshold, _CMP_LT_OS));
-
-    vx = _mm256_blendv_ps(_mm256_mul_ps(r1_x_r2_x, coeff), zero, mask);
-    vy = _mm256_blendv_ps(_mm256_mul_ps(r1_x_r2_y, coeff), zero, mask);
-    vz = _mm256_blendv_ps(_mm256_mul_ps(r1_x_r2_z, coeff), zero, mask);
-}
-
-inline void kernel_influence_avx2(__m256& inf_x, __m256& inf_y, __m256& inf_z, __m256 x, __m256 y, __m256 z, __m256 x1, __m256 y1, __m256 z1, __m256 x2, __m256 y2, __m256 z2, f32 sigma_p4) {
-    __m256 vx, vy, vz;
-    micro_kernel_influence_avx2(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2, sigma_p4);
-    inf_x = _mm256_add_ps(inf_x, vx);
-    inf_y = _mm256_add_ps(inf_y, vy);
-    inf_z = _mm256_add_ps(inf_z, vz);
-    y = _mm256_xor_ps(y, _mm256_set1_ps(-0.0f)); // wing symmetry
-    micro_kernel_influence_avx2(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2, sigma_p4);
-    inf_x = _mm256_add_ps(inf_x, vx);
-    inf_y = _mm256_sub_ps(inf_y, vy); // HERE IS SUB INSTEAD OF ADD !
-    inf_z = _mm256_add_ps(inf_z, vz);
-}
-
-// Fill a column of the LHS matrix (influence of a single panel on all the others)
-template<bool Overwrite>
-inline void macro_kernel_avx2(Mesh& m, std::vector<f32>& lhs, u32 ia, u32 lidx, f32 sigma_p4) {
-    const u32 v0 = lidx + lidx / m.ns;
-    const u32 v1 = v0 + 1;
-    const u32 v3 = v0 + m.ns+1;
-    const u32 v2 = v3 + 1;
-    // in reality only load v1 & v2 and reuse data from previous v1 & v2 to be new v0 & v3 respectively
-    // 12 regs (6 loads + 6 reuse) -> these will get spilled once we get in the influence function
-    __m256 v0x = _mm256_broadcast_ss(&m.v.x[v0]);
-    __m256 v0y = _mm256_broadcast_ss(&m.v.y[v0]);
-    __m256 v0z = _mm256_broadcast_ss(&m.v.z[v0]);
-    __m256 v1x = _mm256_broadcast_ss(&m.v.x[v1]);
-    __m256 v1y = _mm256_broadcast_ss(&m.v.y[v1]);
-    __m256 v1z = _mm256_broadcast_ss(&m.v.z[v1]);
-    __m256 v2x = _mm256_broadcast_ss(&m.v.x[v2]);
-    __m256 v2y = _mm256_broadcast_ss(&m.v.y[v2]);
-    __m256 v2z = _mm256_broadcast_ss(&m.v.z[v2]);
-    __m256 v3x = _mm256_broadcast_ss(&m.v.x[v3]);
-    __m256 v3y = _mm256_broadcast_ss(&m.v.y[v3]);
-    __m256 v3z = _mm256_broadcast_ss(&m.v.z[v3]);
-
-    for (u32 ia2 = 0; ia2 <= m.nb_panels_wing()-8; ia2+=8) {
-        // loads (3 regs)
-        __m256 colloc_x = _mm256_loadu_ps(&m.colloc.x[ia2]);
-        __m256 colloc_y = _mm256_loadu_ps(&m.colloc.y[ia2]);
-        __m256 colloc_z = _mm256_loadu_ps(&m.colloc.z[ia2]);
-        // 3 regs to store induced velocity
-        __m256 inf_x = _mm256_setzero_ps();
-        __m256 inf_y = _mm256_setzero_ps();
-        __m256 inf_z = _mm256_setzero_ps();
-
-        kernel_influence_avx2(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v0x, v0y, v0z, v1x, v1y, v1z, sigma_p4);
-        kernel_influence_avx2(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v1x, v1y, v1z, v2x, v2y, v2z, sigma_p4);
-        kernel_influence_avx2(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v2x, v2y, v2z, v3x, v3y, v3z, sigma_p4);
-        kernel_influence_avx2(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v3x, v3y, v3z, v0x, v0y, v0z, sigma_p4);
-
-        // dot product
-        __m256 nx = _mm256_loadu_ps(&m.normal.x[ia2]);
-        __m256 ny = _mm256_loadu_ps(&m.normal.y[ia2]);
-        __m256 nz = _mm256_loadu_ps(&m.normal.z[ia2]);
-        __m256 ring_inf = _mm256_fmadd_ps(inf_x, nx, _mm256_fmadd_ps(inf_y, ny, _mm256_mul_ps(inf_z, nz)));
-
-        // store in col major order
-        if (Overwrite) {
-            _mm256_storeu_ps(&lhs[ia * m.nb_panels_wing() + ia2], ring_inf);
-        } else {                
-            __m256 lhs_ia = _mm256_loadu_ps(&lhs[ia * m.nb_panels_wing() + ia2]);
-            lhs_ia = _mm256_add_ps(lhs_ia, ring_inf);
-            _mm256_storeu_ps(&lhs[ia * m.nb_panels_wing() + ia2], lhs_ia);
-        }
-        if (ia == 0 && ia2 == 0) {
-            for (u32 iii = 0; iii < 8; iii++) {
-                std::printf("%f ", lhs[iii]);
-            }
-            std::printf("\n");
-        }
-    }
-}
-
-void BackendGeneric::compute_lhs(const FlowData& flow) {
-    SimpleTimer timer("LHS");
+void BackendCPU::compute_lhs(const FlowData& flow) {
+    tiny::ScopedTimer timer("LHS");
     Mesh& m = mesh;
+
     ispc::MeshProxy mesh_proxy = {
-        m.nc, m.ns, m.nb_panels_wing(),
+        m.ns, m.nc, m.nb_panels_wing(),
         {m.v.x.data(), m.v.y.data(), m.v.z.data()}, 
         {m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data()},
         {m.normal.x.data(), m.normal.y.data(), m.normal.z.data()}
     };
-    // const f32 sigma_p4 = pow<4>(flow.sigma_vatistas); // Vatistas coeffcient (^2n with n=2)
 
     const u32 start_wing = 0;
     const u32 end_wing = (m.nc - 1) * m.ns;
@@ -341,15 +68,6 @@ void BackendGeneric::compute_lhs(const FlowData& flow) {
 
     auto init = taskflow.placeholder();
     auto wing_pass = taskflow.for_each_index(start_wing, end_wing, [&] (u32 i) {
-        // macro_kernel_avx2<true>(m, lhs, i, i, flow.sigma_vatistas);
-        // macro_kernel_remainder_scalar<true>(m, lhs, i, i);
-        // kernel_influence(m.nc, m.ns,
-        // lhs.data(),
-        // m.v.x.data(), m.v.y.data(), m.v.z.data(),
-        // m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data(),
-        // m.normal.x.data(), m.normal.y.data(), m.normal.z.data(),
-        // i, i, flow.sigma_vatistas);
-        //macro_kernel_scalar<false>(m, lhs, i, i);
         ispc::kernel_influence(mesh_proxy, lhs.data(), i, i, flow.sigma_vatistas);
     });
 
@@ -360,14 +78,7 @@ void BackendGeneric::compute_lhs(const FlowData& flow) {
     auto wake_pass = taskflow.for_each_index(0u, m.ns, [&] (u32 j) {
         const u32 ia = (m.nc - 1) * m.ns + j;
         const u32 lidx = idx * m.ns + j;
-        kernel_influence(m.nc, m.ns,
-        lhs.data(),
-        m.v.x.data(), m.v.y.data(), m.v.z.data(),
-        m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data(),
-        m.normal.x.data(), m.normal.y.data(), m.normal.z.data(),
-        ia, lidx, flow.sigma_vatistas);
-        // macro_kernel_avx2<false>(m, lhs, ia, lidx, flow.sigma_vatistas);
-        // macro_kernel_remainder_scalar<false>(m, lhs, idx, idx);
+        ispc::kernel_influence(mesh_proxy, lhs.data(), ia, lidx, flow.sigma_vatistas);
     });
     auto back = taskflow.emplace([&]{
         idx++;
@@ -390,15 +101,15 @@ void kernel_cpu_rhs(u32 n, const float normal_x[], const float normal_y[], const
     }
 }
 
-void BackendGeneric::compute_rhs(const FlowData& flow) {
-    SimpleTimer timer("RHS");
+void BackendCPU::compute_rhs(const FlowData& flow) {
+    tiny::ScopedTimer timer("RHS");
     const Mesh& m = mesh;
     
     kernel_cpu_rhs(m.nb_panels_wing(), m.normal.x.data(), m.normal.y.data(), m.normal.z.data(), flow.freestream.x, flow.freestream.y, flow.freestream.z, rhs.data());
 }
 
-void BackendGeneric::compute_rhs(const FlowData& flow, const std::vector<f32>& section_alphas) {
-    SimpleTimer timer("Rebuild RHS");
+void BackendCPU::compute_rhs(const FlowData& flow, const std::vector<f32>& section_alphas) {
+    tiny::ScopedTimer timer("Rebuild RHS");
     assert(section_alphas.size() == mesh.ns);
     const Mesh& m = mesh;
     for (u32 i = 0; i < mesh.nc; i++) {
@@ -410,24 +121,24 @@ void BackendGeneric::compute_rhs(const FlowData& flow, const std::vector<f32>& s
     }
 }
 
-void BackendGeneric::lu_factor() {
-    SimpleTimer timer("Factor");
+void BackendCPU::lu_factor() {
+    tiny::ScopedTimer timer("Factor");
     const u32 n = mesh.nb_panels_wing();
     LAPACKE_sgetrf(LAPACK_COL_MAJOR, n, n, lhs.data(), n, ipiv.data());
 }
 
-void BackendGeneric::lu_solve() {
-    SimpleTimer timer("Solve");
+void BackendCPU::lu_solve() {
+    tiny::ScopedTimer timer("Solve");
     const u32 n = mesh.nb_panels_wing();
     std::copy(rhs.begin(), rhs.end(), gamma.begin());
 
     LAPACKE_sgetrs(LAPACK_COL_MAJOR, 'N', n, 1, lhs.data(), n, ipiv.data(), gamma.data(), n);
 }
 
-f32 BackendGeneric::compute_coefficient_cl(const FlowData& flow, const f32 area,
+f32 BackendCPU::compute_coefficient_cl(const FlowData& flow, const f32 area,
     const u32 j, const u32 n) {
     assert(n > 0);
-    assert(j >= 0 and j+n <= mesh.ns);
+    assert(j >= 0 && j+n <= mesh.ns);
     
     f32 cl = 0.0f;
 
@@ -448,7 +159,7 @@ f32 BackendGeneric::compute_coefficient_cl(const FlowData& flow, const f32 area,
     return cl;
 }
 
-linalg::alias::float3 BackendGeneric::compute_coefficient_cm(
+linalg::alias::float3 BackendCPU::compute_coefficient_cm(
     const FlowData& flow,
     const f32 area,
     const f32 chord,
@@ -456,7 +167,7 @@ linalg::alias::float3 BackendGeneric::compute_coefficient_cm(
     const u32 n)
 {
     assert(n > 0);
-    assert(j >= 0 and j+n <= mesh.ns);
+    assert(j >= 0 && j+n <= mesh.ns);
     linalg::alias::float3 cm(0.f, 0.f, 0.f);
 
     for (u32 u = 0; u < mesh.nc; u++) {
@@ -477,46 +188,121 @@ linalg::alias::float3 BackendGeneric::compute_coefficient_cm(
     return cm;
 }
 
-f32 BackendGeneric::compute_coefficient_cd(
+inline void kernel_biosavart(f32& vx, f32& vy, f32& vz, f32& x, f32& y, f32& z, f32& x1, f32& y1, f32& z1, f32& x2, f32& y2, f32& z2) {
+    static const f32 rcut = 1.0e-10f;
+    vx = 0.0f;
+    vy = 0.0f;
+    vz = 0.0f;
+
+    // Katz Plotkin, Low speed Aero | Eq 10.115
+    const f32 r1r2x =   (y-y1)*(z-z2) - (z-z1)*(y-y2);
+    const f32 r1r2y = -((x-x1)*(z-z2) - (z-z1)*(x-x2));
+    const f32 r1r2z =   (x-x1)*(y-y2) - (y-y1)*(x-x2);
+
+    const f32 r1 = std::sqrt(pow<2>(x-x1)+pow<2>(y-y1)+pow<2>(z-z1));
+    const f32 r2 = std::sqrt(pow<2>(x-x2)+pow<2>(y-y2)+pow<2>(z-z2));
+    const f32 square = pow<2>(r1r2x) + pow<2>(r1r2y) + pow<2>(r1r2z);
+    if ((r1<rcut) || (r2<rcut) || (square<rcut)) return;
+
+    const f32 r0r1 = (x2-x1)*(x-x1)+(y2-y1)*(y-y1)+(z2-z1)*(z-z1);
+    const f32 r0r2 = (x2-x1)*(x-x2)+(y2-y1)*(y-y2)+(z2-z1)*(z-z2);
+    const f32 coeff = 1.0f/(4.0f*PI_f*square) * (r0r1/r1 - r0r2/r2);
+
+    vx = coeff * r1r2x;
+    vy = coeff * r1r2y;
+    vz = coeff * r1r2z;
+}
+
+inline void kernel_symmetry(f32& inf_x, f32& inf_y, f32& inf_z, f32 x, f32 y, f32 z, f32 x1, f32 y1, f32 z1, f32 x2, f32 y2, f32 z2) {
+    f32 vx, vy, vz;
+    kernel_biosavart(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2);
+    inf_x += vx;
+    inf_y += vy;
+    inf_z += vz;
+    y = -y; // wing symmetry
+    kernel_biosavart(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2);
+    inf_x += vx;
+    inf_y -= vy;
+    inf_z += vz;
+}
+
+using namespace linalg::alias;
+
+inline float3 kernel_biosavart(const float3& colloc, const float3& vertex1, const float3& vertex2, const f32& sigma) {
+    const f32 rcut = 1e-10f; // highly tuned value
+    const float3 r0 = vertex2 - vertex1;
+    const float3 r1 = colloc - vertex1;
+    const float3 r2 = colloc - vertex2;
+    // Katz Plotkin, Low speed Aero | Eq 10.115
+    const float3 r1r2cross = linalg::cross(r1, r2);
+    const f32 r1_norm = linalg::length(r1);
+    const f32 r2_norm = linalg::length(r2);
+    const f32 square = linalg::length2(r1r2cross);
+    if ((r1_norm<rcut) || (r2_norm<rcut) || (square<rcut)) {
+        return float3{0.0f, 0.0f, 0.0f};
+    }
+    
+    const f32 smoother = sigma*sigma*linalg::length2(r0);
+    const f32 coeff = (linalg::dot(r0,r1)/r1_norm - linalg::dot(r0,r2)/r2_norm) / (4.0f*PI_f*std::sqrt(square*square + smoother*smoother));
+    return r1r2cross * coeff;
+}
+
+inline void kernel_symmetry(float3& inf, float3 colloc, const float3& vertex0, const float3& vertex1, const f32 sigma) {
+    float3 induced_speed = kernel_biosavart(colloc, vertex0, vertex1, sigma);
+    inf += induced_speed;
+    colloc.y = -colloc.y; // wing symmetry
+    float3 induced_speed_sym = kernel_biosavart(colloc, vertex0, vertex1, sigma);
+    inf.x += induced_speed_sym.x;
+    inf.y -= induced_speed_sym.y;
+    inf.z += induced_speed_sym.z;
+}
+
+f32 BackendCPU::compute_coefficient_cd(
     const FlowData& flow,
     const f32 area,
     const u32 j,
     const u32 n) 
 {
+    tiny::ScopedTimer timer("Compute CD");
     assert(n > 0);
-    assert(j >= 0 and j+n <= mesh.ns);
-
+    assert(j >= 0 && j+n <= mesh.ns);
+    std::fill(trefftz_buffer.begin(), trefftz_buffer.end(), 0.0f);
+    
     f32 cd = 0.0f;
     // Drag coefficent computed using Trefftz plane
     const u32 begin = j + mesh.nb_panels_wing();
     const u32 end = begin + n;
     // parallel for
     for (u32 ia = begin; ia < end; ia++) {
-        const f32 colloc_x = mesh.colloc.x[ia];
-        const f32 colloc_y = mesh.colloc.y[ia];
-        const f32 colloc_z = mesh.colloc.z[ia];
-        linalg::alias::float3 inf(0.f, 0.f, 0.f);
+        const u32 v0 = ia + ia / mesh.ns;
+        const u32 v1 = v0 + 1;
+        const u32 v3 = v0 + n+1;
+        const u32 v2 = v3 + 1;
+
+        const float3 vertex0{mesh.v.x[v0], mesh.v.y[v0], mesh.v.z[v0]};
+        const float3 vertex1{mesh.v.x[v1], mesh.v.y[v1], mesh.v.z[v1]};
+        const float3 vertex2{mesh.v.x[v2], mesh.v.y[v2], mesh.v.z[v2]};
+        const float3 vertex3{mesh.v.x[v3], mesh.v.y[v3], mesh.v.z[v3]};
+
+        const f32 gammaw = gamma[ia - mesh.ns];
+
         for (u32 ia2 = begin; ia2 < end; ia2++) {
+            const float3 colloc(mesh.colloc.x[ia2], mesh.colloc.y[ia2], mesh.colloc.z[ia2]);
+            const float3 normal(mesh.normal.x[ia2], mesh.normal.y[ia2], mesh.normal.z[ia2]);
             linalg::alias::float3 inf2(0.f, 0.f, 0.f);
-            linalg::alias::float3 v0 = mesh.get_v0(ia2);
-            linalg::alias::float3 v1 = mesh.get_v1(ia2);
-            linalg::alias::float3 v2 = mesh.get_v2(ia2);
-            linalg::alias::float3 v3 = mesh.get_v3(ia2);
             // Influence from the streamwise vortex lines
-            kernel_symmetry(inf2.x, inf2.y, inf2.z, colloc_x, colloc_y, colloc_z, v1.x, v1.y, v1.z, v2.x, v2.y, v2.z);
-            kernel_symmetry(inf2.x, inf2.y, inf2.z, colloc_x, colloc_y, colloc_z, v3.x, v3.y, v3.z, v0.x, v0.y, v0.z);
-            f32 gamma_w = gamma[(mesh.nc-1)*mesh.ns + ia2 % mesh.ns];
+            kernel_symmetry(inf2, colloc, vertex1, vertex2, flow.sigma_vatistas);
+            kernel_symmetry(inf2, colloc, vertex3, vertex0, flow.sigma_vatistas);
             // This is the induced velocity calculated with the vortex (gamma) calculated earlier (according to kutta condition)
-            inf += gamma_w * inf2;
+            trefftz_buffer[ia2 - begin] += gammaw * linalg::dot(inf2, normal);
         }
-        const linalg::alias::float3 normal{mesh.normal.x[ia], mesh.normal.y[ia], mesh.normal.z[ia]};
-        const f32 w_ind = linalg::dot(inf, normal);
-        const u32 col = ia % mesh.ns;
-        linalg::alias::float3 v0 = mesh.get_v0(ia);
-        linalg::alias::float3 v1 = mesh.get_v1(ia);
-        const f32 dl = linalg::length(v1 - v0);
-        cd -= 0.5f * flow.rho * gamma[(mesh.nc-1)*mesh.ns + col] * w_ind * dl;
     }
-    cd /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area;
+
+    for (u32 i = 0; i < mesh.ns; i++) {
+        const u32 li = (mesh.nc-1) * mesh.ns + i;
+        const f32 dl = mesh.strip_width(i);
+        cd -= gamma[li] * trefftz_buffer[i] * dl; // used to have 0.5f * flow.rho
+    }
+    cd /= linalg::length2(flow.freestream) * area;
     return cd;
 }
\ No newline at end of file
diff --git a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp
index af2a0a7..6d6e8d1 100644
--- a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp
+++ b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp
@@ -16,7 +16,9 @@ inline float3 kernel_biosavart(const float3& colloc, const float3& vertex1, cons
     const f32 r1_norm = linalg::length(r1);
     const f32 r2_norm = linalg::length(r2);
     const f32 square = linalg::length2(r1r2cross);
-    if ((r1_norm<rcut) || (r2_norm<rcut) || (square<rcut)) return float3{0.0f, 0.0f, 0.0f};
+    if ((r1_norm<rcut) || (r2_norm<rcut) || (square<rcut)) {
+        return float3{0.0f, 0.0f, 0.0f};
+    }
     
     const f32 smoother = sigma*sigma*linalg::length2(r0);
     const f32 coeff = (linalg::dot(r0,r1)/r1_norm - linalg::dot(r0,r2)/r2_norm) / (4.0f*PI_f*std::sqrt(square*square + smoother*smoother));
diff --git a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
index 72151a9..41632a8 100644
--- a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
+++ b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
@@ -28,7 +28,6 @@ inline float length(const float3& v) {return LENGTH(v);}
 inline uniform float length(const uniform float3& v) {return LENGTH(v);}
 
 // Aggregate Structures
-
 export struct SoA3D {
     uniform float* uniform x;
     uniform float* uniform y;
@@ -46,7 +45,8 @@ export struct MeshProxy {
 
 // Bio-savart Kernel
 #define RCUT 1e-10f // highly tuned value
-#define PI_f 3.14159274f
+
+#define PI_f 3.141593f
 
 inline float3 kernel_biosavart(float3& colloc, const uniform float3& vertex1, const uniform float3& vertex2, const uniform float& sigma) {
     uniform float3 r0 = vertex2 - vertex1;
@@ -57,19 +57,23 @@ inline float3 kernel_biosavart(float3& colloc, const uniform float3& vertex1, co
     float r1_norm = length(r1);
     float r2_norm = length(r2);
     float square = length2(r1r2cross);
+    
     if ((r1_norm<RCUT) || (r2_norm<RCUT) || (square<RCUT)) {
         float3 res = {0.0f, 0.0f, 0.0f};
         return res;
     }
-    
+
     uniform float smoother = sigma*sigma*length2(r0);
-    float coeff = (dot(r0,r1)/r1_norm - dot(r0,r2)/r2_norm) / (4.0f*PI_f*sqrt(square*square + smoother*smoother));
+
+    float coeff = (dot(r0,r1)*r2_norm - dot(r0, r2)*r1_norm) / (4.0f*PI_f*sqrt(square*square + smoother*smoother)*r1_norm*r2_norm);
     return r1r2cross * coeff;
 }
 
 inline void kernel_symmetry(float3& inf, float3 colloc, const uniform float3& vertex0, const uniform float3& vertex1, const uniform float& sigma) {
     float3 induced_speed = kernel_biosavart(colloc, vertex0, vertex1, sigma);
-    inf += induced_speed;
+    inf.x += induced_speed.x;
+    inf.y += induced_speed.y;
+    inf.z += induced_speed.z;
     colloc.y = -colloc.y; // wing symmetry
     float3 induced_speed_sym = kernel_biosavart(colloc, vertex0, vertex1, sigma);
     inf.x += induced_speed_sym.x;
@@ -104,4 +108,11 @@ export void kernel_influence(
         kernel_symmetry(inf, colloc, vertex3, vertex0, sigma);
         lhs[ia * m.nb_panels + ia2] += dot(inf, normal); // store
     }
-}
\ No newline at end of file
+}
+
+// export uniform float kernel_trefftz_cd(
+//     uniform const MeshProxy& m,
+//     uniform float* uniform gamma,
+//     uniform unsigned int ia, uniform float sigma
+//     ) {
+// }
\ No newline at end of file
diff --git a/vlm/backends/cpu/xmake.lua b/vlm/backends/cpu/xmake.lua
index eb4c0a2..0c39656 100644
--- a/vlm/backends/cpu/xmake.lua
+++ b/vlm/backends/cpu/xmake.lua
@@ -9,7 +9,7 @@ target("backend-cpu")
     add_packages("openblas_custom", { public = true })
 
     add_rules("utils.ispc", {header_extension = "_ispc.h"})
-    set_values("ispc.flags", "--target=host")
+    set_values("ispc.flags", {"--target=host", "-O1"})
     add_files("src/*.ispc")
 
     add_includedirs("../../include")
diff --git a/vlm/backends/cuda/include/vlm_backend_cuda.hpp b/vlm/backends/cuda/include/vlm_backend_cuda.hpp
index ba1c273..892ab15 100644
--- a/vlm/backends/cuda/include/vlm_backend_cuda.hpp
+++ b/vlm/backends/cuda/include/vlm_backend_cuda.hpp
@@ -10,7 +10,7 @@ namespace vlm {
 
 class BackendCUDA : public Backend {
     public:
-        BackendGeneric default_backend; // temporary
+        BackendCPU default_backend; // temporary
 
         float* d_lhs = nullptr;
         float* d_rhs = nullptr;
diff --git a/vlm/dev/main.cpp b/vlm/dev/main.cpp
index 5b3ad15..e40b066 100644
--- a/vlm/dev/main.cpp
+++ b/vlm/dev/main.cpp
@@ -56,7 +56,7 @@ int main(int argc, char **argv) {
     });
 
     try {
-        //vlm::Executor::instance(1); // 1 thread
+        // vlm::Executor::instance(1); // 1 thread
         LinearVLM solver(cfg);
         std::vector<f32> alphas = cfg().section("solver").get_vector<f32>("alphas", {0.0f});
         std::transform(alphas.begin(), alphas.end(), alphas.begin(), 
diff --git a/vlm/include/vlm_mesh.hpp b/vlm/include/vlm_mesh.hpp
index 48ff59d..e75c450 100644
--- a/vlm/include/vlm_mesh.hpp
+++ b/vlm/include/vlm_mesh.hpp
@@ -56,6 +56,7 @@ class Mesh {
     f32 panels_area(const u32 i, const u32 j, const u32 m, const u32 n) const;
     f32 panels_area_xy(const u32 i, const u32 j, const u32 m, const u32 n) const;
     f32 panel_width_y(const u32 i, const u32 j) const;
+    f32 strip_width(const u32 j) const;
     f32 chord_length(const u32 j) const;
     f32 chord_mean(const u32 j, const u32 n) const;
 
diff --git a/vlm/src/vlm_backend.cpp b/vlm/src/vlm_backend.cpp
index 195e943..a905638 100644
--- a/vlm/src/vlm_backend.cpp
+++ b/vlm/src/vlm_backend.cpp
@@ -31,7 +31,7 @@ std::unique_ptr<Backend> vlm::create_backend(const std::string& backend_name, Me
 
     #ifdef VLM_CPU
     if (backend_name == "cpu") {
-        return std::make_unique<BackendGeneric>(mesh);
+        return std::make_unique<BackendCPU>(mesh);
     }
     #endif
     #ifdef VLM_CUDA
diff --git a/vlm/src/vlm_mesh.cpp b/vlm/src/vlm_mesh.cpp
index fbd234f..87d7c85 100644
--- a/vlm/src/vlm_mesh.cpp
+++ b/vlm/src/vlm_mesh.cpp
@@ -149,6 +149,13 @@ f32 Mesh::panel_width_y(const u32 i, const u32 j) const {
     return v.y[j + 1 + i * ld] - v.y[j + i * ld];
 }
 
+f32 Mesh::strip_width(const u32 j) const {
+    assert(j < ns);
+    const linalg::alias::float3 v0 = get_v0(j);
+    const linalg::alias::float3 v1 = get_v1(j);
+    return linalg::length(v1 - v0);
+}
+
 /// @brief Computes the chord length of a chordwise segment
 /// @details Since the mesh follows the camber line, the chord length is computed
 /// as the distance between the first and last vertex of a chordwise segment