diff --git a/vlm/backends/cpu/include/vlm_backend_cpu.hpp b/vlm/backends/cpu/include/vlm_backend_cpu.hpp
index b66435a..2f74569 100644
--- a/vlm/backends/cpu/include/vlm_backend_cpu.hpp
+++ b/vlm/backends/cpu/include/vlm_backend_cpu.hpp
@@ -7,7 +7,7 @@ namespace vlm {
 
 class BackendCPU : public Backend {
     public:
-        BackendCPU();
+        BackendCPU(MeshGeom* mesh, u64 timesteps);
         ~BackendCPU();
         void reset() override;
         void lhs_assemble() override;
@@ -24,6 +24,9 @@ class BackendCPU : public Backend {
         void compute_delta_gamma() override;
         void set_velocities(const linalg::alias::float3& vel) override;
         void set_velocities(const SoA_3D_t<f32>& vels) override;
+    private:
+        f32 mesh_mac(u64 j, u64 n) override; // mean chord
+        f32 mesh_area(const u64 i, const u64 j, const u64 m, const u64 n) override; // mean span
 };
 
 } // namespace vlm
\ No newline at end of file
diff --git a/vlm/backends/cpu/src/vlm_backend_cpu.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp
index 361c8c0..c6fd580 100644
--- a/vlm/backends/cpu/src/vlm_backend_cpu.cpp
+++ b/vlm/backends/cpu/src/vlm_backend_cpu.cpp
@@ -20,49 +20,42 @@
 #include <cblas.h>
 
 using namespace vlm;
-
-template<typename T>
-void print_buffer(const T* start, u64 size) {
-    std::cout << "[";
-    for (u64 i = 0; i < size; i++) {
-        std::cout << start[i] << ",";
-    }
-    std::cout << "]\n";
-}
-
 using namespace linalg::ostream_overloads;
 
 BackendCPU::~BackendCPU() = default; // Destructor definition
 
-// TODO: replace any kind of size arithmetic with methods
-BackendCPU::BackendCPU(Mesh& mesh) : Backend(mesh) {
-    lhs.resize(mesh.nb_panels_wing() * mesh.nb_panels_wing());
-
-    rollup_vertices.resize(mesh.nb_vertices_total()); // TODO: exclude the wing vertices
-    local_velocities.resize(mesh.nb_panels_wing());
-    rhs.resize(mesh.nb_panels_wing());
-    ipiv.resize(mesh.nb_panels_wing());
-    gamma.resize((mesh.nc + mesh.nw) * mesh.ns); // store wake gamma as well
-    gamma_prev.resize(mesh.nb_panels_wing());
-    delta_gamma.resize(mesh.nb_panels_wing());
-    trefftz_buffer.resize(mesh.ns);
+BackendCPU::BackendCPU(MeshGeom* mesh, u64 timesteps) {
+    allocator.h_malloc = std::malloc;
+    allocator.d_malloc = std::malloc;
+    allocator.h_free = std::free;
+    allocator.d_free = std::free;
+    allocator.hh_memcpy = std::memcpy;
+    allocator.hd_memcpy = std::memcpy;
+    allocator.dh_memcpy = std::memcpy;
+    allocator.dd_memcpy = std::memcpy;
+    allocator.h_memset = std::memset;
+    allocator.d_memset = std::memset;
+    init(mesh, timesteps);
 }
 
 void BackendCPU::reset() {
     // TODO: verify that these are needed
-    std::fill(gamma.begin(), gamma.end(), 0.0f);
-    std::fill(lhs.begin(), lhs.end(), 0.0f);
-    std::fill(rhs.begin(), rhs.end(), 0.0f);
+    // std::fill(gamma.begin(), gamma.end(), 0.0f);
+    // std::fill(lhs.begin(), lhs.end(), 0.0f);
+    // std::fill(rhs.begin(), rhs.end(), 0.0f);
 }
 
 void BackendCPU::compute_delta_gamma() {
+    const u64 nc = hd_mesh->nc;
+    const u64 ns = hd_mesh->ns;
+    const u64 nw = hd_mesh->nw;
     // const tiny::ScopedTimer timer("Delta Gamma");
-    std::copy(gamma.data(), gamma.data()+mesh.ns, delta_gamma.data());
+    std::copy(dd_data->gamma, dd_data->gamma+ns, dd_data->delta_gamma);
 
     // note: this is efficient as the memory is contiguous
-    for (u64 i = 1; i < mesh.nc; i++) {
-        for (u64 j = 0; j < mesh.ns; j++) {
-            delta_gamma[i*mesh.ns + j] = gamma[i*mesh.ns + j] - gamma[(i-1)*mesh.ns + j];
+    for (u64 i = 1; i < nc; i++) {
+        for (u64 j = 0; j < ns; j++) {
+            dd_data->delta_gamma[i*ns + j] = dd_data->gamma[i*ns + j] - dd_data->gamma[(i-1)*ns + j];
         }
     }
 }
@@ -114,10 +107,16 @@ void BackendCPU::lhs_assemble() {
 
 void BackendCPU::compute_rhs() {
     const tiny::ScopedTimer timer("RHS");
-    const Mesh& m = mesh;
-
-    for (u64 i = 0; i < m.nb_panels_wing(); i++) {
-        rhs[i] = - (local_velocities.x[i] * m.normal.x[i] + local_velocities.y[i] * m.normal.y[i] + local_velocities.z[i] * m.normal.z[i]);
+    const u64 nc = hd_mesh->nc;
+    const u64 ns = hd_mesh->ns;
+    const u64 nw = hd_mesh->nw;
+    const u64 nb_panels_wing = nc * ns;
+
+    for (u64 i = 0; i < nb_panels_wing(); i++) {
+        rhs[i] = - (
+            dd_data->local_velocities[i + 0 * nb_panels_wing] * dd_mesh->normal[i + 0 * nb_panels_wing] +
+            dd_data->local_velocities[i + 1 * nb_panels_wing] * dd_mesh->normal[i + 1 * nb_panels_wing] +
+            dd_data->local_velocities[i + 2 * nb_panels_wing] * dd_mesh->normal[i + 2 * nb_panels_wing]);
     }
 }
 
@@ -361,3 +360,84 @@ void BackendCPU::set_velocities(const SoA_3D_t<f32>& vels) {
     std::copy(vels.y.begin(), vels.y.end(), local_velocities.y.begin());
     std::copy(vels.z.begin(), vels.z.end(), local_velocities.z.begin());
 }
+
+/// @brief Computes the chord length of a chordwise segment
+/// @details Since the mesh follows the camber line, the chord length is computed
+/// as the distance between the first and last vertex of a chordwise segment
+/// @param j chordwise segment index
+f32 Mesh::chord_length(const u64 j) const {
+    const f32 dx = v.x[j + nc * (ns+1)] - v.x[j];
+    const f32 dy = 0.0f; // chordwise segments are parallel to the x axis
+    const f32 dz = v.z[j + nc * (ns+1)] - v.z[j];
+
+    return std::sqrt(dx * dx + dy * dy + dz * dz);
+}
+
+// i: chordwise index, j: spanwise index, k: x,y,z dim
+#define PTR_MESH_V(m, i,j,k) (m->vertices + j + i * (m->ns+1) + k * (m->nc+m->nw+1) * (m->ns+1))
+
+/// @brief Computes the mean chord of a set of panels
+/// @details
+/// Mean Aerodynamic Chord = \frac{2}{S} \int_{0}^{b/2} c(y)^{2} dy
+/// Integration using the Trapezoidal Rule
+/// Validated against empirical formulas for tapered wings
+/// @param j first panel index spanwise
+/// @param n number of panels spanwise
+/// @return mean chord of the set of panels
+f32 BackendCPU::mesh_mac(u64 j, u64 n) {
+    assert(j + n <= hd_mesh->ns); // spanwise range
+    assert(n > 0);
+    u64 nc = hd_mesh->nc;
+    u64 ns = hd_mesh->ns;
+    u64 nw = hd_mesh->nw;
+
+    // Leading edge vertex
+    f32* lvx = PTR_MESH_V(hd_mesh, 0,0,0);
+    f32* lvy = PTR_MESH_V(hd_mesh, 0,0,1);
+    f32* lvz = PTR_MESH_V(hd_mesh, 0,0,2);
+    // Trailing edge vertex
+    f32* tvx = PTR_MESH_V(hd_mesh, nc,0,0);
+    f32* tvy = PTR_MESH_V(hd_mesh, nc,0,1);
+    f32* tvz = PTR_MESH_V(hd_mesh, nc,0,2);
+
+    f32 mac = 0.0f;
+    // loop over panel chordwise sections in spanwise direction
+    // Note: can be done optimally with vertical fused simd
+    for (u64 v = j; v < j+n; v++) {
+        // left and right chord lengths
+        const f32 dx0 = tvx[v] - lvx[v];
+        const f32 dy0 = tvy[v] - lvy[v];
+        const f32 dz0 = tvz[v] - lvz[v];
+        const f32 dx1 = tvx[v + 1] - lvx[v + 1];
+        const f32 dy1 = tvy[v + 1] - lvy[v + 1];
+        const f32 dz1 = tvz[v + 1] - lvz[v + 1];
+        const f32 c0 = std::sqrt(dx0 * dx0 + dy0 * dy0 + dz0 * dz0);
+        const f32 c1 = std::sqrt(dx1 * dx1 + dy1 * dy1 + dz1 * dz1);
+        // Panel width
+        const f32 dx3 = lvx[v + 1] - lvx[v];
+        const f32 dy3 = lvy[v + 1] - lvy[v];
+        const f32 dz3 = lvz[v + 1] - lvz[v];
+        const f32 width = std::sqrt(dx3 * dx3 + dy3 * dy3 + dz3 * dz3);
+
+        mac += 0.5f * (c0 * c0 + c1 * c1) * width;
+    }
+    // Since we divide by half the total wing area (both sides) we dont need to multiply by 2
+    return mac / mesh_area(0, j, nc, n);
+}
+
+f32 BackendCPU::mesh_area(const u64 i, const u64 j, const u64 m, const u64 n) {
+    assert(i + m <= nc);
+    assert(j + n <= ns);
+    u64 nc = hd_mesh->nc;
+    u64 ns = hd_mesh->ns;
+    u64 nw = hd_mesh->nw;
+
+    const f32* areas = hd_mesh->area + j + i * (m->ns);
+    f32 area_sum = 0.0f;
+    for (u64 u = 0; u < m; u++) {
+        for (u64 v = 0; v < n; v++) {
+            area_sum += areas[v + u * ns];
+        }
+    }
+    return area_sum;
+}
diff --git a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
index 9a1b5bd..5f989af 100644
--- a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
+++ b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
@@ -27,30 +27,29 @@ inline uniform float length2(const uniform float3& v) {return LENGTH2(v);}
 inline float length(const float3& v) {return LENGTH(v);}
 inline uniform float length(const uniform float3& v) {return LENGTH(v);}
 
-// Aggregate Structures
-export struct SoA3D {
-    uniform float* uniform x;
-    uniform float* uniform y;
-    uniform float* uniform z;
-};
-
-export struct MeshProxy {
-    uniform uint64 ns;
-    uniform uint64 nc;
-    uniform uint64 nb_panels;
-    uniform SoA3D v; // vertices
-    uniform SoA3D colloc; // collocation points
-    uniform SoA3D normal; // normals
-};
-
-export struct MeshView {
-    uniform uint64 ni; // wing chord panels
-    uniform uint64 nj; // wing span panels
-    uniform uint64 nw; // wake chord capacity
-    uniform uint64 cw; // number of active wake panels
-    uniform SoA3D v; // vertices
-    uniform SoA3D colloc; // collocation points
-    uniform SoA3D normal; // normals
+struct Mesh2 {
+    f32* vertices = nullptr; // (nc+nw+1)*(ns+1)*3
+    // TODO: evaluate if we really need those values or we can just compute on the fly in the kernels
+    f32* normals = nullptr; // nc*ns*3
+    f32* colloc = nullptr; // nc*ns*3
+    f32* area = nullptr; // nc*ns
+
+    u64 nc = 0; // number of wing panels chordwise
+    u64 ns = 0; // number of wing panels spanwise
+    u64 nw = 0; // number of wake panels chordwise
+    u64 nwa = 0; // number of wake panels active chordwise
+
+    f32 s_ref = 0.0f; // reference area
+    f32 c_ref = 0.0f; // reference chord
+    f32 frame[16] = {
+        1.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 1.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 1.0f
+    }; // Col major order (TODO: move this to kinematic tracker)
+
+    char* name = nullptr; // mesh name
+    bool lifting = true;
 };
 
 // Bio-savart Kernel
@@ -58,6 +57,7 @@ export struct MeshView {
 #define RCUT2 1e-5f
 
 #define PI_f 3.141593f
+#define PTR_MESH_V(m, i,j,k) (m->vertices + j + i * (m->ns+1) + k * (m->nc+m->nw+1) * (m->ns+1))
 
 template<typename C, typename V>
 inline float3 kernel_biosavart(C& colloc, const V& vertex1, const V& vertex2, const uniform float& sigma) {
@@ -95,7 +95,7 @@ inline void kernel_symmetry(float3& inf, C colloc, const V& vertex0, const V& ve
 }
 
 export void kernel_influence(
-    uniform const MeshProxy& m,
+    uniform const Mesh2* uniform m,
     uniform float* uniform lhs,
     uniform uint64 ia, uniform uint64 lidx, uniform float sigma
     ) {
@@ -105,12 +105,14 @@ export void kernel_influence(
     const uniform uint64 v3 = v0 + m.ns+1;
     const uniform uint64 v2 = v3 + 1;
 
-
     // Broadcast vertices
-    uniform float3 vertex0 = {m.v.x[v0], m.v.y[v0], m.v.z[v0]};
-    uniform float3 vertex1 = {m.v.x[v1], m.v.y[v1], m.v.z[v1]};
-    uniform float3 vertex2 = {m.v.x[v2], m.v.y[v2], m.v.z[v2]};
-    uniform float3 vertex3 = {m.v.x[v3], m.v.y[v3], m.v.z[v3]};
+    const uniform float* uniform vx = PTR_MESH_V(m, 0, 0, 0);
+    const uniform float* uniform vy = PTR_MESH_V(m, 0, 0, 1);
+    const uniform float* uniform vz = PTR_MESH_V(m, 0, 0, 2);
+    uniform float3 vertex0 = {vx[v0], vy[v0], vz[v0]};
+    uniform float3 vertex1 = {vx[v1], vy[v1], vz[v1]};
+    uniform float3 vertex2 = {vx[v2], vy[v2], vz[v2]};
+    uniform float3 vertex3 = {vx[v3], vy[v3], vz[v3]};
 
     // print("Influencer %: \n", lidx);
     // print("Vertex 0: % % %\n", vertex0.x, vertex0.y, vertex0.z);
diff --git a/vlm/backends/cuda/include/vlm_backend_cuda.hpp b/vlm/backends/cuda/include/vlm_backend_cuda.hpp
index e088698..3fd040d 100644
--- a/vlm/backends/cuda/include/vlm_backend_cuda.hpp
+++ b/vlm/backends/cuda/include/vlm_backend_cuda.hpp
@@ -25,21 +25,9 @@ struct MeshProxy {
 
 class BackendCUDA : public Backend {
     public:
-        BackendCPU default_backend; // temporary
 
-        float* d_lhs = nullptr;
-        float* d_rhs = nullptr;
-        float* d_gamma = nullptr;
-        float* d_delta_gamma = nullptr;
 
-        int* d_solver_info = nullptr;
-        int* d_solver_ipiv = nullptr;
-        float* d_solver_buffer = nullptr;
-        
-        MeshProxy h_mesh;
-        MeshProxy* d_mesh;
-
-        BackendCUDA(Mesh& mesh);
+        BackendCUDA(MeshGeom* mesh, u64 timesteps);
         ~BackendCUDA();
 
         void reset() override;
diff --git a/vlm/include/vlm_allocator.hpp b/vlm/include/vlm_allocator.hpp
index ae72014..3ac1e88 100644
--- a/vlm/include/vlm_allocator.hpp
+++ b/vlm/include/vlm_allocator.hpp
@@ -2,10 +2,10 @@
 
 namespace vlm {
 
-typedef void* (*malloc_f)(unsigned long long size);
+typedef void* (*malloc_f)(size_t size);
 typedef void (*free_f)(void* ptr);
-typedef void* (*memcpy_f)(void* dst, const void* src, unsigned long long size);
-typedef void* (*memset_f)(void* dst, int value, unsigned long long size);
+typedef void* (*memcpy_f)(void* dst, const void* src, size_t size);
+typedef void* (*memset_f)(void* dst, int value, size_t size);
 
 struct Allocator {
     malloc_f h_malloc;
diff --git a/vlm/include/vlm_backend.hpp b/vlm/include/vlm_backend.hpp
index 148316b..adf9a18 100644
--- a/vlm/include/vlm_backend.hpp
+++ b/vlm/include/vlm_backend.hpp
@@ -18,16 +18,21 @@ class Backend {
         MeshGeom* dd_mesh_geom; 
 
         // Mutable meshes (temporal state)
-        Mesh2* hh_mesh; // host ptr to host buffers for io
+        // Mesh2* hh_mesh; // host ptr to host buffers for io
         Mesh2* hd_mesh; // host ptr to device buffers
         Mesh2* dd_mesh; // device ptr to device buffers for kernels
 
         Data* hd_data;
         Data* dd_data;
 
+        i32* d_solver_info = nullptr;
+        i32* d_solver_ipiv = nullptr;
+        f32* d_solver_buffer = nullptr;
+
         f32 sigma_vatistas = 0.0f;
-        Backend(MeshGeom* mesh_geom);
-        void init(u64 timesteps);
+        Backend() = default;
+        ~Backend();
+        void init(MeshGeom* mesh_geom, u64 timesteps); // Acts as delayed constructor
         virtual void reset() = 0;
         virtual void lhs_assemble() = 0;
         virtual void compute_rhs() = 0;
@@ -46,7 +51,10 @@ class Backend {
         virtual void compute_delta_gamma() = 0;
         virtual void set_velocities(const linalg::alias::float3& vel) = 0;
         virtual void set_velocities(const SoA_3D_t<f32>& vels) = 0;
-        virtual ~Backend() = default;
+
+    private:
+        virtual f32 mesh_mac(u64 j, u64 n) = 0; // mean chord
+        virtual f32 mesh_area(const u64 i, const u64 j, const u64 m, const u64 n) = 0; // mean span
 };
 
 std::unique_ptr<Backend> create_backend(const std::string& backend_name, const MeshGeom* mesh, int timesteps);
diff --git a/vlm/include/vlm_data.hpp b/vlm/include/vlm_data.hpp
index fcd07cf..0f8b33b 100644
--- a/vlm/include/vlm_data.hpp
+++ b/vlm/include/vlm_data.hpp
@@ -2,25 +2,23 @@
 
 #include "vlm_types.hpp"
 #include "vlm_fwd.hpp"
+#include "vlm_allocator.hpp"
 #include "tinyinterpolate.hpp"
 
 namespace vlm {
 
 struct Data {
-    f32* lhs = nullptr;
-    f32* rhs = nullptr;
-    f32* gamma = nullptr;
-    f32* delta_gamma = nullptr;
-    f32* rollup_vertices = nullptr;
-    f32* local_velocities = nullptr;
-    f32* trefftz_buffer = nullptr;
-
-    i32* solver_info = nullptr;
-    i32* solver_ipiv = nullptr;
-    f32* solver_buffer = nullptr;
+    f32* lhs = nullptr; // (ns*nc)^2
+    f32* rhs = nullptr; // ns*nc
+    f32* gamma = nullptr; // (nc+nw)*nw
+    f32* delta_gamma = nullptr; // nc*ns
+    f32* rollup_vertices = nullptr; // (nc+nw+1)*(ns+1)*3
+    f32* local_velocities = nullptr; // ns*nc*3
+    f32* trefftz_buffer = nullptr; // ns TODO: can we get rid of this ??
 };
 
-void data_alloc(const Allocator* allocator, Data* data, u64 n, u64 m);
+void data_alloc(const malloc_f malloc, Data* data, u64 nc, u64 ns, u64 nw);
+void data_free(const free_f free, Data* data);
 
 // Flow characteristics
 class FlowData {
diff --git a/vlm/include/vlm_mesh.hpp b/vlm/include/vlm_mesh.hpp
index b996911..0c8a422 100644
--- a/vlm/include/vlm_mesh.hpp
+++ b/vlm/include/vlm_mesh.hpp
@@ -23,7 +23,12 @@ struct Mesh2 {
 
     f32 s_ref = 0.0f; // reference area
     f32 c_ref = 0.0f; // reference chord
-    f32 frame[16] = {}; // 4x4 homogeneous matrix in col major order
+    f32 frame[16] = {
+        1.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 1.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 1.0f
+    }; // Col major order (TODO: move this to kinematic tracker)
 
     char* name = nullptr; // mesh name
     bool lifting = true;
@@ -41,93 +46,93 @@ void mesh_alloc(const malloc_f malloc, Mesh2* mesh, u64 nc, u64 ns, u64 nw, u64
 void mesh_free(const free_f free, Mesh2* mesh);
 void mesh_copy(const memcpy_f memcpy, Mesh2* dst, const Mesh2* src);
 
-// === STRUCTURED MESH ===
-// nc : number of panels chordwise
-// ns : number of panels spanwise
-// nw : number of panels in chordwise wake
-// ncw : nc + nw
-class Mesh {
-    public:
-    // Unstructured members (for exporting results)
-    // ---------------------
-    // All vertices stored in single SoA for result exporting
-    // (stored in span major order)
-    SoA_3D_t<f32> v; // size (ncw+1)*(ns+1)
-    // Offsets for indexing in connectivity array for each panel
-    std::vector<u64> offsets = {}; // size nc*ns + 1
-    // Panel-vertex connectivity
-    // vertices indices of a panel: connectivity[offsets[i]] to connectivity[offsets[i+1]]
-    std::vector<u64> connectivity = {}; // size 4*(nc*ns)
-
-    // Panels metrics (stored span major order)
-    // ---------------------
-    // Collocation points of panels
-    SoA_3D_t<f32> colloc; // size ncw*ns
-    // Normals of panels
-    SoA_3D_t<f32> normal; // size ncw*ns
-    // Area of panels
-    std::vector<f32> area = {}; // size ncw*ns
-
-    // Structured dimensions
-    // ---------------------
-    u64 nc = 1; // number of panels chordwise
-    u64 ns = 1; // number of panels spanwise
-    u64 nw; // number of wake panels chordwise (max capacity)
-    u64 current_nw = 0; // current number of built wake panels (temporary)
-
-    // Analytical quanities when provided
-    // ---------------------
-    f32 s_ref = 0.0f; // reference area (of wing)
-    f32 c_ref = 0.0f; // reference chord (of wing)
-
-    linalg::alias::float4x4 frame = linalg::identity; // transformation matrix
-    linalg::alias::float3 ref_pt = {0.25f, 0.0f, 0.0f}; // TODO: deprecate
-
-    void update_wake(const linalg::alias::float3& u_inf); // update wake vertices
-    void correction_high_aoa(f32 alpha_rad); // correct collocation points for high aoa
-    void create_vortex_panels(); // create true vortex panels
-    void compute_connectivity(); // fill offsets, connectivity
-    void compute_metrics_wing(); // fill colloc, normal, area
-    void compute_metrics_wake();
-    void compute_metrics_i(u64 i);
-    void move(const linalg::alias::float4x4& transform, const SoA_3D_t<f32>& origin_pos);
-    void resize_wake(const u64 nw);
+// // === STRUCTURED MESH ===
+// // nc : number of panels chordwise
+// // ns : number of panels spanwise
+// // nw : number of panels in chordwise wake
+// // ncw : nc + nw
+// class Mesh {
+//     public:
+//     // Unstructured members (for exporting results)
+//     // ---------------------
+//     // All vertices stored in single SoA for result exporting
+//     // (stored in span major order)
+//     SoA_3D_t<f32> v; // size (ncw+1)*(ns+1)
+//     // Offsets for indexing in connectivity array for each panel
+//     std::vector<u64> offsets = {}; // size nc*ns + 1
+//     // Panel-vertex connectivity
+//     // vertices indices of a panel: connectivity[offsets[i]] to connectivity[offsets[i+1]]
+//     std::vector<u64> connectivity = {}; // size 4*(nc*ns)
+
+//     // Panels metrics (stored span major order)
+//     // ---------------------
+//     // Collocation points of panels
+//     SoA_3D_t<f32> colloc; // size ncw*ns
+//     // Normals of panels
+//     SoA_3D_t<f32> normal; // size ncw*ns
+//     // Area of panels
+//     std::vector<f32> area = {}; // size ncw*ns
+
+//     // Structured dimensions
+//     // ---------------------
+//     u64 nc = 1; // number of panels chordwise
+//     u64 ns = 1; // number of panels spanwise
+//     u64 nw; // number of wake panels chordwise (max capacity)
+//     u64 current_nw = 0; // current number of built wake panels (temporary)
+
+//     // Analytical quanities when provided
+//     // ---------------------
+//     f32 s_ref = 0.0f; // reference area (of wing)
+//     f32 c_ref = 0.0f; // reference chord (of wing)
+
+//     linalg::alias::float4x4 frame = linalg::identity; // transformation matrix
+//     linalg::alias::float3 ref_pt = {0.25f, 0.0f, 0.0f}; // TODO: deprecate
+
+//     void update_wake(const linalg::alias::float3& u_inf); // update wake vertices
+//     void correction_high_aoa(f32 alpha_rad); // correct collocation points for high aoa
+//     void create_vortex_panels(); // create true vortex panels
+//     void compute_connectivity(); // fill offsets, connectivity
+//     void compute_metrics_wing(); // fill colloc, normal, area
+//     void compute_metrics_wake();
+//     void compute_metrics_i(u64 i);
+//     void move(const linalg::alias::float4x4& transform, const SoA_3D_t<f32>& origin_pos);
+//     void resize_wake(const u64 nw);
  
-    u64 nb_panels_wing() const;
-    u64 nb_panels_total() const;
-    u64 nb_vertices_wing() const;
-    u64 nb_vertices_total() const;
-    f32 panels_area(const u64 i, const u64 j, const u64 m, const u64 n) const;
-    f32 panels_area_xy(const u64 i, const u64 j, const u64 m, const u64 n) const;
-    f32 panel_width_y(const u64 i, const u64 j) const;
-    f32 panel_length(const u64 i, const u64 j) const;
-    f32 strip_width(const u64 j) const;
-    f32 chord_length(const u64 j) const; // vertex idx
-    f32 chord_mean(const u64 j, const u64 n) const;
-
-    // i panel vertices coordinates
-    linalg::alias::float3 get_v0(u64 i) const; // upper left
-    linalg::alias::float3 get_v1(u64 i) const; // upper right
-    linalg::alias::float3 get_v2(u64 i) const; // lower right
-    linalg::alias::float3 get_v3(u64 i) const; // lower left
-
-    void io_read(const std::string& filename);
-    Mesh(const tiny::Config& cfg);
-    Mesh(
-        const std::string& filename,
-        const u64 nw,
-        const f32 s_ref_,
-        const f32 c_ref_,
-        const linalg::alias::float3& ref_pt_ // todo: deprecate
-    );
+//     u64 nb_panels_wing() const;
+//     u64 nb_panels_total() const;
+//     u64 nb_vertices_wing() const;
+//     u64 nb_vertices_total() const;
+//     f32 panels_area(const u64 i, const u64 j, const u64 m, const u64 n) const;
+//     f32 panels_area_xy(const u64 i, const u64 j, const u64 m, const u64 n) const;
+//     f32 panel_width_y(const u64 i, const u64 j) const;
+//     f32 panel_length(const u64 i, const u64 j) const;
+//     f32 strip_width(const u64 j) const;
+//     f32 chord_length(const u64 j) const; // vertex idx
+//     f32 chord_mean(const u64 j, const u64 n) const;
+
+//     // i panel vertices coordinates
+//     linalg::alias::float3 get_v0(u64 i) const; // upper left
+//     linalg::alias::float3 get_v1(u64 i) const; // upper right
+//     linalg::alias::float3 get_v2(u64 i) const; // lower right
+//     linalg::alias::float3 get_v3(u64 i) const; // lower left
+
+//     void io_read(const std::string& filename);
+//     Mesh(const tiny::Config& cfg);
+//     Mesh(
+//         const std::string& filename,
+//         const u64 nw,
+//         const f32 s_ref_,
+//         const f32 c_ref_,
+//         const linalg::alias::float3& ref_pt_ // todo: deprecate
+//     );
     
-    private:
-    void alloc(); // allocate memory for all buffers
-    void init(); // called at the end of constructor
-    void io_read_plot3d_structured(std::ifstream& f);
-};
-
-// todo, update this to mirror the constructor
-std::unique_ptr<Mesh> create_mesh(const std::string& filename, const u64 nw=1);
+//     private:
+//     void alloc(); // allocate memory for all buffers
+//     void init(); // called at the end of constructor
+//     void io_read_plot3d_structured(std::ifstream& f);
+// };
+
+// // todo, update this to mirror the constructor
+// std::unique_ptr<Mesh> create_mesh(const std::string& filename, const u64 nw=1);
 
 } // namespace vlm
\ No newline at end of file
diff --git a/vlm/src/vlm_backend.cpp b/vlm/src/vlm_backend.cpp
index 033df5c..b924127 100644
--- a/vlm/src/vlm_backend.cpp
+++ b/vlm/src/vlm_backend.cpp
@@ -13,30 +13,32 @@
 
 using namespace vlm;
 
+// Here we assume constant shape wing since mesh_sref
+// TODO: remove this limitation
 f32 Backend::compute_coefficient_cl(const FlowData& flow) {
-    return compute_coefficient_cl(flow, mesh.s_ref, 0, mesh.ns);
+    return compute_coefficient_cl(flow, mesh_area(0,0,hd_mesh->nc, hd_mesh->ns), 0, hd_mesh->ns);
 }
 
 linalg::alias::float3 Backend::compute_coefficient_cm(const FlowData& flow) {
-    return compute_coefficient_cm(flow, mesh.s_ref, mesh.c_ref, 0, mesh.ns);
+    return compute_coefficient_cm(flow, mesh_area(0,0,hd_mesh->nc, hd_mesh->ns), mesh_mac(0,hd_mesh->ns), 0, hd_mesh->ns);
 }
 
 f32 Backend::compute_coefficient_cd(const FlowData& flow) {
-    return compute_coefficient_cd(flow, mesh.s_ref, 0, mesh.ns);
+    return compute_coefficient_cd(flow, mesh_area(0,0,hd_mesh->nc, hd_mesh->ns), 0, hd_mesh->ns);
 }
 
-std::unique_ptr<Backend> vlm::create_backend(const std::string& backend_name, Mesh& mesh) {
+std::unique_ptr<Backend> vlm::create_backend(const std::string& backend_name, MeshGeom* mesh, u64 timesteps) {
     //tiny::CPUID cpuid;
     //cpuid.print_info();
 
     #ifdef VLM_CPU
     if (backend_name == "cpu") {
-        return std::make_unique<BackendCPU>(mesh);
+        return std::make_unique<BackendCPU>(mesh, timesteps);
     }
     #endif
     #ifdef VLM_CUDA
     if (backend_name == "cuda") {
-        return std::make_unique<BackendCUDA>(mesh);
+        return std::make_unique<BackendCUDA>(mesh, timesteps);
     }
     #endif
     throw std::runtime_error("Unsupported backend: " + backend_name);
@@ -53,29 +55,28 @@ std::vector<std::string> vlm::get_available_backends() {
     return backends;
 }
 
-Backend::Backend(MeshGeom* mesh_geom) : hh_mesh_geom(mesh_geom) {};
-
-void Backend::init(u64 timesteps) {
-    u64 nb_vertices_wing = (hh_mesh_geom->nc+1)*(hh_mesh_geom->ns+1);
+void Backend::init(MeshGeom* mesh_geom, u64 timesteps) {
+    hh_mesh_geom = mesh_geom; // Borrowed ptr
+    const u64 nb_vertices_wing = (hh_mesh_geom->nc+1)*(hh_mesh_geom->ns+1);
 
     // Mesh structs allocation
     hd_mesh_geom = (MeshGeom*)allocator.h_malloc(sizeof(MeshGeom));
     dd_mesh_geom = (MeshGeom*)allocator.d_malloc(sizeof(MeshGeom));
-    hh_mesh = (Mesh2*)allocator.h_malloc(sizeof(Mesh2));
+    // hh_mesh = (Mesh2*)allocator.h_malloc(sizeof(Mesh2));
     hd_mesh = (Mesh2*)allocator.h_malloc(sizeof(Mesh2));
     dd_mesh = (Mesh2*)allocator.d_malloc(sizeof(Mesh2));
+    hd_data = (Data*)allocator.h_malloc(sizeof(Data));
+    dd_data = (Data*)allocator.d_malloc(sizeof(Data));
 
     // Allocate mesh buffers on device and host
     hd_mesh_geom->vertices = (f32*) allocator.d_malloc(nb_vertices_wing*3*sizeof(f32));
     // mesh_alloc(allocator.h_malloc, hh_mesh, hh_mesh_geom->nc, hh_mesh_geom->ns, timesteps, 0);
     mesh_alloc(allocator.d_malloc, hd_mesh, hh_mesh_geom->nc, hh_mesh_geom->ns, timesteps, 0); 
+    data_alloc(allocator.d_malloc, hd_data, hh_mesh_geom->nc, hh_mesh_geom->ns, timesteps);
     
     // Copy indices
     hd_mesh_geom->nc = hh_mesh_geom->nc;
     hd_mesh_geom->ns = hh_mesh_geom->ns;
-    hd_mesh->nc = hh_mesh_geom->nc;
-    hd_mesh->ns = hh_mesh_geom->ns;
-    hd_mesh->nw = timesteps;
 
     // Copy raw vertex geometry directly to device
     allocator.hd_memcpy(hd_mesh_geom->vertices, hh_mesh_geom->vertices, nb_vertices_wing*3*sizeof(f32));
@@ -84,4 +85,25 @@ void Backend::init(u64 timesteps) {
     // Copy host-device mesh ptr to device-device
     allocator.hd_memcpy(dd_mesh_geom, hd_mesh_geom, sizeof(*hd_mesh_geom));
     allocator.hd_memcpy(dd_mesh, hd_mesh, sizeof(*hd_mesh));
+    allocator.hd_memcpy(dd_data, hd_data, sizeof(*hd_data));
+}
+
+Backend::~Backend() {
+    // Free device-device ptrs
+    allocator.d_free(dd_data);
+    allocator.d_free(dd_mesh);
+    allocator.d_free(dd_mesh_geom);
+    allocator.d_free(d_solver_info);
+    allocator.d_free(d_solver_ipiv);
+    allocator.d_free(d_solver_buffer);
+
+    // Free device buffers store in host-device ptrs
+    allocator.d_free(hd_mesh_geom->vertices);
+    mesh_free(allocator.d_free, hd_mesh);
+    data_free(allocator.d_free, hd_data);
+
+    // Free host-device ptrs
+    allocator.h_free(hd_data);
+    allocator.h_free(hd_mesh);
+    allocator.h_free(hd_mesh_geom);
 }
\ No newline at end of file
diff --git a/vlm/src/vlm_data.cpp b/vlm/src/vlm_data.cpp
index 68c120f..1fc2353 100644
--- a/vlm/src/vlm_data.cpp
+++ b/vlm/src/vlm_data.cpp
@@ -20,4 +20,28 @@ FlowData::FlowData(const linalg::alias::float3& freestream_, const f32 rho_):
     freestream(freestream_),
     lift_axis(compute_lift_axis(freestream)),
     stream_axis(compute_stream_axis(freestream)) {
-}
\ No newline at end of file
+}
+
+void vlm::data_alloc(const malloc_f malloc, Data* data, u64 nc, u64 ns, u64 nw) {
+    const u64 nb_panels_wing = nc * ns;
+    const u64 nb_panels_total = (nc+nw) * ns;
+    const u64 nb_vertices_wing = (nc+1) * (ns+1);
+    const u64 nb_vertices_total = (nc+nw+1) * (ns+1);
+    data->lhs = (f32*)malloc(nb_panels_wing * nb_panels_wing * sizeof(f32));
+    data->rhs = (f32*)malloc(nb_panels_wing * sizeof(f32));
+    data->gamma = (f32*)malloc(nb_panels_total * sizeof(f32));
+    data->delta_gamma = (f32*)malloc(nb_panels_wing * sizeof(f32));
+    data->rollup_vertices = (f32*)malloc(nb_vertices_total * 3 * sizeof(f32)); // TODO: this can be reduced to (nw+1)*(ns+1)*3
+    data->local_velocities = (f32*)malloc(nb_panels_wing * 3 * sizeof(f32));
+    data->trefftz_buffer = (f32*)malloc(ns * sizeof(f32));
+}
+
+void data_free(const free_f free, Data* data) {
+    free(data->lhs);
+    free(data->rhs);
+    free(data->gamma);
+    free(data->delta_gamma);
+    free(data->rollup_vertices);
+    free(data->local_velocities);
+    free(data->trefftz_buffer);    
+}
diff --git a/vlm/src/vlm_mesh.cpp b/vlm/src/vlm_mesh.cpp
index 9d44386..dce0cfc 100644
--- a/vlm/src/vlm_mesh.cpp
+++ b/vlm/src/vlm_mesh.cpp
@@ -114,45 +114,21 @@ void mesh_io_read_file(const std::string& filename, MeshGeom* mesh_geom) {
     }
 }
 
-// TODO: deprecate everything below this line 
-Mesh::Mesh(const tiny::Config& cfg) :
-    Mesh(
-        cfg().section("files").get<std::string>("mesh"),
-        cfg().section("mesh").get<u64>("nw", 1),
-        cfg().section("mesh").get<f32>("s_ref", 0.0f),
-        cfg().section("mesh").get<f32>("c_ref", 0.0f),
-        linalg::alias::float3{cfg().section("mesh").get<std::array<f32, 3>>("ref_pt").data()}
-    ) {
-}
-
-Mesh::Mesh(
-    const std::string& filename,
-    const u64 nw_,
-    const f32 s_ref_ = 0.0f,
-    const f32 c_ref_ = 0.0f,
-    const linalg::alias::float3& ref_pt_ = {0.25f, 0.0f, 0.0f}
-    ) {
-    nw = nw_;
-    s_ref = s_ref_;
-    c_ref = c_ref_;
-    ref_pt = ref_pt_;
-    io_read(filename);
-    init();
-}
-
-std::unique_ptr<Mesh> vlm::create_mesh(const std::string& filename, const u64 nw) {
-    return std::make_unique<Mesh>(filename, nw);
-}
+void mesh_quaterchord(MeshGeom* mesh_geom) {
+    const u64 nc = mesh_geom->nc;
+    const u64 ns = mesh_geom->ns;
+    f32* vx = mesh_geom->vertices + 0 * (nc+1) * (ns+1);
+    f32* vy = mesh_geom->vertices + 1 * (nc+1) * (ns+1);
+    f32* vz = mesh_geom->vertices + 2 * (nc+1) * (ns+1);
 
-void Mesh::create_vortex_panels() {
     for (u64 i = 0; i < nc; i++) {
         for (u64 j = 0; j < ns+1; j++) {
             const u64 v0 = (i+0) * (ns+1) + j;
             const u64 v3 = (i+1) * (ns+1) + j;
 
-            v.x[v0] = 0.75f * v.x[v0] + 0.25f * v.x[v3];
-            v.y[v0] = 0.75f * v.y[v0] + 0.25f * v.y[v3];
-            v.z[v0] = 0.75f * v.z[v0] + 0.25f * v.z[v3];
+            vx[v0] = 0.75f * vx[v0] + 0.25f * vx[v3];
+            vy[v0] = 0.75f * vy[v0] + 0.25f * vy[v3];
+            vz[v0] = 0.75f * vz[v0] + 0.25f * vz[v3];
         }
     }
 
@@ -162,384 +138,384 @@ void Mesh::create_vortex_panels() {
         const u64 v0 = (i+0) * (ns+1) + j;
         const u64 v3 = (i+1) * (ns+1) + j;
         
-        v.x[v3] = 1.25f * v.x[v3] - 0.25f * v.x[v0];
-        v.y[v3] = 1.25f * v.y[v3] - 0.25f * v.y[v0];
-        v.z[v3] = 1.25f * v.z[v3] - 0.25f * v.z[v0];
-    }
-}
-
-void Mesh::init() {
-    create_vortex_panels();
-    if (c_ref == 0.0f) c_ref = chord_mean(0, ns+1); // use MAC as default
-    if (s_ref == 0.0f) s_ref = panels_area_xy(0,0, nc, ns); // use projected area as default
-    compute_connectivity();
-    compute_metrics_wing();
-}
-
-void Mesh::alloc() {
-    const u64 ncw = nc + nw;
-    v.resize((ncw + 1) * (ns + 1));
-    offsets.resize(nc * ns + 1);
-    connectivity.resize(4 * nc * ns);
-
-    colloc.resize(ncw * ns);
-    normal.resize(ncw * ns);
-    area.resize(ncw * ns);
-}
-
-u64 Mesh::nb_panels_wing() const { return nc * ns; };
-u64 Mesh::nb_panels_total() const { return (nc+nw) * ns; };
-u64 Mesh::nb_vertices_wing() const { return (nc + 1) * (ns + 1); };
-u64 Mesh::nb_vertices_total() const { return (nc + nw + 1) * (ns + 1); };
-
-/// @brief Computes the mean chord of a set of panels
-/// @details
-/// Mean Aerodynamic Chord = \frac{2}{S} \int_{0}^{b/2} c(y)^{2} dy
-/// Integration using the Trapezoidal Rule
-/// Validated against empirical formulas for tapered wings
-/// @param j first panel index chordwise
-/// @param n number of panels spanwise
-/// @return mean chord of the set of panels
-f32 Mesh::chord_mean(const u64 j, const u64 n) const {
-    assert(j + n <= ns+1); // spanwise range
-    assert(n > 1); // minimum 2 chords
-    f32 mac = 0.0f;
-    // loop over panel chordwise sections in spanwise direction
-    // Note: can be done optimally with vertical fused simd
-    for (u64 v = 0; v < n - 1; v++) {
-        const f32 c0 = chord_length(j + v);
-        const f32 c1 = chord_length(j + v + 1);
-        mac += 0.5f * (c0 * c0 + c1 * c1) * panel_width_y(0, j + v);
+        vx[v3] = 1.25f * vx[v3] - 0.25f * vx[v0];
+        vy[v3] = 1.25f * vy[v3] - 0.25f * vy[v0];
+        vz[v3] = 1.25f * vz[v3] - 0.25f * vz[v0];
     }
-    // Since we divide by half the total wing area (both sides) we dont need to multiply by 2
-    return mac / panels_area_xy(0, j, nc, n-1);
 }
 
-/// @brief Computes the total area of a 2D set of panels
-/// @param i first panel index chordwise
-/// @param j first panel panel index spanwise
-/// @param m number of panels chordwise
-/// @param n number of panels spanwise
-/// @return sum of the areas of the panels
-f32 Mesh::panels_area(const u64 i, const u64 j, const u64 m, const u64 n) const {
-    assert(i + m <= nc);
-    assert(j + n <= ns);
+// void Mesh::init() {
+//     create_vortex_panels();
+//     if (c_ref == 0.0f) c_ref = chord_mean(0, ns+1); // use MAC as default
+//     if (s_ref == 0.0f) s_ref = panels_area_xy(0,0, nc, ns); // use projected area as default
+//     compute_connectivity();
+//     compute_metrics_wing();
+// }
+
+// void Mesh::alloc() {
+//     const u64 ncw = nc + nw;
+//     v.resize((ncw + 1) * (ns + 1));
+//     offsets.resize(nc * ns + 1);
+//     connectivity.resize(4 * nc * ns);
+
+//     colloc.resize(ncw * ns);
+//     normal.resize(ncw * ns);
+//     area.resize(ncw * ns);
+// }
+
+// u64 Mesh::nb_panels_wing() const { return nc * ns; };
+// u64 Mesh::nb_panels_total() const { return (nc+nw) * ns; };
+// u64 Mesh::nb_vertices_wing() const { return (nc + 1) * (ns + 1); };
+// u64 Mesh::nb_vertices_total() const { return (nc + nw + 1) * (ns + 1); };
+
+// /// @brief Computes the mean chord of a set of panels
+// /// @details
+// /// Mean Aerodynamic Chord = \frac{2}{S} \int_{0}^{b/2} c(y)^{2} dy
+// /// Integration using the Trapezoidal Rule
+// /// Validated against empirical formulas for tapered wings
+// /// @param j first panel index chordwise
+// /// @param n number of panels spanwise
+// /// @return mean chord of the set of panels
+// f32 Mesh::chord_mean(const u64 j, const u64 n) const {
+//     assert(j + n <= ns+1); // spanwise range
+//     assert(n > 1); // minimum 2 chords
+//     f32 mac = 0.0f;
+//     // loop over panel chordwise sections in spanwise direction
+//     // Note: can be done optimally with vertical fused simd
+//     for (u64 v = 0; v < n - 1; v++) {
+//         const f32 c0 = chord_length(j + v);
+//         const f32 c1 = chord_length(j + v + 1);
+//         mac += 0.5f * (c0 * c0 + c1 * c1) * panel_width_y(0, j + v);
+//     }
+//     // Since we divide by half the total wing area (both sides) we dont need to multiply by 2
+//     return mac / panels_area_xy(0, j, nc, n-1);
+// }
+
+// /// @brief Computes the total area of a 2D set of panels
+// /// @param i first panel index chordwise
+// /// @param j first panel panel index spanwise
+// /// @param m number of panels chordwise
+// /// @param n number of panels spanwise
+// /// @return sum of the areas of the panels
+// f32 Mesh::panels_area(const u64 i, const u64 j, const u64 m, const u64 n) const {
+//     assert(i + m <= nc);
+//     assert(j + n <= ns);
     
-    const u64 ld = ns;
-    const f32* areas = &area[j + i * ld];
-
-    f32 total_area = 0.0f;
-    for (u64 u = 0; u < m; u++) {
-        for (u64 v = 0; v < n; v++) {
-            total_area += areas[v + u * ld];
-        }
-    }
-    return total_area;
-}
-
-/// @brief Computes the total area of a 2D set of panels projected on the xy plane
-/// @param i first panel index chordwise
-/// @param j first panel panel index spanwise
-/// @param m number of panels chordwise
-/// @param n number of panels spanwise
-/// @return sum of the areas of the panels projected on the xy plane
-f32 Mesh::panels_area_xy(const u64 i, const u64 j, const u64 m, const u64 n) const {
-    assert(i + m <= nc);
-    assert(j + n <= ns);
+//     const u64 ld = ns;
+//     const f32* areas = &area[j + i * ld];
+
+//     f32 total_area = 0.0f;
+//     for (u64 u = 0; u < m; u++) {
+//         for (u64 v = 0; v < n; v++) {
+//             total_area += areas[v + u * ld];
+//         }
+//     }
+//     return total_area;
+// }
+
+// /// @brief Computes the total area of a 2D set of panels projected on the xy plane
+// /// @param i first panel index chordwise
+// /// @param j first panel panel index spanwise
+// /// @param m number of panels chordwise
+// /// @param n number of panels spanwise
+// /// @return sum of the areas of the panels projected on the xy plane
+// f32 Mesh::panels_area_xy(const u64 i, const u64 j, const u64 m, const u64 n) const {
+//     assert(i + m <= nc);
+//     assert(j + n <= ns);
     
-    // Area of a quad:
-    // 0.5 * || d1 x d2 || where d1 and d2 are the diagonals of the quad
-    f32 total_area = 0.0f;
-    // Note: this is highly inefficient as vertex coordinates should be loaded with simd
-    for (u64 u = 0; u < m; u++) {
-        for (u64 v = 0; v < n; v++) {
-            const u64 idx = j + v + (i+u) * ns;
-            const linalg::alias::float3 d1 = get_v0(idx) - get_v2(idx);
-            const linalg::alias::float3 d2 = get_v1(idx) - get_v3(idx);
-            const linalg::alias::float3 cross = linalg::cross(d1, d2);
+//     // Area of a quad:
+//     // 0.5 * || d1 x d2 || where d1 and d2 are the diagonals of the quad
+//     f32 total_area = 0.0f;
+//     // Note: this is highly inefficient as vertex coordinates should be loaded with simd
+//     for (u64 u = 0; u < m; u++) {
+//         for (u64 v = 0; v < n; v++) {
+//             const u64 idx = j + v + (i+u) * ns;
+//             const linalg::alias::float3 d1 = get_v0(idx) - get_v2(idx);
+//             const linalg::alias::float3 d2 = get_v1(idx) - get_v3(idx);
+//             const linalg::alias::float3 cross = linalg::cross(d1, d2);
             
-            total_area += 0.5f * std::abs(cross.z);
-            // std::cout << "area xy: " << 0.5f * std::abs(cross.z()) << "\n";
-            // std::cout << "area: " << area[idx] << "\n";
-        }
-    }
-    return total_area;
-}
-
-f32 Mesh::panel_length(const u64 i, const u64 j) const {
-    // Since chordwise segments are always parallel, we can simply measure the width of the first panel
-    assert(i < nc);
-    assert(j < ns);
-    const linalg::alias::float3 v0 = get_v0(j + i * ns);
-    const linalg::alias::float3 v1 = get_v1(j + i * ns);
-    const linalg::alias::float3 v2 = get_v2(j + i * ns);
-    const linalg::alias::float3 v3 = get_v3(j + i * ns);
-    return 0.5f * (linalg::length(v3-v0) + linalg::length(v2-v1));
-}
-
-/// @brief Computes the width of a single panel in pure y direction
-/// @param i panel index chordwise
-/// @param j panel index spanwise
-f32 Mesh::panel_width_y(const u64 i, const u64 j) const {
-    assert(i < nc);
-    assert(j < ns);
-    const u64 ld = ns + 1;
-
-    return v.y[j + 1 + i * ld] - v.y[j + i * ld];
-}
-
-f32 Mesh::strip_width(const u64 j) const {
-    assert(j < ns);
-    const linalg::alias::float3 v0 = get_v0(j);
-    const linalg::alias::float3 v1 = get_v1(j);
-    return linalg::length(v1 - v0);
-}
-
-/// @brief Computes the chord length of a chordwise segment
-/// @details Since the mesh follows the camber line, the chord length is computed
-/// as the distance between the first and last vertex of a chordwise segment
-/// @param j chordwise segment index
-f32 Mesh::chord_length(const u64 j) const {
-    assert(j <= ns); // spanwise vertex range
-    const f32 dx = v.x[j + nc * (ns+1)] - v.x[j];
-    const f32 dy = 0.0f; // chordwise segments are parallel to the x axis
-    const f32 dz = v.z[j + nc * (ns+1)] - v.z[j];
-
-    return std::sqrt(dx * dx + dy * dy + dz * dz);
-}
-
-linalg::alias::float3 Mesh::get_v0(u64 i) const {
-    const u64 idx = i + i / ns;
-    return {v.x[idx], v.y[idx], v.z[idx]};
-}
-
-linalg::alias::float3 Mesh::get_v1(u64 i) const {
-    const u64 idx = i + i / ns + 1;
-    return {v.x[idx], v.y[idx], v.z[idx]};
-}
-
-linalg::alias::float3 Mesh::get_v2(u64 i) const {
-    const u64 idx = i + i / ns + ns + 2;
-    return {v.x[idx], v.y[idx], v.z[idx]};
-}
-
-linalg::alias::float3 Mesh::get_v3(u64 i) const {
-    const u64 idx = i + i / ns + ns + 1;
-    return {v.x[idx], v.y[idx], v.z[idx]};
-}
-
-void Mesh::update_wake(const linalg::alias::float3& freestream) {
-    const f32 chord_root = chord_length(0); 
-    const f32 off_x = freestream.x * 100.0f * chord_root;
-    const f32 off_y = freestream.y * 100.0f * chord_root;
-    const f32 off_z = freestream.z * 100.0f * chord_root;
-
-    const u64 v_ns = ns + 1;
-    const u64 begin_trailing_edge = nb_vertices_wing()-v_ns;
-    const u64 end_trailing_edge = nb_vertices_wing();
-    // Add one layer of wake vertices
-    // this can be parallelized (careful to false sharing tho)
-    for (u64 i = begin_trailing_edge; i < end_trailing_edge; ++i) {
-        v.x[i + v_ns] = v.x[i] + off_x;
-        v.y[i + v_ns] = v.y[i] + off_y;
-        v.z[i + v_ns] = v.z[i] + off_z;
-    }
-    compute_metrics_wake();
-    current_nw = 1;
-}
-
-// https://publications.polymtl.ca/2555/1/2017_MatthieuParenteau.pdf (Eq 3.4 p21)
-void Mesh::correction_high_aoa(f32 alpha_rad) {
-    const f32 factor = 0.5f * alpha_rad / (std::sin(alpha_rad) + EPS); // correction factor
-    // Note: this can be vectorized and parallelized
-    for (u64 i = 0; i < nb_panels_total(); i++) {
-        // "chord vector" from center of leading line (v0-v1) to trailing line (v3-v2)
-        const linalg::alias::float3 chord_vec = 0.5f * (get_v2(i) + get_v3(i) - get_v0(i) - get_v1(i));
-        const linalg::alias::float3 colloc_pt = 0.5f * (get_v0(i) + get_v1(i)) + factor * chord_vec;
-        // collocation calculated as a translation of the center of leading line center
-        colloc.x[i] = colloc_pt.x;
-        colloc.y[i] = colloc_pt.y;
-        colloc.z[i] = colloc_pt.z;
-    }
-}
-
-void Mesh::compute_connectivity() {
-    // indices of the points forming the quad
-    //             span(y) ->
-    //  chord(x)     0--1
-    //     |         |  |
-    //    \/         3--2
-    std::array<u64, 4> quad = {};
-
-    // Note: with some modifications this loop can be parallelized
-    for (u64 i = 0; i < nb_panels_wing(); i++) {
-        // extra offset occuring from the fact that there is 1 more vertex
-        // per row than surfaces
-        u64 chord_idx = i / nc;
-        quad[0] = i + chord_idx;
-        quad[1] = quad[0] + 1;
-        quad[2] = quad[1] + ns;
-        quad[3] = quad[0] + ns;
-        offsets[i+1] = offsets[i] + 4;
-        connectivity.insert(connectivity.end(), quad.begin(), quad.end());
-    }
-}
-
-void Mesh::compute_metrics_i(u64 i ) {
-    const linalg::alias::float3 v0 = get_v0(i);
-    const linalg::alias::float3 v1 = get_v1(i);
-    const linalg::alias::float3 v2 = get_v2(i);
-    const linalg::alias::float3 v3 = get_v3(i);
-
-    // Vector v0 from p1 to p3
-    const linalg::alias::float3 vec_0 = v3 - v1;
-
-    // Vector v1 from p0 to p2
-    const linalg::alias::float3 vec_1 = v2 - v0;
-
-    // Normal = v0 x v1
-    const linalg::alias::float3 normal_vec = linalg::normalize(linalg::cross(vec_0, vec_1));
-    normal.x[i] = normal_vec.x;
-    normal.y[i] = normal_vec.y;
-    normal.z[i] = normal_vec.z;
-
-    // 3 vectors f (P0P3), b (P0P2), e (P0P1) to compute the area:
-    // area = 0.5 * (||f x b|| + ||b x e||)
-    // this formula might also work: area = || 0.5 * ( f x b + b x e ) ||
-    const linalg::alias::float3 vec_f = v3 - v0;
-    const linalg::alias::float3 vec_b = v2 - v0;
-    const linalg::alias::float3 vec_e = v1 - v0;
-
-    area[i] = 0.5f * (linalg::length(linalg::cross(vec_f, vec_b)) + linalg::length(linalg::cross(vec_b, vec_e)));
+//             total_area += 0.5f * std::abs(cross.z);
+//             // std::cout << "area xy: " << 0.5f * std::abs(cross.z()) << "\n";
+//             // std::cout << "area: " << area[idx] << "\n";
+//         }
+//     }
+//     return total_area;
+// }
+
+// f32 Mesh::panel_length(const u64 i, const u64 j) const {
+//     // Since chordwise segments are always parallel, we can simply measure the width of the first panel
+//     assert(i < nc);
+//     assert(j < ns);
+//     const linalg::alias::float3 v0 = get_v0(j + i * ns);
+//     const linalg::alias::float3 v1 = get_v1(j + i * ns);
+//     const linalg::alias::float3 v2 = get_v2(j + i * ns);
+//     const linalg::alias::float3 v3 = get_v3(j + i * ns);
+//     return 0.5f * (linalg::length(v3-v0) + linalg::length(v2-v1));
+// }
+
+// /// @brief Computes the width of a single panel in pure y direction
+// /// @param i panel index chordwise
+// /// @param j panel index spanwise
+// f32 Mesh::panel_width_y(const u64 i, const u64 j) const {
+//     assert(i < nc);
+//     assert(j < ns);
+//     const u64 ld = ns + 1;
+
+//     return v.y[j + 1 + i * ld] - v.y[j + i * ld];
+// }
+
+// f32 Mesh::strip_width(const u64 j) const {
+//     assert(j < ns);
+//     const linalg::alias::float3 v0 = get_v0(j);
+//     const linalg::alias::float3 v1 = get_v1(j);
+//     return linalg::length(v1 - v0);
+// }
+
+// /// @brief Computes the chord length of a chordwise segment
+// /// @details Since the mesh follows the camber line, the chord length is computed
+// /// as the distance between the first and last vertex of a chordwise segment
+// /// @param j chordwise segment index
+// f32 Mesh::chord_length(const u64 j) const {
+//     assert(j <= ns); // spanwise vertex range
+//     const f32 dx = v.x[j + nc * (ns+1)] - v.x[j];
+//     const f32 dy = 0.0f; // chordwise segments are parallel to the x axis
+//     const f32 dz = v.z[j + nc * (ns+1)] - v.z[j];
+
+//     return std::sqrt(dx * dx + dy * dy + dz * dz);
+// }
+
+// linalg::alias::float3 Mesh::get_v0(u64 i) const {
+//     const u64 idx = i + i / ns;
+//     return {v.x[idx], v.y[idx], v.z[idx]};
+// }
+
+// linalg::alias::float3 Mesh::get_v1(u64 i) const {
+//     const u64 idx = i + i / ns + 1;
+//     return {v.x[idx], v.y[idx], v.z[idx]};
+// }
+
+// linalg::alias::float3 Mesh::get_v2(u64 i) const {
+//     const u64 idx = i + i / ns + ns + 2;
+//     return {v.x[idx], v.y[idx], v.z[idx]};
+// }
+
+// linalg::alias::float3 Mesh::get_v3(u64 i) const {
+//     const u64 idx = i + i / ns + ns + 1;
+//     return {v.x[idx], v.y[idx], v.z[idx]};
+// }
+
+// void Mesh::update_wake(const linalg::alias::float3& freestream) {
+//     const f32 chord_root = chord_length(0); 
+//     const f32 off_x = freestream.x * 100.0f * chord_root;
+//     const f32 off_y = freestream.y * 100.0f * chord_root;
+//     const f32 off_z = freestream.z * 100.0f * chord_root;
+
+//     const u64 v_ns = ns + 1;
+//     const u64 begin_trailing_edge = nb_vertices_wing()-v_ns;
+//     const u64 end_trailing_edge = nb_vertices_wing();
+//     // Add one layer of wake vertices
+//     // this can be parallelized (careful to false sharing tho)
+//     for (u64 i = begin_trailing_edge; i < end_trailing_edge; ++i) {
+//         v.x[i + v_ns] = v.x[i] + off_x;
+//         v.y[i + v_ns] = v.y[i] + off_y;
+//         v.z[i + v_ns] = v.z[i] + off_z;
+//     }
+//     compute_metrics_wake();
+//     current_nw = 1;
+// }
+
+// // https://publications.polymtl.ca/2555/1/2017_MatthieuParenteau.pdf (Eq 3.4 p21)
+// void Mesh::correction_high_aoa(f32 alpha_rad) {
+//     const f32 factor = 0.5f * alpha_rad / (std::sin(alpha_rad) + EPS); // correction factor
+//     // Note: this can be vectorized and parallelized
+//     for (u64 i = 0; i < nb_panels_total(); i++) {
+//         // "chord vector" from center of leading line (v0-v1) to trailing line (v3-v2)
+//         const linalg::alias::float3 chord_vec = 0.5f * (get_v2(i) + get_v3(i) - get_v0(i) - get_v1(i));
+//         const linalg::alias::float3 colloc_pt = 0.5f * (get_v0(i) + get_v1(i)) + factor * chord_vec;
+//         // collocation calculated as a translation of the center of leading line center
+//         colloc.x[i] = colloc_pt.x;
+//         colloc.y[i] = colloc_pt.y;
+//         colloc.z[i] = colloc_pt.z;
+//     }
+// }
+
+// void Mesh::compute_connectivity() {
+//     // indices of the points forming the quad
+//     //             span(y) ->
+//     //  chord(x)     0--1
+//     //     |         |  |
+//     //    \/         3--2
+//     std::array<u64, 4> quad = {};
+
+//     // Note: with some modifications this loop can be parallelized
+//     for (u64 i = 0; i < nb_panels_wing(); i++) {
+//         // extra offset occuring from the fact that there is 1 more vertex
+//         // per row than surfaces
+//         u64 chord_idx = i / nc;
+//         quad[0] = i + chord_idx;
+//         quad[1] = quad[0] + 1;
+//         quad[2] = quad[1] + ns;
+//         quad[3] = quad[0] + ns;
+//         offsets[i+1] = offsets[i] + 4;
+//         connectivity.insert(connectivity.end(), quad.begin(), quad.end());
+//     }
+// }
+
+// void Mesh::compute_metrics_i(u64 i ) {
+//     const linalg::alias::float3 v0 = get_v0(i);
+//     const linalg::alias::float3 v1 = get_v1(i);
+//     const linalg::alias::float3 v2 = get_v2(i);
+//     const linalg::alias::float3 v3 = get_v3(i);
+
+//     // Vector v0 from p1 to p3
+//     const linalg::alias::float3 vec_0 = v3 - v1;
+
+//     // Vector v1 from p0 to p2
+//     const linalg::alias::float3 vec_1 = v2 - v0;
+
+//     // Normal = v0 x v1
+//     const linalg::alias::float3 normal_vec = linalg::normalize(linalg::cross(vec_0, vec_1));
+//     normal.x[i] = normal_vec.x;
+//     normal.y[i] = normal_vec.y;
+//     normal.z[i] = normal_vec.z;
+
+//     // 3 vectors f (P0P3), b (P0P2), e (P0P1) to compute the area:
+//     // area = 0.5 * (||f x b|| + ||b x e||)
+//     // this formula might also work: area = || 0.5 * ( f x b + b x e ) ||
+//     const linalg::alias::float3 vec_f = v3 - v0;
+//     const linalg::alias::float3 vec_b = v2 - v0;
+//     const linalg::alias::float3 vec_e = v1 - v0;
+
+//     area[i] = 0.5f * (linalg::length(linalg::cross(vec_f, vec_b)) + linalg::length(linalg::cross(vec_b, vec_e)));
     
-    // collocation point (center of the quad)
-    const linalg::alias::float3 colloc_pt = 0.25f * (v0 + v1 + v2 + v3);
+//     // collocation point (center of the quad)
+//     const linalg::alias::float3 colloc_pt = 0.25f * (v0 + v1 + v2 + v3);
     
-    colloc.x[i] = colloc_pt.x;
-    colloc.y[i] = colloc_pt.y;
-    colloc.z[i] = colloc_pt.z;
-}
-
-void Mesh::compute_metrics_wing() {
-    for (u64 i = 0; i < nb_panels_wing(); i++) {
-        compute_metrics_i(i);
-    }
-}
-
-void Mesh::compute_metrics_wake() {
-    for (u64 i = nb_panels_wing(); i < nb_panels_total(); i++) {
-        compute_metrics_i(i);
-    }
-}
-
-// plot3d is chordwise major
-void Mesh::io_read_plot3d_structured(std::ifstream& f) {
-    std::cout << "reading plot3d mesh" << std::endl;
-    u64 ni = 0; // number of vertices chordwise
-    u64 nj = 0; // number of vertices spanwise
-    u64 nk = 0;
-    u64 blocks = 0;
-    f32 x, y, z;
-    f >> blocks;
-    if (blocks != 1) {
-        throw std::runtime_error("Only single block plot3d mesh is supported");
-    }
-    f >> ni >> nj >> nk;
-    if (nk != 1) {
-        throw std::runtime_error("Only 2D plot3d mesh is supported");
-    }
-
-    ns = nj - 1;
-    nc = ni - 1;
-    alloc();
-
-    std::cout << "number of panels: " << nb_panels_wing() << std::endl;
-    std::cout << "ns: " << ns << std::endl;
-    std::cout << "nc: " << nc << std::endl;
+//     colloc.x[i] = colloc_pt.x;
+//     colloc.y[i] = colloc_pt.y;
+//     colloc.z[i] = colloc_pt.z;
+// }
+
+// void Mesh::compute_metrics_wing() {
+//     for (u64 i = 0; i < nb_panels_wing(); i++) {
+//         compute_metrics_i(i);
+//     }
+// }
+
+// void Mesh::compute_metrics_wake() {
+//     for (u64 i = nb_panels_wing(); i < nb_panels_total(); i++) {
+//         compute_metrics_i(i);
+//     }
+// }
+
+// // plot3d is chordwise major
+// void Mesh::io_read_plot3d_structured(std::ifstream& f) {
+//     std::cout << "reading plot3d mesh" << std::endl;
+//     u64 ni = 0; // number of vertices chordwise
+//     u64 nj = 0; // number of vertices spanwise
+//     u64 nk = 0;
+//     u64 blocks = 0;
+//     f32 x, y, z;
+//     f >> blocks;
+//     if (blocks != 1) {
+//         throw std::runtime_error("Only single block plot3d mesh is supported");
+//     }
+//     f >> ni >> nj >> nk;
+//     if (nk != 1) {
+//         throw std::runtime_error("Only 2D plot3d mesh is supported");
+//     }
+
+//     ns = nj - 1;
+//     nc = ni - 1;
+//     alloc();
+
+//     std::cout << "number of panels: " << nb_panels_wing() << std::endl;
+//     std::cout << "ns: " << ns << std::endl;
+//     std::cout << "nc: " << nc << std::endl;
     
-    for (u64 j = 0; j < nj; j++) {
-        for (u64 i = 0; i < ni; i++) {
-            f >> x;
-            v.x[nj*i + j] = x;
-        }
-    }
-    for (u64 j = 0; j < nj; j++) {
-        for (u64 i = 0; i < ni; i++) {
-            f >> y;
-            v.y[nj*i + j] = y;
-        }
-    }
-    for (u64 j = 0; j < nj; j++) {
-        for (u64 i = 0; i < ni; i++) {
-            f >> z;
-            v.z[nj*i + j] = z;
-        }
-    }
-
-    // TODO: this validation is not robust enough when eventually we will be using multiple bodies
-    // Validation
-    // const f32 eps = std::numeric_limits<f32>::epsilon();
-    // if (std::abs(m.v.x[0]) != eps || std::abs(m.v.y[0]) != eps) {
-    //     throw std::runtime_error("First vertex of plot3d mesh must be at origin");
-    // }
-    const f32 first_y = v.y[0];
-    for (u64 i = 1; i < ni; i++) {
-        if (v.y[i * nj] != first_y) {
-            throw std::runtime_error("Mesh vertices should be ordered in chordwise direction");
-        }
-    }
-}
-
-void Mesh::io_read(const std::string& filename) {
-    const std::filesystem::path path(filename);
-    if (!std::filesystem::exists(path)) {
-        throw std::runtime_error("Mesh file not found");
-    }
-
-    std::ifstream f(path);
-    if (f.is_open()) {
-        if (path.extension() == ".x") {
-            io_read_plot3d_structured(f);
-        } else {
-            throw std::runtime_error("Only structured gridpro mesh format is supported");
-        }
-        f.close();
-    } else {
-        throw std::runtime_error("Failed to open mesh file");
-    }
-}
-
-void Mesh::move(const linalg::alias::float4x4& transform, const SoA_3D_t<f32>& origin_pos) {
-    // const tiny::ScopedTimer t("Mesh::move");
-    assert(current_nw < nw); // check if we have capacity
+//     for (u64 j = 0; j < nj; j++) {
+//         for (u64 i = 0; i < ni; i++) {
+//             f >> x;
+//             v.x[nj*i + j] = x;
+//         }
+//     }
+//     for (u64 j = 0; j < nj; j++) {
+//         for (u64 i = 0; i < ni; i++) {
+//             f >> y;
+//             v.y[nj*i + j] = y;
+//         }
+//     }
+//     for (u64 j = 0; j < nj; j++) {
+//         for (u64 i = 0; i < ni; i++) {
+//             f >> z;
+//             v.z[nj*i + j] = z;
+//         }
+//     }
+
+//     // TODO: this validation is not robust enough when eventually we will be using multiple bodies
+//     // Validation
+//     // const f32 eps = std::numeric_limits<f32>::epsilon();
+//     // if (std::abs(m.v.x[0]) != eps || std::abs(m.v.y[0]) != eps) {
+//     //     throw std::runtime_error("First vertex of plot3d mesh must be at origin");
+//     // }
+//     const f32 first_y = v.y[0];
+//     for (u64 i = 1; i < ni; i++) {
+//         if (v.y[i * nj] != first_y) {
+//             throw std::runtime_error("Mesh vertices should be ordered in chordwise direction");
+//         }
+//     }
+// }
+
+// void Mesh::io_read(const std::string& filename) {
+//     const std::filesystem::path path(filename);
+//     if (!std::filesystem::exists(path)) {
+//         throw std::runtime_error("Mesh file not found");
+//     }
+
+//     std::ifstream f(path);
+//     if (f.is_open()) {
+//         if (path.extension() == ".x") {
+//             io_read_plot3d_structured(f);
+//         } else {
+//             throw std::runtime_error("Only structured gridpro mesh format is supported");
+//         }
+//         f.close();
+//     } else {
+//         throw std::runtime_error("Failed to open mesh file");
+//     }
+// }
+
+// void Mesh::move(const linalg::alias::float4x4& transform, const SoA_3D_t<f32>& origin_pos) {
+//     // const tiny::ScopedTimer t("Mesh::move");
+//     assert(current_nw < nw); // check if we have capacity
     
-    // Shed wake before moving    
-    const u64 src_begin = nb_vertices_wing() - (ns + 1);
-    const u64 src_end = nb_vertices_wing();
-    const u64 dst_begin = (nc + nw - current_nw) * (ns + 1);
-    std::copy(v.x.data() + src_begin, v.x.data() + src_end, v.x.data() + dst_begin);
-    std::copy(v.y.data() + src_begin, v.y.data() + src_end, v.y.data() + dst_begin);
-    std::copy(v.z.data() + src_begin, v.z.data() + src_end, v.z.data() + dst_begin);
+//     // Shed wake before moving    
+//     const u64 src_begin = nb_vertices_wing() - (ns + 1);
+//     const u64 src_end = nb_vertices_wing();
+//     const u64 dst_begin = (nc + nw - current_nw) * (ns + 1);
+//     std::copy(v.x.data() + src_begin, v.x.data() + src_end, v.x.data() + dst_begin);
+//     std::copy(v.y.data() + src_begin, v.y.data() + src_end, v.y.data() + dst_begin);
+//     std::copy(v.z.data() + src_begin, v.z.data() + src_end, v.z.data() + dst_begin);
     
-    // Perform the movement
-    for (u64 i = 0; i < nb_vertices_wing(); i++) {
-        const linalg::alias::float4 transformed_pt = linalg::mul(transform, linalg::alias::float4{origin_pos.x[i], origin_pos.y[i], origin_pos.z[i], 1.f});
-        v.x[i] = transformed_pt.x;
-        v.y[i] = transformed_pt.y;
-        v.z[i] = transformed_pt.z;
-    }
-
-    compute_metrics_wing(); // compute the new wing panel metrics
-
-    // Copy new trailing edge vertices on the wake buffer
-    std::copy(v.x.data() + src_begin, v.x.data() + src_end, v.x.data() + dst_begin - (ns + 1));
-    std::copy(v.y.data() + src_begin, v.y.data() + src_end, v.y.data() + dst_begin - (ns + 1));
-    std::copy(v.z.data() + src_begin, v.z.data() + src_end, v.z.data() + dst_begin - (ns + 1));
-
-    current_nw++;
-}
-
-void Mesh::resize_wake(const u64 nw_) {
-    nw = nw_;
-    alloc(); // resizes the buffers
-}
+//     // Perform the movement
+//     for (u64 i = 0; i < nb_vertices_wing(); i++) {
+//         const linalg::alias::float4 transformed_pt = linalg::mul(transform, linalg::alias::float4{origin_pos.x[i], origin_pos.y[i], origin_pos.z[i], 1.f});
+//         v.x[i] = transformed_pt.x;
+//         v.y[i] = transformed_pt.y;
+//         v.z[i] = transformed_pt.z;
+//     }
+
+//     compute_metrics_wing(); // compute the new wing panel metrics
+
+//     // Copy new trailing edge vertices on the wake buffer
+//     std::copy(v.x.data() + src_begin, v.x.data() + src_end, v.x.data() + dst_begin - (ns + 1));
+//     std::copy(v.y.data() + src_begin, v.y.data() + src_end, v.y.data() + dst_begin - (ns + 1));
+//     std::copy(v.z.data() + src_begin, v.z.data() + src_end, v.z.data() + dst_begin - (ns + 1));
+
+//     current_nw++;
+// }
+
+// void Mesh::resize_wake(const u64 nw_) {
+//     nw = nw_;
+//     alloc(); // resizes the buffers
+// }