diff --git a/tests/test_vlm_elliptic_coeffs.cpp b/tests/test_vlm_elliptic_coeffs.cpp
index 02c91ed..4f3ea52 100644
--- a/tests/test_vlm_elliptic_coeffs.cpp
+++ b/tests/test_vlm_elliptic_coeffs.cpp
@@ -114,13 +114,18 @@ int main(int  /*argc*/, char ** /*argv*/) {
     const std::vector<std::string> backends = get_available_backends();
     std::vector<f32> test_alphas = {0, 1, 2, 3, 4, 5, 10, 15};
     std::transform(test_alphas.begin(), test_alphas.end(), test_alphas.begin(), to_radians);
-
-    MeshGeom mesh_geom{};
+    
+    const MeshIO mesh_io{"plot3d"};
+    MeshGeom mesh_geom{}; // DEPRECATE
     auto solvers = tiny::make_combination(meshes, backends);
     for (const auto& [mesh_name, backend_name] : solvers) {
         std::printf(">>> MESH: %s | BACKEND: %s\n", mesh_name.get().c_str(), backend_name.get().c_str());
-        mesh_io_read_file(mesh_name, &mesh_geom);
-        mesh_quarterchord(&mesh_geom);
+
+        auto dims = mesh_io.get_dims(mesh_name);
+        // LinearVLM solver{backend_name, }
+
+        mesh_io_read_file(mesh_name, &mesh_geom); // DEPRECATE
+        mesh_quarterchord(&mesh_geom); // DEPRECATE
         LinearVLM solver{create_backend(backend_name, &mesh_geom, 1)};
 
         for (u64 i = 0; i < test_alphas.size(); i++) {
diff --git a/vlm/backends/cpu/include/vlm_backend_cpu.hpp b/vlm/backends/cpu/include/vlm_backend_cpu.hpp
index 325137e..5f1cf30 100644
--- a/vlm/backends/cpu/include/vlm_backend_cpu.hpp
+++ b/vlm/backends/cpu/include/vlm_backend_cpu.hpp
@@ -7,29 +7,28 @@ namespace vlm {
 
 class BackendCPU final : public Backend {
     public:
-        BackendCPU(MeshGeom* mesh, u64 timesteps);
+        BackendCPU();
         ~BackendCPU();
-        void reset() override;
-        void lhs_assemble() override;
-        void compute_rhs() override;
-        void add_wake_influence() override;
-        void wake_rollup(float dt) override;
-        void shed_gamma() override;
-        void lu_factor() override;
-        void lu_solve() override;
-        f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u64 j, const u64 n) override;
-        f32 compute_coefficient_unsteady_cl(const linalg::alias::float3& freestream, const SoA_3D_t<f32>& vel, f32 dt, const f32 area, const u64 j, const u64 n) override;
-        linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u64 j, const u64 n) override;
-        f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u64 j, const u64 n) override;
-        void compute_delta_gamma() override;
-        void set_velocities(const linalg::alias::float3& vel) override;
-        void set_velocities(const f32* vels) override;
+        void lhs_assemble(f32* lhs) override;
+        void rhs_assemble_velocities(f32* rhs, const f32* normals, const f32* velocities) override;
+        void rhs_assemble_wake_influence(f32* rhs, const f32* gamma) override;
+        void displace_wake_rollup(float dt, f32* rollup_vertices, f32* verts_wake, const f32* verts_wing, const f32* gamma) override;
+        // void displace_wing(const linalg::alias::float4x4& transform) override;
+        void displace_wing_and_shed(const std::vector<linalg::alias::float4x4>& transform, f32* verts_wing, f32* verts_wake) override;
+        void gamma_shed(f32* gamma, f32* gamma_prev) override;
+        void gamma_delta(f32* gamma_delta, const f32* gamma) override;
+        void lu_factor(f32* lhs) override;
+        void lu_solve(f32* lhs, f32* rhs, f32* gamma) override;
+        
+        // Per mesh kernels
+        f32 coeff_steady_cl(const MeshParams& param, const f32* verts_wing, const f32* gamma_delta, const FlowData& flow, const f32 area, const u64 j, const u64 n) override;
+        f32 coeff_unsteady_cl(const MeshParams& param, const f32* verts_wing, const f32* gamma_delta, const f32* gamma, const f32* gamma_prev, const f32* local_velocities, const f32* areas, const f32* normals, const linalg::alias::float3& freestream, f32 dt, const f32 area, const u64 j, const u64 n) override;
+        linalg::alias::float3 coeff_steady_cm(const MeshParams& param, const FlowData& flow, const f32 area, const f32 chord, const u64 j, const u64 n) override;
+        f32 coeff_steady_cd(const MeshParams& param, const FlowData& flow, const f32 area, const u64 j, const u64 n) override;
 
         void mesh_metrics(const f32 alpha) override;
-        void mesh_move(const linalg::alias::float4x4& transform) override;
-        void update_wake(const linalg::alias::float3& freestream) override;
-        f32 mesh_mac(u64 j, u64 n) override; // mean chord
-        f32 mesh_area(const u64 i, const u64 j, const u64 m, const u64 n) override; // mean span
+        f32 mesh_mac(const MeshParams& param, const u64 j, const u64 n) override;
+        f32 mesh_area(const MeshParams& param, const u64 i, const u64 j, const u64 m, const u64 n) override;
 };
 
 } // namespace vlm
\ No newline at end of file
diff --git a/vlm/backends/cpu/src/vlm_backend_cpu.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp
index 10d0182..87e3326 100644
--- a/vlm/backends/cpu/src/vlm_backend_cpu.cpp
+++ b/vlm/backends/cpu/src/vlm_backend_cpu.cpp
@@ -22,67 +22,75 @@
 using namespace vlm;
 using namespace linalg::ostream_overloads;
 
-ispc::Mesh2 mesh_to_ispc(Mesh2* mesh) {
-    ispc::Mesh2 mesh_ispc;
-    mesh_ispc.nc = mesh->nc;
-    mesh_ispc.ns = mesh->ns;
-    mesh_ispc.nw = mesh->nw;
-    mesh_ispc.nwa = mesh->nwa;
-    mesh_ispc.vertices = mesh->vertices;
-    mesh_ispc.normals = mesh->normals;
-    mesh_ispc.colloc = mesh->colloc;
-    mesh_ispc.area = mesh->area;
-    return mesh_ispc;
-}
-
-BackendCPU::~BackendCPU() = default; // Destructor definition
-
-BackendCPU::BackendCPU(MeshGeom* mesh, u64 timesteps) {
-    allocator.h_malloc = std::malloc;
-    allocator.d_malloc = std::malloc;
-    allocator.h_free = std::free;
-    allocator.d_free = std::free;
-    allocator.hh_memcpy = std::memcpy;
-    allocator.hd_memcpy = std::memcpy;
-    allocator.dh_memcpy = std::memcpy;
-    allocator.dd_memcpy = std::memcpy;
-    allocator.h_memset = std::memset;
-    allocator.d_memset = std::memset;
-    init(mesh, timesteps);
-
-    d_solver_info = (i32*)allocator.d_malloc(sizeof(i32));
-    d_solver_ipiv = (i32*)allocator.d_malloc(hd_mesh->nc*hd_mesh->ns*sizeof(i32));
-}
-
-void BackendCPU::reset() {
-    const u64 nb_panels_wing = hd_mesh->nc * hd_mesh->ns;
-    const u64 nb_vertices_wing = (hd_mesh->nc+1)*(hd_mesh->ns+1);
-
-
-    // std::fill(gamma.begin(), gamma.end(), 0.0f);
-    std::fill(hd_data->lhs, hd_data->lhs + nb_panels_wing*nb_panels_wing, 0.0f); // influence kernel is +=
-    // std::fill(rhs.begin(), rhs.end(), 0.0f);
-    allocator.dd_memcpy(PTR_MESH_V(hd_mesh, 0, 0, 0), PTR_MESHGEOM_V(hd_mesh_geom, 0, 0, 0), nb_vertices_wing*sizeof(f32));
-    allocator.dd_memcpy(PTR_MESH_V(hd_mesh, 0, 0, 1), PTR_MESHGEOM_V(hd_mesh_geom, 0, 0, 1), nb_vertices_wing*sizeof(f32));
-    allocator.dd_memcpy(PTR_MESH_V(hd_mesh, 0, 0, 2), PTR_MESHGEOM_V(hd_mesh_geom, 0, 0, 2), nb_vertices_wing*sizeof(f32));
-
-    dd_mesh->nwa = 0;
-}
+const Allocator allocator_cpu{
+    std::malloc,
+    std::malloc,
+	std::free,
+    std::free,
+	std::memcpy,
+    std::memcpy,
+ 	std::memcpy,
+    std::memcpy,
+  	std::memset,
+    std::memset
+};
+
+BackendCPU::BackendCPU() : Backend(allocator_cpu) {}
+BackendCPU::~BackendCPU() = default;
+
+// void BackendCPU::reset() {
+//     const u64 nb_panels_wing = hd_mesh->nc * hd_mesh->ns;
+//     const u64 nb_vertices_wing = (hd_mesh->nc+1)*(hd_mesh->ns+1);
+
+
+//     // std::fill(gamma.begin(), gamma.end(), 0.0f);
+//     std::fill(hd_data->lhs, hd_data->lhs + nb_panels_wing*nb_panels_wing, 0.0f); // influence kernel is +=
+//     // std::fill(rhs.begin(), rhs.end(), 0.0f);
+//     allocator.dd_memcpy(PTR_MESH_V(hd_mesh, 0, 0, 0), PTR_MESHGEOM_V(hd_mesh_geom, 0, 0, 0), nb_vertices_wing*sizeof(f32));
+//     allocator.dd_memcpy(PTR_MESH_V(hd_mesh, 0, 0, 1), PTR_MESHGEOM_V(hd_mesh_geom, 0, 0, 1), nb_vertices_wing*sizeof(f32));
+//     allocator.dd_memcpy(PTR_MESH_V(hd_mesh, 0, 0, 2), PTR_MESHGEOM_V(hd_mesh_geom, 0, 0, 2), nb_vertices_wing*sizeof(f32));
+
+//     dd_mesh->nwa = 0;
+// }
 
-void BackendCPU::compute_delta_gamma() {
-    const u64 nc = dd_mesh->nc;
-    const u64 ns = dd_mesh->ns;
-    const u64 nw = dd_mesh->nw;
-    const u64 nwa = dd_mesh->nwa;
+void BackendCPU::gamma_delta(f32* gamma_delta, const f32* gamma) {
     // const tiny::ScopedTimer timer("Delta Gamma");
-    std::copy(dd_data->gamma, dd_data->gamma+ns, dd_data->delta_gamma);
 
-    // note: this is efficient as the memory is contiguous
-    for (u64 i = 1; i < nc; i++) {
-        for (u64 j = 0; j < ns; j++) {
-            dd_data->delta_gamma[i*ns + j] = dd_data->gamma[i*ns + j] - dd_data->gamma[(i-1)*ns + j];
-        }
-    }
+    tf::Taskflow graph;
+
+    auto begin = graph.placeholder();
+    auto end = graph.placeholder();
+
+    for (const auto& p : prm)  {
+        f32* p_gamma_delta = gamma_delta + p.poff;
+        const f32* p_gamma = gamma + p.poff;
+        tf::Task first_row = graph.emplace([&]{
+            allocator.dd_memcpy(p_gamma_delta, p_gamma, p.ns * sizeof(f32));
+        });
+        tf::Task remaining_rows = graph.for_each_index((u64)1,p.nc, [&] (u64 b, u64 e) {
+            for (u64 i = b; i < e; i++) {
+                for (u64 j = 0; j < p.ns; j++) {
+                    p_gamma_delta[i*p.ns + j] = p_gamma[i*p.ns + j] - p_gamma[(i-1)*p.ns + j];
+                }
+            }
+        });
+
+        begin.precede(first_row);
+        begin.precede(remaining_rows);
+        first_row.precede(end);
+        remaining_rows.precede(end);
+    };
+
+    Executor::get().run(graph).wait();
+
+    // std::copy(gamma,gamma+ns, delta_gamma);
+
+    // // note: this is efficient as the memory is contiguous
+    // for (u64 i = 1; i < nc; i++) {
+    //     for (u64 j = 0; j < ns; j++) {
+    //         delta_gamma[i*ns + j] = gamma[i*ns + j] - gamma[(i-1)*ns + j];
+    //     }
+    // }
 }
 
 void BackendCPU::lhs_assemble() {
@@ -95,7 +103,7 @@ void BackendCPU::lhs_assemble() {
     const u64 zero = 0;
     const u64 end_wing = (nc - 1) * ns;
 
-    ispc::Mesh2 mesh = mesh_to_ispc(dd_mesh);
+    ispc::Mesh mesh = mesh_to_ispc(dd_mesh);
 
     tf::Taskflow taskflow;
 
@@ -128,7 +136,7 @@ void BackendCPU::lhs_assemble() {
     Executor::get().run(taskflow).wait();
 }
 
-void BackendCPU::compute_rhs() {
+void BackendCPU::rhs_assemble_velocities() {
     const tiny::ScopedTimer timer("RHS");
     const u64 nc = dd_mesh->nc;
     const u64 ns = dd_mesh->ns;
@@ -143,13 +151,13 @@ void BackendCPU::compute_rhs() {
     }
 }
 
-void BackendCPU::add_wake_influence() {
+void BackendCPU::rhs_assemble_wake_influence() {
     // const tiny::ScopedTimer timer("Wake Influence");
     const u64 nc = dd_mesh->nc;
     const u64 ns = dd_mesh->ns;
     const u64 nw = dd_mesh->nw;
 
-    ispc::Mesh2 mesh = mesh_to_ispc(dd_mesh);
+    ispc::Mesh mesh = mesh_to_ispc(dd_mesh);
 
     tf::Taskflow taskflow;
 
@@ -165,7 +173,7 @@ void BackendCPU::add_wake_influence() {
     Executor::get().run(taskflow).wait();
 }
 
-void BackendCPU::wake_rollup(float dt) {
+void BackendCPU::displace_wake_rollup(float dt) {
     const tiny::ScopedTimer timer("Wake Rollup");
 
     const u64 nc = dd_mesh->nc;
@@ -180,7 +188,7 @@ void BackendCPU::wake_rollup(float dt) {
     const f32* ry = dd_data->rollup_vertices + (nc+nw+1)*(ns+1)*1;
     const f32* rz = dd_data->rollup_vertices + (nc+nw+1)*(ns+1)*2;
 
-    ispc::Mesh2 mesh = mesh_to_ispc(dd_mesh);
+    ispc::Mesh mesh = mesh_to_ispc(dd_mesh);
 
     tf::Taskflow taskflow;
 
@@ -201,7 +209,7 @@ void BackendCPU::wake_rollup(float dt) {
     Executor::get().run(taskflow).wait();
 }
 
-void BackendCPU::shed_gamma() {
+void BackendCPU::gamma_shed() {
     // const tiny::ScopedTimer timer("Shed Gamma");
     const u64 nc = dd_mesh->nc;
     const u64 ns = dd_mesh->ns;
@@ -230,7 +238,7 @@ void BackendCPU::lu_solve() {
     LAPACKE_sgetrs(LAPACK_COL_MAJOR, 'N', n, 1, dd_data->lhs, n, d_solver_ipiv, dd_data->gamma, n);
 }
 
-f32 BackendCPU::compute_coefficient_cl(const FlowData& flow, const f32 area, const u64 j, const u64 n) {
+f32 BackendCPU::coeff_steady_cl(const FlowData& flow, const f32 area, const u64 j, const u64 n) {
     // const tiny::ScopedTimer timer("Compute CL");
 
     const u64 nc = dd_mesh->nc;
@@ -264,7 +272,7 @@ f32 BackendCPU::compute_coefficient_cl(const FlowData& flow, const f32 area, con
     return cl;
 }
 
-f32 BackendCPU::compute_coefficient_unsteady_cl(const linalg::alias::float3& freestream, const SoA_3D_t<f32>& vel, f32 dt, const f32 area, const u64 j, const u64 n) {
+f32 BackendCPU::coeff_unsteady_cl(const linalg::alias::float3& freestream, const SoA_3D_t<f32>& vel, f32 dt, const f32 area, const u64 j, const u64 n) {
     assert(n > 0);
     assert(j >= 0 && j+n <= dd_mesh->ns);
 
@@ -315,7 +323,7 @@ f32 BackendCPU::compute_coefficient_unsteady_cl(const linalg::alias::float3& fre
     return cl;
 }
 
-linalg::alias::float3 BackendCPU::compute_coefficient_cm(
+linalg::alias::float3 BackendCPU::coeff_steady_cm(
     const FlowData& flow,
     const f32 area,
     const f32 chord,
@@ -352,7 +360,7 @@ linalg::alias::float3 BackendCPU::compute_coefficient_cm(
     return cm;
 }
 
-f32 BackendCPU::compute_coefficient_cd(
+f32 BackendCPU::coeff_steady_cd(
     const FlowData& flow,
     const f32 area,
     const u64 j,
@@ -362,7 +370,7 @@ f32 BackendCPU::compute_coefficient_cd(
     assert(n > 0);
     assert(j >= 0 && j+n <= dd_mesh->ns);
     // std::fill(dd_data->trefftz_buffer, dd_data->trefftz_buffer + hd_mesh->ns, 0.0f);
-    ispc::Mesh2 mesh = mesh_to_ispc(dd_mesh);
+    ispc::Mesh mesh = mesh_to_ispc(dd_mesh);
     f32 cd = ispc::kernel_trefftz_cd2(&mesh, dd_data->gamma, j, n, sigma_vatistas);
     cd /= linalg::length2(flow.freestream) * area;
     return cd;
@@ -519,7 +527,7 @@ f32 BackendCPU::mesh_area(const u64 i, const u64 j, const u64 m, const u64 n) {
     return area_sum;
 }
 
-void BackendCPU::mesh_move(const linalg::alias::float4x4& transform) {
+void BackendCPU::displace_wing_and_shed(const linalg::alias::float4x4& transform) {
     // const tiny::ScopedTimer t("Mesh::move");
     assert(dd_mesh->nwa < dd_mesh->nw); // check if we have capacity
     
diff --git a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
index d4d2df4..51179c2 100644
--- a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
+++ b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
@@ -44,7 +44,7 @@ inline F3 quad_normal(const F3& v0, const F3& v1, const F3& v2, const F3& v3) {
     return normalize(cross(v3-v1, v2-v0));
 }
 
-export struct Mesh2 {
+export struct Mesh {
     float* vertices; // (nc+nw+1)*(ns+1)*3
     // TODO: evaluate if we really need those values or we can just compute on the fly in the kernels
     float* normals; // nc*ns*3
@@ -109,7 +109,7 @@ inline void kernel_symmetry(float3& inf, C colloc, const V& vertex0, const V& ve
 }
 
 export void kernel_influence(
-    uniform const Mesh2* uniform m,
+    uniform const Mesh* uniform m,
     uniform float* uniform lhs,
     uniform uint64 ia, uniform uint64 lidx, uniform float sigma
     ) {
@@ -153,7 +153,7 @@ export void kernel_influence(
 }
 
 export void kernel_wake_influence(
-    uniform const Mesh2* uniform m,
+    uniform const Mesh* uniform m,
     uniform uint64 lidx, uniform float* uniform gamma, uniform float* uniform rhs, uniform float sigma
     ) {
     const uniform uint64 nb_panels = m->ns * m->nc;
@@ -199,7 +199,7 @@ export void kernel_wake_influence(
 }
 
 export void kernel_rollup(
-    uniform const Mesh2* uniform m, uniform float dt, 
+    uniform const Mesh* uniform m, uniform float dt, 
     uniform float* uniform rollup,
     uniform uint64 vidx, uniform float* uniform gamma, uniform float sigma
     ) {
@@ -269,7 +269,7 @@ export void kernel_rollup(
 }
 
 // export uniform float kernel_trefftz_cd(
-//     uniform const Mesh2* uniform m,
+//     uniform const Mesh* uniform m,
 //     uniform float* uniform gamma,
 //     uniform float* uniform trefftz_buffer,
 //     uniform uint64 j, uniform uint64 n, uniform float sigma
@@ -325,7 +325,7 @@ export void kernel_rollup(
 // }
 
 export uniform float kernel_trefftz_cd2(
-    uniform const Mesh2* uniform m,
+    uniform const Mesh* uniform m,
     uniform float* uniform gamma,
     uniform uint64 jp, uniform uint64 n, uniform float sigma
     ) {
@@ -382,7 +382,7 @@ export uniform float kernel_trefftz_cd2(
 }
 
 export uniform float kernel_trefftz_cl(
-    uniform const Mesh2* uniform m,
+    uniform const Mesh* uniform m,
     uniform float* uniform gamma,
     uniform uint64 j, uniform uint64 n
     ) {
diff --git a/vlm/include/vlm.hpp b/vlm/include/vlm.hpp
index 194c590..bb8ce00 100644
--- a/vlm/include/vlm.hpp
+++ b/vlm/include/vlm.hpp
@@ -8,77 +8,123 @@
 
 namespace vlm {
 
-class Solver {
+class Simulation {
     public:
     std::unique_ptr<Backend> backend;
-    Solver(const tiny::Config& cfg);
-    Solver(std::unique_ptr<Backend>&& backend_) : backend(std::move(backend_)) {};
-    virtual ~Solver() = default;
-};
+    Mesh2 mesh;
 
-class NonLinearVLM final: public Solver {
-    public:
-    static constexpr f64 DEFAULT_TOL = 1e-5;
-    static constexpr u64 DEFAULT_MAX_ITER = 100;
-
-    const f64 tol;
-    const u64 max_iter;
-    f32* strip_alphas = nullptr; // ns
-    f32* velocities = nullptr; // nc*ns*3
-
-    NonLinearVLM( // init from cfg
-        const tiny::Config& cfg,
-        const f64 tol_= DEFAULT_TOL,
-        const u64 max_iter_ = DEFAULT_MAX_ITER
-    ):
-        Solver(cfg),
-        tol(tol_),
-        max_iter(max_iter_)
-    {
-        alloc();
-    } 
-    NonLinearVLM( // init from backend
-        std::unique_ptr<Backend>&& backend_,
-        const f64 tol_= DEFAULT_TOL,
-        const u64 max_iter_ = DEFAULT_MAX_ITER
-    ) :
-        Solver(std::move(backend_)),
-        tol(tol_),
-        max_iter(max_iter_)
-    {
-        alloc();
-    }
-    ~NonLinearVLM() {
-        dealloc();
-    };
-    AeroCoefficients solve(const FlowData& flow, const Database& db);
+    Simulation(const std::string& backend_name, const std::vector<std::string>& meshes) : backend(create_backend(backend_name)), mesh(backend->allocator) {
+        // Read the sizes of all the meshes
+        {
+            u64 off_wing_p = 0;
+            u64 off_wing_v = 0;
+            for (const auto& m_name : meshes) {
+                const MeshIO mesh_io{"plot3d"}; // TODO, infer from mesh_name
+                auto [nc,ns] = mesh_io.get_dims(m_name);
+                mesh.params.emplace_back(MeshParams{nc, ns, 0, 0, off_wing_p, off_wing_v, 0, 0, true});
+                off_wing_p += mesh.params.back().nb_panels_wing();
+                off_wing_v += mesh.params.back().nb_vertices_wing();
+            }
+        }
+        mesh.alloc_wing();
 
-    private:
-    void alloc() {
-        strip_alphas = (f32*)backend->allocator.h_malloc(backend->hd_mesh->ns * sizeof(f32));
-        velocities = (f32*)backend->allocator.h_malloc(backend->hd_mesh->nc * backend->hd_mesh->ns * 3 * sizeof(f32));
-    }
-    void dealloc() {
-        backend->allocator.h_free(strip_alphas);
-        backend->allocator.h_free(velocities);        
-    }
+        // Perform the actual read of the mesh files
+        for (u64 i = 0; i < meshes.size(); i++) {
+            const MeshIO mesh_io{"plot3d"}; // TODO, infer from mesh_name
+            mesh_io.read(meshes[i], mesh.verts_wing_init.h_ptr() + mesh.params[i].off_wing_p);
+        }
+    };
+    virtual ~Simulation() = default;
 };
 
-class LinearVLM final: public Solver {
+class VLM final: public Simulation {
     public:
-    LinearVLM(const tiny::Config& cfg): Solver(cfg) {}
-    LinearVLM(std::unique_ptr<Backend>&& backend_): Solver(std::move(backend_)) {};
-    ~LinearVLM() = default;
-    AeroCoefficients solve(const FlowData& flow);
+        VLM(const std::string& backend_name, const std::vector<std::string>& meshes);
+        ~VLM() = default;
+        AeroCoefficients run(const FlowData& flow);
+    private:
+        void alloc_buffers();
+        Buffer<f32, Device> lhs; // (ns*nc)^2
+        Buffer<f32, Device> rhs; // ns*nc
+        Buffer<f32, HostDevice> gamma_wing; // nc*ns
+        Buffer<f32, Device> gamma_wake; // nw*ns
+        Buffer<f32, Device> gamma_wing_prev; // nc*ns
+        Buffer<f32, Device> gamma_wing_delta; // nc*ns
+        Buffer<f32, Device> local_velocities; // ns*nc*3
 };
 
-class UVLM final: public Solver {
-    public:
+// class Solver {
+//     public:
+//     std::unique_ptr<Backend> backend;
+//     Solver(const tiny::Config& cfg);
+//     Solver(std::unique_ptr<Backend>&& backend_) : backend(std::move(backend_)) {};
+//     virtual ~Solver() = default;
+// };
 
-    UVLM(const tiny::Config& cfg): Solver(cfg) {}
-    UVLM(std::unique_ptr<Backend>&& backend_): Solver(std::move(backend_)) {};
-    ~UVLM() = default;
-    AeroCoefficients solve(const FlowData& flow);
-};
+// class NonLinearVLM final: public Solver {
+//     public:
+//     static constexpr f64 DEFAULT_TOL = 1e-5;
+//     static constexpr u64 DEFAULT_MAX_ITER = 100;
+
+//     const f64 tol;
+//     const u64 max_iter;
+//     f32* strip_alphas = nullptr; // ns
+//     f32* velocities = nullptr; // nc*ns*3
+
+//     NonLinearVLM( // init from cfg
+//         const tiny::Config& cfg,
+//         const f64 tol_= DEFAULT_TOL,
+//         const u64 max_iter_ = DEFAULT_MAX_ITER
+//     ):
+//         Solver(cfg),
+//         tol(tol_),
+//         max_iter(max_iter_)
+//     {
+//         alloc();
+//     } 
+//     NonLinearVLM( // init from backend
+//         std::unique_ptr<Backend>&& backend_,
+//         const f64 tol_= DEFAULT_TOL,
+//         const u64 max_iter_ = DEFAULT_MAX_ITER
+//     ) :
+//         Solver(std::move(backend_)),
+//         tol(tol_),
+//         max_iter(max_iter_)
+//     {
+//         alloc();
+//     }
+//     ~NonLinearVLM() {
+//         dealloc();
+//     };
+//     AeroCoefficients solve(const FlowData& flow, const Database& db);
+
+//     private:
+//     void alloc() {
+//         strip_alphas = (f32*)backend->allocator.h_malloc(backend->hd_mesh->ns * sizeof(f32));
+//         velocities = (f32*)backend->allocator.h_malloc(backend->hd_mesh->nc * backend->hd_mesh->ns * 3 * sizeof(f32));
+//     }
+//     void dealloc() {
+//         backend->allocator.h_free(strip_alphas);
+//         backend->allocator.h_free(velocities);        
+//     }
+// };
+
+// class LinearVLM final: public Solver {
+//     public:
+//     LinearVLM(const tiny::Config& cfg): Solver(cfg) {}
+//     LinearVLM(std::unique_ptr<Backend>&& backend_): Solver(std::move(backend_)) {};
+//     ~LinearVLM() = default;
+//     AeroCoefficients solve(const FlowData& flow);
+// };
+
+
+// class UVLM final: public Solver {
+//     public:
+
+//     UVLM(const tiny::Config& cfg): Solver(cfg) {}
+//     UVLM(std::unique_ptr<Backend>&& backend_): Solver(std::move(backend_)) {};
+//     ~UVLM() = default;
+//     AeroCoefficients solve(const FlowData& flow);
+// };
 
 } // namespace vlm
diff --git a/vlm/include/vlm_allocator.hpp b/vlm/include/vlm_allocator.hpp
index 3ac1e88..c91c555 100644
--- a/vlm/include/vlm_allocator.hpp
+++ b/vlm/include/vlm_allocator.hpp
@@ -1,23 +1,212 @@
 #pragma once
 
+#include <cstddef>
+#include <cassert>
+#include <type_traits>
+
 namespace vlm {
 
-typedef void* (*malloc_f)(size_t size);
-typedef void (*free_f)(void* ptr);
-typedef void* (*memcpy_f)(void* dst, const void* src, size_t size);
-typedef void* (*memset_f)(void* dst, int value, size_t size);
+using malloc_f = void* (*)(std::size_t size);
+using free_f = void (*)(void* ptr);
+using memcpy_f = void* (*)(void* dst, const void* src, std::size_t size);
+using memset_f = void* (*)(void* dst, int value, std::size_t size);
 
 struct Allocator {
-    malloc_f h_malloc;
-    malloc_f d_malloc;
-    free_f h_free;
-    free_f d_free;
-    memcpy_f hh_memcpy;
-    memcpy_f hd_memcpy;
-    memcpy_f dh_memcpy;
-    memcpy_f dd_memcpy;
-    memset_f h_memset;
-    memset_f d_memset;
+    const malloc_f h_malloc;
+    const malloc_f d_malloc;
+    const free_f h_free;
+    const free_f d_free;
+    const memcpy_f hh_memcpy;
+    const memcpy_f hd_memcpy;
+    const memcpy_f dh_memcpy;
+    const memcpy_f dd_memcpy;
+    const memset_f h_memset;
+    const memset_f d_memset;
+
+    Allocator() = delete;
+    Allocator(
+        const malloc_f h_malloc,
+        const malloc_f d_malloc,
+        const free_f h_free,
+        const free_f d_free,
+        const memcpy_f hh_memcpy,
+        const memcpy_f hd_memcpy,
+        const memcpy_f dh_memcpy,
+        const memcpy_f dd_memcpy,
+        const memset_f h_memset,
+        const memset_f d_memset
+    ) : h_malloc(h_malloc), d_malloc(d_malloc), h_free(h_free), d_free(d_free),
+        hh_memcpy(hh_memcpy), hd_memcpy(hd_memcpy), dh_memcpy(dh_memcpy), dd_memcpy(dd_memcpy),
+        h_memset(h_memset), d_memset(d_memset) {}
+};
+
+struct Host {};
+struct Device {};
+struct HostDevice {};
+
+// template<typename T, typename Location>
+// class Buffer {
+//     static_assert(std::is_same_v<Location, Host> || std::is_same_v<Location, Device> || std::is_same_v<Location, HostDevice>, "Invalid Location type");
+//     using is_host = std::bool_constant<std::is_same_v<Location, Host> || std::is_same_v<Location, HostDevice>>;
+//     using is_device = std::bool_constant<std::is_same_v<Location, Device> || std::is_same_v<Location, HostDevice>>;
+
+// public:
+//     explicit Buffer(Allocator* allocator, std::size_t n) : allocator(allocator) { alloc(n); }
+//     explicit Buffer() = default;
+//     ~Buffer() { dealloc(); }
+
+//     T* h_ptr() {
+//         static_assert(is_host::value);
+//         return _host;
+//     }
+//     const T* h_ptr() const {
+//         static_assert(is_host::value);
+//         return _host;
+//     }
+//     T* d_ptr() {
+//         static_assert(is_device::value);
+//         return _device;
+//     }
+//     const T* d_ptr() const {
+//         static_assert(is_device::value);
+//         return _device;
+//     }
+
+//     void to_device() {
+//         static_assert(is_host::value && is_device::value);
+//         if (allocator->d_malloc != allocator->h_malloc) 
+//             allocator->hd_memcpy(_device, static_cast<void*>(_host), _size * sizeof(T));
+//     }
+
+//     void to_host() {
+//         static_assert(is_host::value && is_device::value);
+//         if (allocator->d_malloc != allocator->h_malloc) 
+//             allocator->dh_memcpy(_host, static_cast<void*>(_device), _size * sizeof(T));
+//     }
+
+//     void set_allocator(Allocator* allocator) {
+//         this->allocator = allocator;
+//     }
+
+// private:
+//     // 32 bytes max
+//     Allocator* allocator = nullptr;
+//     T* _host = nullptr;
+//     T* _device = nullptr;
+//     std::size_t _size = 0;
+
+//     void alloc(std::size_t n) {
+//         assert(allocator);
+//         _size = n;
+//         if constexpr (is_host::value) {
+//             _host = static_cast<T*>(allocator->h_malloc(_size * sizeof(T)));
+//         }
+//         if constexpr (is_device::value) {
+//             if constexpr (!is_host::value) assert(allocator->d_malloc != allocator->h_malloc);
+//             _device = (allocator->d_malloc != allocator->h_malloc) 
+//                 ? static_cast<T*>(allocator->d_malloc(_size * sizeof(T))) 
+//                 : _host;
+//         }
+//     }
+
+//     void dealloc() {
+//         if (allocator) {
+//             if constexpr (is_host::value) {
+//                 allocator->h_free(_host);
+//             }
+//             if constexpr (is_device::value) {
+//                 if (allocator->d_malloc != allocator->h_malloc) allocator->d_free(_device);
+//             }
+//         }
+//     }
+
+//     Buffer(Buffer&&) = delete;
+//     Buffer(const Buffer&) = delete;
+//     Buffer& operator=(Buffer&&) = delete;
+//     Buffer& operator=(const Buffer&) = delete;
+// };
+
+template<typename T, typename Location>
+class Buffer {
+    static_assert(std::is_same_v<Location, Host> || std::is_same_v<Location, Device> || std::is_same_v<Location, HostDevice>, "Invalid Location type");
+    using is_host = std::bool_constant<std::is_same_v<Location, Host> || std::is_same_v<Location, HostDevice>>;
+    using is_device = std::bool_constant<std::is_same_v<Location, Device> || std::is_same_v<Location, HostDevice>>;
+
+public:
+    Buffer() = delete;
+    explicit Buffer(const Allocator& allocator, std::size_t n) : allocator(allocator) { alloc(n); }
+    explicit Buffer(const Allocator& allocator) : allocator(allocator) {}
+    ~Buffer() { dealloc(); }
+
+    T* h_ptr() {
+        static_assert(is_host::value);
+        return _host;
+    }
+    const T* h_ptr() const {
+        static_assert(is_host::value);
+        return _host;
+    }
+    T* d_ptr() {
+        static_assert(is_device::value);
+        return _device;
+    }
+    const T* d_ptr() const {
+        static_assert(is_device::value);
+        return _device;
+    }
+
+    void to_device() {
+        static_assert(is_host::value && is_device::value);
+        if (allocator.d_malloc != allocator.h_malloc) 
+            allocator.hd_memcpy(_device, static_cast<void*>(_host), _size * sizeof(T));
+    }
+
+    void to_host() {
+        static_assert(is_host::value && is_device::value);
+        if (allocator.d_malloc != allocator.h_malloc) 
+            allocator.dh_memcpy(_host, static_cast<void*>(_device), _size * sizeof(T));
+    }
+
+    void set_allocator(Allocator* allocator) {
+        this->allocator = allocator;
+    }
+
+    std::size_t len() const { return _size; }
+    std::size_t size() const { return _size * sizeof(T); }
+
+    void alloc(std::size_t n) {
+        _size = n;
+        if constexpr (is_host::value) {
+            _host = static_cast<T*>(allocator.h_malloc(_size * sizeof(T)));
+        }
+        if constexpr (is_device::value) {
+            if constexpr (!is_host::value) assert(allocator.d_malloc != allocator.h_malloc);
+            _device = (allocator.d_malloc != allocator.h_malloc) 
+                ? static_cast<T*>(allocator.d_malloc(_size * sizeof(T))) 
+                : _host;
+        }
+    }
+
+    void dealloc() {
+        if constexpr (is_host::value) {
+            allocator.h_free(_host);
+        }
+        if constexpr (is_device::value) {
+            if (allocator.d_malloc != allocator.h_malloc) allocator.d_free(_device);
+        }
+    }
+
+private:
+    // 32 bytes max
+    const Allocator& allocator;
+    T* _host = nullptr;
+    T* _device = nullptr;
+    std::size_t _size = 0;
+
+    Buffer(Buffer&&) = delete;
+    Buffer(const Buffer&) = delete;
+    Buffer& operator=(Buffer&&) = delete;
+    Buffer& operator=(const Buffer&) = delete;
 };
 
 } // namespace vlm
\ No newline at end of file
diff --git a/vlm/include/vlm_backend.hpp b/vlm/include/vlm_backend.hpp
index a818aab..fcd08b7 100644
--- a/vlm/include/vlm_backend.hpp
+++ b/vlm/include/vlm_backend.hpp
@@ -10,56 +10,41 @@ namespace vlm {
 
 class Backend {
     public:
-        Allocator allocator;
-        
-        // Constant inital position meshes (for kinematics)
-        MeshGeom* hh_mesh_geom; // borrowed ptr
-        MeshGeom* hd_mesh_geom; 
-        MeshGeom* dd_mesh_geom; 
-
-        // Mutable meshes (temporal state)
-        // Mesh2* hh_mesh; // host ptr to host buffers for io
-        Mesh2* hd_mesh; // host ptr to device buffers
-        Mesh2* dd_mesh; // device ptr to device buffers for kernels
-
-        Data* hd_data;
-        Data* dd_data;
+        const Allocator allocator;
+        std::vector<MeshParams> prm;
 
         i32* d_solver_info = nullptr;
         i32* d_solver_ipiv = nullptr;
         f32* d_solver_buffer = nullptr;
 
         f32 sigma_vatistas = 0.0f;
-        Backend() = default;
+        Backend(const Allocator& allocator) : allocator(allocator) {}
         ~Backend();
-        void init(MeshGeom* mesh_geom, u64 timesteps); // Acts as delayed constructor
-        virtual void reset() = 0;
-        virtual void lhs_assemble() = 0;
-        virtual void compute_rhs() = 0;
-        virtual void add_wake_influence() = 0;
-        virtual void wake_rollup(float dt) = 0;
-        virtual void shed_gamma() = 0;
-        virtual void lu_factor() = 0;
-        virtual void lu_solve() = 0;
-        virtual f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u64 j, const u64 n) = 0;
-        virtual f32 compute_coefficient_unsteady_cl(const linalg::alias::float3& freestream, const SoA_3D_t<f32>& vel, f32 dt, const f32 area, const u64 j, const u64 n) = 0;
-        f32 compute_coefficient_cl(const FlowData& flow);
-        virtual linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u64 j, const u64 n) = 0;
-        linalg::alias::float3 compute_coefficient_cm(const FlowData& flow);
-        virtual f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u64 j, const u64 n) = 0;
-        f32 compute_coefficient_cd(const FlowData& flow);
-        virtual void compute_delta_gamma() = 0;
-        virtual void set_velocities(const linalg::alias::float3& vel) = 0;
-        virtual void set_velocities(const f32* vels) = 0;
+
+        // Kernels that run for all the meshes
+        virtual void lhs_assemble(f32* lhs) = 0;
+        virtual void rhs_assemble_velocities(f32* rhs, const f32* normals, const f32* velocities) = 0;
+        virtual void rhs_assemble_wake_influence(f32* rhs, const f32* gamma) = 0;
+        virtual void displace_wake_rollup(float dt, f32* rollup_vertices, f32* verts_wake, const f32* verts_wing, const f32* gamma) = 0;
+        // virtual void displace_wing(const linalg::alias::float4x4& transform) = 0;
+        virtual void displace_wing_and_shed(const std::vector<linalg::alias::float4x4>& transform, f32* verts_wing, f32* verts_wake) = 0;
+        virtual void gamma_shed(f32* gamma, f32* gamma_prev) = 0;
+        virtual void gamma_delta(f32* gamma_delta, const f32* gamma) = 0;
+        virtual void lu_factor(f32* lhs) = 0;
+        virtual void lu_solve(f32* lhs, f32* rhs, f32* gamma) = 0;
+        
+        // Per mesh kernels
+        virtual f32 coeff_steady_cl(const MeshParams& param, const f32* verts_wing, const f32* gamma_delta, const FlowData& flow, const f32 area, const u64 j, const u64 n) = 0;
+        virtual f32 coeff_unsteady_cl(const MeshParams& param, const f32* verts_wing, const f32* gamma_delta, const f32* gamma, const f32* gamma_prev, const f32* local_velocities, const f32* areas, const f32* normals, const linalg::alias::float3& freestream, f32 dt, const f32 area, const u64 j, const u64 n) = 0;
+        virtual linalg::alias::float3 coeff_steady_cm(const MeshParams& param, const FlowData& flow, const f32 area, const f32 chord, const u64 j, const u64 n) = 0;
+        virtual f32 coeff_steady_cd(const MeshParams& param, const FlowData& flow, const f32 area, const u64 j, const u64 n) = 0;
 
         virtual void mesh_metrics(const f32 alpha) = 0;
-        virtual void mesh_move(const linalg::alias::float4x4& transform) = 0;
-        virtual void update_wake(const linalg::alias::float3& freestream) = 0; // TEMPORARY
-        virtual f32 mesh_mac(u64 j, u64 n) = 0; // mean chord
-        virtual f32 mesh_area(const u64 i, const u64 j, const u64 m, const u64 n) = 0; // mean span
+        virtual f32 mesh_mac(const MeshParams& param, const u64 j, const u64 n) = 0;
+        virtual f32 mesh_area(const MeshParams& param, const u64 i, const u64 j, const u64 m, const u64 n) = 0;
 };
 
-std::unique_ptr<Backend> create_backend(const std::string& backend_name, MeshGeom* mesh, int timesteps);
+std::unique_ptr<Backend> create_backend(const std::string& backend_name);
 std::vector<std::string> get_available_backends();
 
 } // namespace vlm
diff --git a/vlm/include/vlm_mesh.hpp b/vlm/include/vlm_mesh.hpp
index cee8fea..a042382 100644
--- a/vlm/include/vlm_mesh.hpp
+++ b/vlm/include/vlm_mesh.hpp
@@ -1,11 +1,11 @@
 #pragma once
 
+#include <iosfwd>
+
 #include "vlm_types.hpp"
 #include "vlm_allocator.hpp"
 
 #include "tinyfwd.hpp"
-#include "tinyview.hpp"
-
 
 #define PTR_MESHGEOM_V(m, i,j,k) (m->vertices + (j) + (i) * (m->ns+1) + (k) * (m->nc+1) * (m->ns+1))
 #define PTR_MESH_V(m, i,j,k) (m->vertices + (j) + (i) * (m->ns+1) + (k) * (m->nc+m->nw+1) * (m->ns+1))
@@ -14,138 +14,76 @@
 
 namespace vlm {
 
-// C interface
+struct MeshParams {
+    u64 nc = 0; // nb of panels chordwise
+    u64 ns = 0; // nb of panels spanwise
+    u64 nw = 0; // nb of wake panels chordwise
+    u64 nwa = 0; // nb of acive wake panels chordwise
+    u64 off_wing_p = 0; // wing panel offset for buffers
+    u64 off_wing_v = 0; // wing vertex offset for buffers
+    u64 off_wake_p = 0; // wake panel offset for buffers
+    u64 off_wake_v = 0; // wake vertex offset for buffers
+    bool wake = true; // whether the surface produces wake
+
+    inline u64 nb_panels_wing() {return nc * ns;}
+    inline u64 nb_panels_wake() {return nw * ns;}
+    inline u64 nb_vertices_wing() {return (nc+1) * (ns+1);}
+    inline u64 nb_vertices_wake() {return (nw+1) * (ns+1);}
+};
+
+// TODO: move functions into cpp file
 struct Mesh2 {
-    f32* vertices = nullptr; // (nc+nw+1)*(ns+1)*3
-    //f32* verts_wake = nullptr; // (nw+1)*(ns+1)*3
-    // TODO: evaluate if we really need those values or we can just compute on the fly in the kernels
-    f32* normals = nullptr; // nc*ns*3
-    f32* colloc = nullptr; // nc*ns*3
-    f32* area = nullptr; // nc*ns
+    Buffer<f32, HostDevice> verts_wing_init; // (nc+1)*(ns+1)*3
+    Buffer<f32, HostDevice> verts_wing; // (nc+1)*(ns+1)*3
+    Buffer<f32, HostDevice> verts_wake; // (nw+1)*(ns+1)*3
+    Buffer<f32, Device> normals; // nc*ns*3
+    Buffer<f32, Device> colloc; // nc*ns*3
+    Buffer<f32, Device> area; // nc*ns
 
-    u64 nc = 0; // number of wing panels chordwise
-    u64 ns = 0; // number of wing panels spanwise
-    u64 nw = 0; // number of wake panels chordwise
-    u64 nwa = 0; // number of wake panels active chordwise
+    std::vector<MeshParams> params;
 
-    f32 s_ref = 0.0f; // reference area
-    f32 c_ref = 0.0f; // reference chord
     f32 frame[16] = {
         1.0f, 0.0f, 0.0f, 0.0f,
         0.0f, 1.0f, 0.0f, 0.0f,
         0.0f, 0.0f, 1.0f, 0.0f,
         0.0f, 0.0f, 0.0f, 1.0f
-    }; // Col major order (TODO: move this to kinematic tracker)
-
-    char* name = nullptr; // mesh name
-    bool lifting = true;
+    }; // Col major order (TODO: move this to kinematic tracker?)
+
+    Mesh2(const Allocator& allocator) : 
+    verts_wing_init(allocator), verts_wing(allocator), verts_wake(allocator),
+    normals(allocator), colloc(allocator), area(allocator) {};
+    void alloc_wing() {
+        const u64 total_verts = params.back().off_wing_v + params.back().nb_panels_wing();
+        const u64 total_panels = params.back().off_wing_p + params.back().nb_vertices_wing();
+        verts_wing_init.alloc(total_verts*3);
+        verts_wing.alloc(total_verts*3);
+        normals.alloc(total_panels*3);
+        colloc.alloc(total_panels*3);
+        area.alloc(total_panels);
+    }
+    void alloc_wake() {
+        const u64 total_wake_verts = params.back().off_wake_v + params.back().nb_vertices_wake();
+        verts_wake.alloc(total_wake_verts*3);
+    }
 };
 
-inline u64 mesh_nb_panels_wing(const Mesh2* m) {return m->nc * m->ns;}
-inline u64 mesh_nb_panels_total(const Mesh2* m) {return (m->nc+m->nw) * m->ns;}
-inline u64 mesh_nb_vertices_wing(const Mesh2* m) {return (m->nc+1) * (m->ns+1);}
-inline u64 mesh_nb_vertices_total(const Mesh2* m) {return (m->nc+m->nw+1) * (m->ns+1);}
+using SurfDims = std::pair<u64, u64>; // nc, ns
 
-// Contains only the raw geometry 
-struct MeshGeom {
-    f32* vertices = nullptr;
-    u64 nc = 0; // number of wing panels chordwise
-    u64 ns = 0;
+class MeshFile {
+public:
+    virtual ~MeshFile() = default;
+    virtual SurfDims get_dims(std::ifstream& file) const = 0;
+    virtual SurfDims read(std::ifstream& file, f32* vertices) const = 0;
 };
 
-void mesh_quarterchord(MeshGeom* mesh_geom);
-void mesh_io_read_file(const std::string& filename, MeshGeom* mesh_geom);
-void mesh_alloc(const malloc_f malloc, Mesh2* mesh, u64 nc, u64 ns, u64 nw, u64 nwa);
-void mesh_free(const free_f free, Mesh2* mesh);
-void mesh_copy(const memcpy_f memcpy, Mesh2* dst, const Mesh2* src);
-
-// // === STRUCTURED MESH ===
-// // nc : number of panels chordwise
-// // ns : number of panels spanwise
-// // nw : number of panels in chordwise wake
-// // ncw : nc + nw
-// class Mesh {
-//     public:
-//     // Unstructured members (for exporting results)
-//     // ---------------------
-//     // All vertices stored in single SoA for result exporting
-//     // (stored in span major order)
-//     SoA_3D_t<f32> v; // size (ncw+1)*(ns+1)
-//     // Offsets for indexing in connectivity array for each panel
-//     std::vector<u64> offsets = {}; // size nc*ns + 1
-//     // Panel-vertex connectivity
-//     // vertices indices of a panel: connectivity[offsets[i]] to connectivity[offsets[i+1]]
-//     std::vector<u64> connectivity = {}; // size 4*(nc*ns)
-
-//     // Panels metrics (stored span major order)
-//     // ---------------------
-//     // Collocation points of panels
-//     SoA_3D_t<f32> colloc; // size ncw*ns
-//     // Normals of panels
-//     SoA_3D_t<f32> normal; // size ncw*ns
-//     // Area of panels
-//     std::vector<f32> area = {}; // size ncw*ns
-
-//     // Structured dimensions
-//     // ---------------------
-//     u64 nc = 1; // number of panels chordwise
-//     u64 ns = 1; // number of panels spanwise
-//     u64 nw; // number of wake panels chordwise (max capacity)
-//     u64 current_nw = 0; // current number of built wake panels (temporary)
-
-//     // Analytical quanities when provided
-//     // ---------------------
-//     f32 s_ref = 0.0f; // reference area (of wing)
-//     f32 c_ref = 0.0f; // reference chord (of wing)
-
-//     linalg::alias::float4x4 frame = linalg::identity; // transformation matrix
-//     linalg::alias::float3 ref_pt = {0.25f, 0.0f, 0.0f}; // TODO: deprecate
-
-//     void update_wake(const linalg::alias::float3& u_inf); // update wake vertices
-//     void correction_high_aoa(f32 alpha_rad); // correct collocation points for high aoa
-//     void create_vortex_panels(); // create true vortex panels
-//     void compute_connectivity(); // fill offsets, connectivity
-//     void compute_metrics_wing(); // fill colloc, normal, area
-//     void compute_metrics_wake();
-//     void compute_metrics_i(u64 i);
-//     void move(const linalg::alias::float4x4& transform, const SoA_3D_t<f32>& origin_pos);
-//     void resize_wake(const u64 nw);
- 
-//     u64 nb_panels_wing() const;
-//     u64 nb_panels_total() const;
-//     u64 nb_vertices_wing() const;
-//     u64 nb_vertices_total() const;
-//     f32 panels_area(const u64 i, const u64 j, const u64 m, const u64 n) const;
-//     f32 panels_area_xy(const u64 i, const u64 j, const u64 m, const u64 n) const;
-//     f32 panel_width_y(const u64 i, const u64 j) const;
-//     f32 panel_length(const u64 i, const u64 j) const;
-//     f32 strip_width(const u64 j) const;
-//     f32 chord_length(const u64 j) const; // vertex idx
-//     f32 chord_mean(const u64 j, const u64 n) const;
-
-//     // i panel vertices coordinates
-//     linalg::alias::float3 get_v0(u64 i) const; // upper left
-//     linalg::alias::float3 get_v1(u64 i) const; // upper right
-//     linalg::alias::float3 get_v2(u64 i) const; // lower right
-//     linalg::alias::float3 get_v3(u64 i) const; // lower left
-
-//     void io_read(const std::string& filename);
-//     Mesh(const tiny::Config& cfg);
-//     Mesh(
-//         const std::string& filename,
-//         const u64 nw,
-//         const f32 s_ref_,
-//         const f32 c_ref_,
-//         const linalg::alias::float3& ref_pt_ // todo: deprecate
-//     );
-    
-//     private:
-//     void alloc(); // allocate memory for all buffers
-//     void init(); // called at the end of constructor
-//     void io_read_plot3d_structured(std::ifstream& f);
-// };
-
-// // todo, update this to mirror the constructor
-// std::unique_ptr<Mesh> create_mesh(const std::string& filename, const u64 nw=1);
+class MeshIO {
+public:
+    MeshIO(const std::string& format);
+    SurfDims get_dims(const std::string& filename) const;
+    void read(const std::string& filename, f32* vertices) const;
+    // void write(const std::string& filename, const Mesh* mesh) const;
+private:
+    std::unique_ptr<MeshFile> _file;
+};
 
 } // namespace vlm
\ No newline at end of file
diff --git a/vlm/src/vlm.cpp b/vlm/src/vlm.cpp
index c4588e0..a6fd658 100644
--- a/vlm/src/vlm.cpp
+++ b/vlm/src/vlm.cpp
@@ -23,92 +23,123 @@ using namespace vlm;
 //     backend = create_backend(backend_name, *mesh);
 // }
 
-AeroCoefficients LinearVLM::solve(const FlowData& flow) {
-    auto init_pos = translation_matrix<f32>({
-        -100.0f * flow.u_inf*std::cos(flow.alpha),
-        0.0f,
-        -100.0f * flow.u_inf*std::sin(flow.alpha)
-    });
+VLM::VLM(const std::string& backend_name, const std::string& mesh_name) : Simulation(backend_name, mesh_name), 
+    lhs(backend->allocator), rhs(backend->allocator), gamma(backend->allocator), gamma_prev(backend->allocator), delta_gamma(backend->allocator), local_velocities(backend->allocator) {
+    mesh.alloc_wake(1); // single wake panel
+    mesh.verts_wing_init.to_device();
+    alloc_buffers();
+};
 
-    backend->reset();
-    //backend->update_wake(flow.freestream);
-    backend->mesh_move(init_pos);
-    backend->mesh_metrics(flow.alpha);
-    backend->lhs_assemble();
-    backend->set_velocities(flow.freestream);
-    backend->compute_rhs();
-    backend->lu_factor();
-    backend->lu_solve();
-    backend->compute_delta_gamma();
-    return AeroCoefficients{
-        backend->compute_coefficient_cl(flow),
-        backend->compute_coefficient_cd(flow),
-        backend->compute_coefficient_cm(flow)
-    };
+void VLM::alloc_buffers() {
+    lhs.alloc(mesh.nb_panels_wing() * mesh.nb_panels_wing());
+    rhs.alloc(mesh.nb_panels_wing());
+    gamma.alloc(mesh.nb_panels_total());
+    gamma_prev.alloc(mesh.nb_panels_wing());
+    delta_gamma.alloc(mesh.nb_panels_wing());
+    local_velocities.alloc(mesh.nb_panels_wing() * 3);
 }
 
-void strip_alpha_to_vel(const FlowData& flow, Mesh2* mesh, f32* local_velocities, f32* strip_alphas) {
-    const u64 nb_panels_wing = mesh_nb_panels_wing(mesh);
-    f32* lvx = local_velocities + 0 * nb_panels_wing;
-    f32* lvy = local_velocities + 1 * nb_panels_wing;
-    f32* lvz = local_velocities + 2 * nb_panels_wing;
-    for (u64 j = 0; j < mesh->ns; j++) {
-        auto fs = compute_freestream(flow.u_inf, strip_alphas[j], 0.0f);
-        lvx[j] = fs.x;
-        lvy[j] = fs.y;
-        lvz[j] = fs.z;
-    }
-    for (u64 i = 1; i < mesh->nc; i++) {
-        std::memcpy(lvx + i * mesh->ns, lvx, mesh->ns * sizeof(*lvx));
-        std::memcpy(lvy + i * mesh->ns, lvy, mesh->ns * sizeof(*lvy));
-        std::memcpy(lvz + i * mesh->ns, lvz, mesh->ns * sizeof(*lvz));
-    }
-}
+AeroCoefficients VLM::run(const FlowData& flow) {
+    // Reset buffer state
+    backend->allocator.d_memset(lhs.d_ptr(), 0, lhs.size());
+    backend->allocator.dd_memcpy(mesh.verts_wing.d_ptr(), mesh.verts_wing_init.d_ptr(), mesh.verts_wing.size());
+    mesh.nwa = 0;
 
-AeroCoefficients NonLinearVLM::solve(const FlowData& flow, const Database& db) {
-    f64 err = 1.0f; // l1 error
     auto init_pos = translation_matrix<f32>({
         -100.0f * flow.u_inf*std::cos(flow.alpha),
         0.0f,
         -100.0f * flow.u_inf*std::sin(flow.alpha)
     });
 
-    std::fill(strip_alphas, strip_alphas + backend->hd_mesh->ns, flow.alpha); // memset
-
-    backend->reset();
-    backend->mesh_move(init_pos);
-    backend->mesh_metrics(flow.alpha);
-    backend->lhs_assemble();
-    backend->lu_factor();
-
-    for (u64 iter = 0; iter < max_iter && err > tol; iter++) {
-        err = 0.0; // reset l1 error
-        strip_alpha_to_vel(flow, backend->hd_mesh, velocities, strip_alphas); // Compute local panel velocities based on strip alphas
-        backend->allocator.hd_memcpy(backend->dd_data->local_velocities, velocities, 3 * mesh_nb_panels_wing(backend->hd_mesh) * sizeof(*velocities));
-        backend->compute_rhs(); // Compute RHS using strip alphas (on CPU)
-        backend->lu_solve(); // Copy RHS on device and solve for gamma
-        backend->compute_delta_gamma(); // Compute the chordwise delta gammas for force computation
+    backend->mesh_move(mesh, init_pos);
+}
+
+// AeroCoefficients LinearVLM::solve(const FlowData& flow) {
+//     auto init_pos = translation_matrix<f32>({
+//         -100.0f * flow.u_inf*std::cos(flow.alpha),
+//         0.0f,
+//         -100.0f * flow.u_inf*std::sin(flow.alpha)
+//     });
+
+//     backend->reset();
+//     //backend->update_wake(flow.freestream);
+//     backend->mesh_move(init_pos);
+//     backend->mesh_metrics(flow.alpha);
+//     backend->lhs_assemble();
+//     backend->set_velocities(flow.freestream);
+//     backend->compute_rhs();
+//     backend->lu_factor();
+//     backend->lu_solve();
+//     backend->compute_delta_gamma();
+//     return AeroCoefficients{
+//         backend->compute_coefficient_cl(flow),
+//         backend->compute_coefficient_cd(flow),
+//         backend->compute_coefficient_cm(flow)
+//     };
+// }
+
+// void strip_alpha_to_vel(const FlowData& flow, Mesh* mesh, f32* local_velocities, f32* strip_alphas) {
+//     const u64 nb_panels_wing = mesh_nb_panels_wing(mesh);
+//     f32* lvx = local_velocities + 0 * nb_panels_wing;
+//     f32* lvy = local_velocities + 1 * nb_panels_wing;
+//     f32* lvz = local_velocities + 2 * nb_panels_wing;
+//     for (u64 j = 0; j < mesh->ns; j++) {
+//         auto fs = compute_freestream(flow.u_inf, strip_alphas[j], 0.0f);
+//         lvx[j] = fs.x;
+//         lvy[j] = fs.y;
+//         lvz[j] = fs.z;
+//     }
+//     for (u64 i = 1; i < mesh->nc; i++) {
+//         std::memcpy(lvx + i * mesh->ns, lvx, mesh->ns * sizeof(*lvx));
+//         std::memcpy(lvy + i * mesh->ns, lvy, mesh->ns * sizeof(*lvy));
+//         std::memcpy(lvz + i * mesh->ns, lvz, mesh->ns * sizeof(*lvz));
+//     }
+// }
+
+// AeroCoefficients NonLinearVLM::solve(const FlowData& flow, const Database& db) {
+//     f64 err = 1.0f; // l1 error
+//     auto init_pos = translation_matrix<f32>({
+//         -100.0f * flow.u_inf*std::cos(flow.alpha),
+//         0.0f,
+//         -100.0f * flow.u_inf*std::sin(flow.alpha)
+//     });
+
+//     std::fill(strip_alphas, strip_alphas + backend->hd_mesh->ns, flow.alpha); // memset
+
+//     backend->reset();
+//     backend->mesh_move(init_pos);
+//     backend->mesh_metrics(flow.alpha);
+//     backend->lhs_assemble();
+//     backend->lu_factor();
+
+//     for (u64 iter = 0; iter < max_iter && err > tol; iter++) {
+//         err = 0.0; // reset l1 error
+//         strip_alpha_to_vel(flow, backend->hd_mesh, velocities, strip_alphas); // Compute local panel velocities based on strip alphas
+//         backend->allocator.hd_memcpy(backend->dd_data->local_velocities, velocities, 3 * mesh_nb_panels_wing(backend->hd_mesh) * sizeof(*velocities));
+//         backend->compute_rhs(); // Compute RHS using strip alphas (on CPU)
+//         backend->lu_solve(); // Copy RHS on device and solve for gamma
+//         backend->compute_delta_gamma(); // Compute the chordwise delta gammas for force computation
         
-        // Parallel Reduce
-        // loop over the chordwise strips and apply Van Dam algorithm
-        for (u64 j = 0; j < backend->hd_mesh->ns; j++) {
-            const f32 strip_area = backend->mesh_area(0, j, backend->hd_mesh->nc, 1);
-            const FlowData strip_flow = {strip_alphas[j], flow.beta, flow.u_inf, flow.rho};
-            const f32 strip_cl = backend->compute_coefficient_cl(strip_flow, strip_area, j, 1);
-            const f32 effective_aoa = strip_cl / (2.f*PI_f) - strip_flow.alpha + flow.alpha;
-
-            // TODO: interpolated value should be computed at the y mid point of the strip
-            const f32 correction = (db.interpolate_CL(effective_aoa, 0.f) - strip_cl) / (2.f*PI_f);
-            // std::printf(">>> Strip: %d | CL: %.3f | Interpolated: %.3f | Correction: %.3e\n", j, strip_cl, db.interpolate_CL(effective_aoa, 0.f), correction);
-            strip_alphas[j] += correction;
-            err += (f64)std::abs(correction);
-        }
-        err /= (f64)backend->hd_mesh->ns; // normalize l1 error
-        //std::printf(">>> Iter: %d | Error: %.3e \n", iter, err);
-    }
-    return AeroCoefficients{
-        backend->compute_coefficient_cl(flow),
-        backend->compute_coefficient_cd(flow),
-        backend->compute_coefficient_cm(flow)
-    };
-}
\ No newline at end of file
+//         // Parallel Reduce
+//         // loop over the chordwise strips and apply Van Dam algorithm
+//         for (u64 j = 0; j < backend->hd_mesh->ns; j++) {
+//             const f32 strip_area = backend->mesh_area(0, j, backend->hd_mesh->nc, 1);
+//             const FlowData strip_flow = {strip_alphas[j], flow.beta, flow.u_inf, flow.rho};
+//             const f32 strip_cl = backend->compute_coefficient_cl(strip_flow, strip_area, j, 1);
+//             const f32 effective_aoa = strip_cl / (2.f*PI_f) - strip_flow.alpha + flow.alpha;
+
+//             // TODO: interpolated value should be computed at the y mid point of the strip
+//             const f32 correction = (db.interpolate_CL(effective_aoa, 0.f) - strip_cl) / (2.f*PI_f);
+//             // std::printf(">>> Strip: %d | CL: %.3f | Interpolated: %.3f | Correction: %.3e\n", j, strip_cl, db.interpolate_CL(effective_aoa, 0.f), correction);
+//             strip_alphas[j] += correction;
+//             err += (f64)std::abs(correction);
+//         }
+//         err /= (f64)backend->hd_mesh->ns; // normalize l1 error
+//         //std::printf(">>> Iter: %d | Error: %.3e \n", iter, err);
+//     }
+//     return AeroCoefficients{
+//         backend->compute_coefficient_cl(flow),
+//         backend->compute_coefficient_cd(flow),
+//         backend->compute_coefficient_cm(flow)
+//     };
+// }
\ No newline at end of file
diff --git a/vlm/src/vlm_backend.cpp b/vlm/src/vlm_backend.cpp
index cb8fe88..00dc14b 100644
--- a/vlm/src/vlm_backend.cpp
+++ b/vlm/src/vlm_backend.cpp
@@ -13,32 +13,18 @@
 
 using namespace vlm;
 
-// Here we assume constant shape wing since mesh_sref
-// TODO: remove this limitation
-f32 Backend::compute_coefficient_cl(const FlowData& flow) {
-    return compute_coefficient_cl(flow, mesh_area(0,0,hd_mesh->nc, hd_mesh->ns), 0, hd_mesh->ns);
-}
-
-linalg::alias::float3 Backend::compute_coefficient_cm(const FlowData& flow) {
-    return compute_coefficient_cm(flow, mesh_area(0,0,hd_mesh->nc, hd_mesh->ns), mesh_mac(0,hd_mesh->ns), 0, hd_mesh->ns);
-}
-
-f32 Backend::compute_coefficient_cd(const FlowData& flow) {
-    return compute_coefficient_cd(flow, mesh_area(0,0,hd_mesh->nc, hd_mesh->ns), 0, hd_mesh->ns);
-}
-
-std::unique_ptr<Backend> vlm::create_backend(const std::string& backend_name, MeshGeom* mesh, int timesteps) {
+std::unique_ptr<Backend> vlm::create_backend(const std::string& backend_name) {
     //tiny::CPUID cpuid;
     //cpuid.print_info();
 
     #ifdef VLM_CPU
     if (backend_name == "cpu") {
-        return std::make_unique<BackendCPU>(mesh, timesteps);
+        return std::make_unique<BackendCPU>();
     }
     #endif
     #ifdef VLM_CUDA
     if (backend_name == "cuda") {
-        return std::make_unique<BackendCUDA>(mesh, timesteps);
+        return std::make_unique<BackendCUDA>();
     }
     #endif
     throw std::runtime_error("Unsupported backend: " + backend_name);
@@ -55,54 +41,9 @@ std::vector<std::string> vlm::get_available_backends() {
     return backends;
 }
 
-void Backend::init(MeshGeom* mesh_geom, u64 timesteps) {
-    hh_mesh_geom = mesh_geom; // Borrowed ptr
-    const u64 nb_vertices_wing = (hh_mesh_geom->nc+1)*(hh_mesh_geom->ns+1);
-
-    // Mesh structs allocation
-    hd_mesh_geom = (MeshGeom*)allocator.h_malloc(sizeof(MeshGeom));
-    dd_mesh_geom = (MeshGeom*)allocator.d_malloc(sizeof(MeshGeom));
-    // hh_mesh = (Mesh2*)allocator.h_malloc(sizeof(Mesh2));
-    hd_mesh = (Mesh2*)allocator.h_malloc(sizeof(Mesh2));
-    dd_mesh = (Mesh2*)allocator.d_malloc(sizeof(Mesh2));
-    hd_data = (Data*)allocator.h_malloc(sizeof(Data));
-    dd_data = (Data*)allocator.d_malloc(sizeof(Data));
-
-    // Allocate mesh buffers on device and host
-    hd_mesh_geom->vertices = (f32*) allocator.d_malloc(nb_vertices_wing*3*sizeof(f32));
-    // mesh_alloc(allocator.h_malloc, hh_mesh, hh_mesh_geom->nc, hh_mesh_geom->ns, timesteps, 0);
-    mesh_alloc(allocator.d_malloc, hd_mesh, hh_mesh_geom->nc, hh_mesh_geom->ns, timesteps, 0); 
-    data_alloc(allocator.d_malloc, hd_data, hh_mesh_geom->nc, hh_mesh_geom->ns, timesteps);
-    
-    // Copy indices
-    hd_mesh_geom->nc = hh_mesh_geom->nc;
-    hd_mesh_geom->ns = hh_mesh_geom->ns;
-
-    // Copy raw vertex geometry directly to device
-    allocator.hd_memcpy(hd_mesh_geom->vertices, hh_mesh_geom->vertices, nb_vertices_wing*3*sizeof(f32));
-
-    // Copy host-device mesh ptr to device-device
-    allocator.hd_memcpy(dd_mesh_geom, hd_mesh_geom, sizeof(*hd_mesh_geom));
-    allocator.hd_memcpy(dd_mesh, hd_mesh, sizeof(*hd_mesh));
-    allocator.hd_memcpy(dd_data, hd_data, sizeof(*hd_data));
-}
-
 Backend::~Backend() {
     // Free device-device ptrs
-    allocator.d_free(dd_data);
-    allocator.d_free(dd_mesh);
-    allocator.d_free(dd_mesh_geom);
     allocator.d_free(d_solver_info);
     allocator.d_free(d_solver_ipiv);
     allocator.d_free(d_solver_buffer);
-
-    // Free device buffers store in host-device ptrs
-    allocator.d_free(hd_mesh_geom->vertices);
-    mesh_free(allocator.d_free, hd_mesh);
-    data_free(allocator.d_free, hd_data);
-
-    // Free host-device ptrs
-    allocator.h_free(hd_data);
-    allocator.h_free(hd_mesh);
-    allocator.h_free(hd_mesh_geom);
 }
\ No newline at end of file
diff --git a/vlm/src/vlm_mesh.cpp b/vlm/src/vlm_mesh.cpp
index 84360d2..cc761c6 100644
--- a/vlm/src/vlm_mesh.cpp
+++ b/vlm/src/vlm_mesh.cpp
@@ -12,117 +12,10 @@
 
 using namespace vlm;
 
-void vlm::mesh_alloc(const malloc_f malloc, Mesh2* mesh, u64 nc, u64 ns, u64 nw, u64 nwa) {
-    mesh->vertices = (f32*)malloc((nc+nw+1)*(ns+1)*3*sizeof(f32));
-    mesh->normals = (f32*)malloc(nc*ns*3*sizeof(f32));
-    mesh->colloc = (f32*)malloc(nc*ns*3*sizeof(f32));
-    mesh->area = (f32*)malloc(nc*ns*sizeof(f32));
-    mesh->nc = nc;
-    mesh->ns = ns;
-    mesh->nw = nw;
-    mesh->nwa = nwa;
-}
-
-void vlm::mesh_copy(const memcpy_f memcpy, Mesh2* dst, const Mesh2* src) {
-    memcpy(dst->vertices, src->vertices, (dst->nc+dst->nw+1)*(dst->ns+1)*3*sizeof(f32));
-    memcpy(dst->normals, src->normals, dst->nc*dst->ns*3*sizeof(f32));
-    memcpy(dst->colloc, src->colloc, dst->nc*dst->ns*3*sizeof(f32));
-    memcpy(dst->area, src->area, dst->nc*dst->ns*sizeof(f32));
-}
-
-void vlm::mesh_free(const free_f free, Mesh2* mesh) {
-    free(mesh->vertices);
-    free(mesh->normals);
-    free(mesh->colloc);
-    free(mesh->area);
-}
-
-// Reads the Plot3D Structured file, allocates vertex buffer and fills it with the file data
-void mesh_io_read_file_plot3d(std::ifstream& f, MeshGeom* mesh_geom) {
-    std::cout << "reading plot3d mesh" << std::endl;
-    u64 ni, nj, nk, blocks;
-    f32 x, y, z;
-    f >> blocks;
-    if (blocks != 1) {
-        throw std::runtime_error("Only single block plot3d mesh is supported");
-    }
-    f >> ni >> nj >> nk;
-    if (nk != 1) {
-        throw std::runtime_error("Only 2D plot3d mesh is supported");
-    }
-
-    mesh_geom->ns = nj - 1;
-    mesh_geom->nc = ni - 1;
-    mesh_geom->vertices = new f32[ni*nj*3];
-    const u64 nb_panels = mesh_geom->ns * mesh_geom->nc;
-    const u64 nb_vertices = ni * nj;
-
-    std::cout << "number of panels: " << nb_panels << std::endl;
-    std::cout << "ns: " << mesh_geom->ns << std::endl;
-    std::cout << "nc: " << mesh_geom->nc << std::endl;
-    
-    for (u64 j = 0; j < nj; j++) {
-        for (u64 i = 0; i < ni; i++) {
-            f >> x;
-            mesh_geom->vertices[nb_vertices * 0 + nj*i + j] = x;
-        }
-    }
-    for (u64 j = 0; j < nj; j++) {
-        for (u64 i = 0; i < ni; i++) {
-            f >> y;
-            mesh_geom->vertices[nb_vertices * 1 + nj*i + j] = y;
-        }
-    }
-    for (u64 j = 0; j < nj; j++) {
-        for (u64 i = 0; i < ni; i++) {
-            f >> z;
-            mesh_geom->vertices[nb_vertices * 2 + nj*i + j] = z;
-        }
-    }
-
-    // TODO: this validation is not robust enough when eventually we will be using multiple bodies
-    // Validation
-    // const f32 eps = std::numeric_limits<f32>::epsilon();
-    // if (std::abs(m.v.x[0]) != eps || std::abs(m.v.y[0]) != eps) {
-    //     throw std::runtime_error("First vertex of plot3d mesh must be at origin");
-    // }
-    const f32 first_y = mesh_geom->vertices[nb_vertices * 1 + 0];
-    for (u64 i = 1; i < ni; i++) {
-        if ( mesh_geom->vertices[nb_vertices * 1 + i*nj] != first_y) {
-            throw std::runtime_error("Mesh vertices should be ordered in chordwise direction");
-        }
-    }
-}
-
-void vlm::mesh_io_read_file(const std::string& filename, MeshGeom* mesh_geom) {
-    const std::filesystem::path path(filename);
-    if (!std::filesystem::exists(path)) {
-        throw std::runtime_error("Mesh file not found"); // TODO: consider not using exceptions anymore
-    }
-
-    std::ifstream f(path);
-    try {
-        if (f.is_open()) {
-            if (path.extension() == ".x") {
-                mesh_io_read_file_plot3d(f, mesh_geom);
-            } else {
-                throw std::runtime_error("Only structured gridpro mesh format is supported");
-            }
-            f.close();
-        } else {
-            throw std::runtime_error("Failed to open mesh file");
-        }
-    } catch (const std::exception& e) {
-        std::cerr << e.what() << std::endl;
-    }
-}
-
-void vlm::mesh_quarterchord(MeshGeom* mesh_geom) {
-    const u64 nc = mesh_geom->nc;
-    const u64 ns = mesh_geom->ns;
-    f32* vx = mesh_geom->vertices + 0 * (nc+1) * (ns+1);
-    f32* vy = mesh_geom->vertices + 1 * (nc+1) * (ns+1);
-    f32* vz = mesh_geom->vertices + 2 * (nc+1) * (ns+1);
+void mesh_quarterchord(f32* vertices, u64 nc, u64 ns) {
+    f32* vx = vertices + 0 * (nc+1) * (ns+1);
+    f32* vy = vertices + 1 * (nc+1) * (ns+1);
+    f32* vz = vertices + 2 * (nc+1) * (ns+1);
 
     for (u64 i = 0; i < nc; i++) {
         for (u64 j = 0; j < ns+1; j++) {
@@ -147,277 +40,106 @@ void vlm::mesh_quarterchord(MeshGeom* mesh_geom) {
     }
 }
 
-// void Mesh::init() {
-//     create_vortex_panels();
-//     if (c_ref == 0.0f) c_ref = chord_mean(0, ns+1); // use MAC as default
-//     if (s_ref == 0.0f) s_ref = panels_area_xy(0,0, nc, ns); // use projected area as default
-//     compute_connectivity();
-//     compute_metrics_wing();
-// }
-
-// void Mesh::alloc() {
-//     const u64 ncw = nc + nw;
-//     v.resize((ncw + 1) * (ns + 1));
-//     offsets.resize(nc * ns + 1);
-//     connectivity.resize(4 * nc * ns);
-
-//     colloc.resize(ncw * ns);
-//     normal.resize(ncw * ns);
-//     area.resize(ncw * ns);
-// }
-
-// u64 Mesh::nb_panels_wing() const { return nc * ns; };
-// u64 Mesh::nb_panels_total() const { return (nc+nw) * ns; };
-// u64 Mesh::nb_vertices_wing() const { return (nc + 1) * (ns + 1); };
-// u64 Mesh::nb_vertices_total() const { return (nc + nw + 1) * (ns + 1); };
-
-// /// @brief Computes the mean chord of a set of panels
-// /// @details
-// /// Mean Aerodynamic Chord = \frac{2}{S} \int_{0}^{b/2} c(y)^{2} dy
-// /// Integration using the Trapezoidal Rule
-// /// Validated against empirical formulas for tapered wings
-// /// @param j first panel index chordwise
-// /// @param n number of panels spanwise
-// /// @return mean chord of the set of panels
-// f32 Mesh::chord_mean(const u64 j, const u64 n) const {
-//     assert(j + n <= ns+1); // spanwise range
-//     assert(n > 1); // minimum 2 chords
-//     f32 mac = 0.0f;
-//     // loop over panel chordwise sections in spanwise direction
-//     // Note: can be done optimally with vertical fused simd
-//     for (u64 v = 0; v < n - 1; v++) {
-//         const f32 c0 = chord_length(j + v);
-//         const f32 c1 = chord_length(j + v + 1);
-//         mac += 0.5f * (c0 * c0 + c1 * c1) * panel_width_y(0, j + v);
-//     }
-//     // Since we divide by half the total wing area (both sides) we dont need to multiply by 2
-//     return mac / panels_area_xy(0, j, nc, n-1);
-// }
-
-// /// @brief Computes the total area of a 2D set of panels
-// /// @param i first panel index chordwise
-// /// @param j first panel panel index spanwise
-// /// @param m number of panels chordwise
-// /// @param n number of panels spanwise
-// /// @return sum of the areas of the panels
-// f32 Mesh::panels_area(const u64 i, const u64 j, const u64 m, const u64 n) const {
-//     assert(i + m <= nc);
-//     assert(j + n <= ns);
-    
-//     const u64 ld = ns;
-//     const f32* areas = &area[j + i * ld];
-
-//     f32 total_area = 0.0f;
-//     for (u64 u = 0; u < m; u++) {
-//         for (u64 v = 0; v < n; v++) {
-//             total_area += areas[v + u * ld];
-//         }
-//     }
-//     return total_area;
-// }
-
-// /// @brief Computes the total area of a 2D set of panels projected on the xy plane
-// /// @param i first panel index chordwise
-// /// @param j first panel panel index spanwise
-// /// @param m number of panels chordwise
-// /// @param n number of panels spanwise
-// /// @return sum of the areas of the panels projected on the xy plane
-// f32 Mesh::panels_area_xy(const u64 i, const u64 j, const u64 m, const u64 n) const {
-//     assert(i + m <= nc);
-//     assert(j + n <= ns);
-    
-//     // Area of a quad:
-//     // 0.5 * || d1 x d2 || where d1 and d2 are the diagonals of the quad
-//     f32 total_area = 0.0f;
-//     // Note: this is highly inefficient as vertex coordinates should be loaded with simd
-//     for (u64 u = 0; u < m; u++) {
-//         for (u64 v = 0; v < n; v++) {
-//             const u64 idx = j + v + (i+u) * ns;
-//             const linalg::alias::float3 d1 = get_v0(idx) - get_v2(idx);
-//             const linalg::alias::float3 d2 = get_v1(idx) - get_v3(idx);
-//             const linalg::alias::float3 cross = linalg::cross(d1, d2);
-            
-//             total_area += 0.5f * std::abs(cross.z);
-//             // std::cout << "area xy: " << 0.5f * std::abs(cross.z()) << "\n";
-//             // std::cout << "area: " << area[idx] << "\n";
-//         }
-//     }
-//     return total_area;
-// }
-
-// f32 Mesh::panel_length(const u64 i, const u64 j) const {
-//     // Since chordwise segments are always parallel, we can simply measure the width of the first panel
-//     assert(i < nc);
-//     assert(j < ns);
-//     const linalg::alias::float3 v0 = get_v0(j + i * ns);
-//     const linalg::alias::float3 v1 = get_v1(j + i * ns);
-//     const linalg::alias::float3 v2 = get_v2(j + i * ns);
-//     const linalg::alias::float3 v3 = get_v3(j + i * ns);
-//     return 0.5f * (linalg::length(v3-v0) + linalg::length(v2-v1));
-// }
-
-// /// @brief Computes the width of a single panel in pure y direction
-// /// @param i panel index chordwise
-// /// @param j panel index spanwise
-// f32 Mesh::panel_width_y(const u64 i, const u64 j) const {
-//     assert(i < nc);
-//     assert(j < ns);
-//     const u64 ld = ns + 1;
-
-//     return v.y[j + 1 + i * ld] - v.y[j + i * ld];
-// }
-
-// f32 Mesh::strip_width(const u64 j) const {
-//     assert(j < ns);
-//     const linalg::alias::float3 v0 = get_v0(j);
-//     const linalg::alias::float3 v1 = get_v1(j);
-//     return linalg::length(v1 - v0);
-// }
-
-// /// @brief Computes the chord length of a chordwise segment
-// /// @details Since the mesh follows the camber line, the chord length is computed
-// /// as the distance between the first and last vertex of a chordwise segment
-// /// @param j chordwise segment index
-// f32 Mesh::chord_length(const u64 j) const {
-//     assert(j <= ns); // spanwise vertex range
-//     const f32 dx = v.x[j + nc * (ns+1)] - v.x[j];
-//     const f32 dy = 0.0f; // chordwise segments are parallel to the x axis
-//     const f32 dz = v.z[j + nc * (ns+1)] - v.z[j];
-
-//     return std::sqrt(dx * dx + dy * dy + dz * dz);
-// }
-
-// linalg::alias::float3 Mesh::get_v0(u64 i) const {
-//     const u64 idx = i + i / ns;
-//     return {v.x[idx], v.y[idx], v.z[idx]};
-// }
-
-// linalg::alias::float3 Mesh::get_v1(u64 i) const {
-//     const u64 idx = i + i / ns + 1;
-//     return {v.x[idx], v.y[idx], v.z[idx]};
-// }
-
-// linalg::alias::float3 Mesh::get_v2(u64 i) const {
-//     const u64 idx = i + i / ns + ns + 2;
-//     return {v.x[idx], v.y[idx], v.z[idx]};
-// }
-
-// linalg::alias::float3 Mesh::get_v3(u64 i) const {
-//     const u64 idx = i + i / ns + ns + 1;
-//     return {v.x[idx], v.y[idx], v.z[idx]};
-// }
-
-// void Mesh::update_wake(const linalg::alias::float3& freestream) {
-//     const f32 chord_root = chord_length(0); 
-//     const f32 off_x = freestream.x * 100.0f * chord_root;
-//     const f32 off_y = freestream.y * 100.0f * chord_root;
-//     const f32 off_z = freestream.z * 100.0f * chord_root;
-
-//     const u64 v_ns = ns + 1;
-//     const u64 begin_trailing_edge = nb_vertices_wing()-v_ns;
-//     const u64 end_trailing_edge = nb_vertices_wing();
-//     // Add one layer of wake vertices
-//     // this can be parallelized (careful to false sharing tho)
-//     for (u64 i = begin_trailing_edge; i < end_trailing_edge; ++i) {
-//         v.x[i + v_ns] = v.x[i] + off_x;
-//         v.y[i + v_ns] = v.y[i] + off_y;
-//         v.z[i + v_ns] = v.z[i] + off_z;
-//     }
-//     compute_metrics_wake();
-//     current_nw = 1;
-// }
-
-// // https://publications.polymtl.ca/2555/1/2017_MatthieuParenteau.pdf (Eq 3.4 p21)
-// void Mesh::correction_high_aoa(f32 alpha_rad) {
-//     const f32 factor = 0.5f * alpha_rad / (std::sin(alpha_rad) + EPS); // correction factor
-//     // Note: this can be vectorized and parallelized
-//     for (u64 i = 0; i < nb_panels_total(); i++) {
-//         // "chord vector" from center of leading line (v0-v1) to trailing line (v3-v2)
-//         const linalg::alias::float3 chord_vec = 0.5f * (get_v2(i) + get_v3(i) - get_v0(i) - get_v1(i));
-//         const linalg::alias::float3 colloc_pt = 0.5f * (get_v0(i) + get_v1(i)) + factor * chord_vec;
-//         // collocation calculated as a translation of the center of leading line center
-//         colloc.x[i] = colloc_pt.x;
-//         colloc.y[i] = colloc_pt.y;
-//         colloc.z[i] = colloc_pt.z;
-//     }
-// }
-
-// void Mesh::compute_connectivity() {
-//     // indices of the points forming the quad
-//     //             span(y) ->
-//     //  chord(x)     0--1
-//     //     |         |  |
-//     //    \/         3--2
-//     std::array<u64, 4> quad = {};
-
-//     // Note: with some modifications this loop can be parallelized
-//     for (u64 i = 0; i < nb_panels_wing(); i++) {
-//         // extra offset occuring from the fact that there is 1 more vertex
-//         // per row than surfaces
-//         u64 chord_idx = i / nc;
-//         quad[0] = i + chord_idx;
-//         quad[1] = quad[0] + 1;
-//         quad[2] = quad[1] + ns;
-//         quad[3] = quad[0] + ns;
-//         offsets[i+1] = offsets[i] + 4;
-//         connectivity.insert(connectivity.end(), quad.begin(), quad.end());
-//     }
-// }
-
-// void Mesh::compute_metrics_i(u64 i ) {
-//     const linalg::alias::float3 v0 = get_v0(i);
-//     const linalg::alias::float3 v1 = get_v1(i);
-//     const linalg::alias::float3 v2 = get_v2(i);
-//     const linalg::alias::float3 v3 = get_v3(i);
+class Plot3DFile : public MeshFile {
+public:
+    SurfDims get_dims(std::ifstream& f) const override {
+        u64 ni, nj, nk, blocks;
+        f >> blocks;
+        if (blocks != 1) {
+            throw std::runtime_error("Only single block plot3d mesh is supported");
+        }
+        f >> ni >> nj >> nk;
+        if (nk != 1) {
+            throw std::runtime_error("Only 2D plot3d mesh is supported");
+        }
+        return {ni-1, nj-1};
+    }
 
-//     // Vector v0 from p1 to p3
-//     const linalg::alias::float3 vec_0 = v3 - v1;
+    SurfDims read(std::ifstream& f, f32* vertices) const override {
+        u64 ni, nj, nk, blocks;
+        f32 x, y, z;
+        f >> blocks;
+        f >> ni >> nj >> nk;
+        assert(mesh->ns == nj - 1);
+        assert(mesh->nc == ni - 1);
+        const u64 nb_vertices = ni * nj;
+        
+        for (u64 j = 0; j < nj; j++) {
+            for (u64 i = 0; i < ni; i++) {
+                f >> x;
+                vertices[nb_vertices * 0 + nj*i + j] = x;
+            }
+        }
+        for (u64 j = 0; j < nj; j++) {
+            for (u64 i = 0; i < ni; i++) {
+                f >> y;
+                vertices[nb_vertices * 1 + nj*i + j] = y;
+            }
+        }
+        for (u64 j = 0; j < nj; j++) {
+            for (u64 i = 0; i < ni; i++) {
+                f >> z;
+                vertices[nb_vertices * 2 + nj*i + j] = z;
+            }
+        }
 
-//     // Vector v1 from p0 to p2
-//     const linalg::alias::float3 vec_1 = v2 - v0;
+        // TODO: this validation is not robust enough when eventually we will be using multiple bodies
+        // Validation
+        // const f32 eps = std::numeric_limits<f32>::epsilon();
+        // if (std::abs(m.v.x[0]) != eps || std::abs(m.v.y[0]) != eps) {
+        //     throw std::runtime_error("First vertex of plot3d mesh must be at origin");
+        // }
+        const f32 first_y =vertices[nb_vertices * 1 + 0];
+        for (u64 i = 1; i < ni; i++) {
+            if ( vertices[nb_vertices * 1 + i*nj] != first_y) {
+                throw std::runtime_error("Mesh vertices should be ordered in chordwise direction");
+            }
+        }
+        return {ni-1, nj-1};
+    }
+};
 
-//     // Normal = v0 x v1
-//     const linalg::alias::float3 normal_vec = linalg::normalize(linalg::cross(vec_0, vec_1));
-//     normal.x[i] = normal_vec.x;
-//     normal.y[i] = normal_vec.y;
-//     normal.z[i] = normal_vec.z;
+class MeshIO {
+public:
+    MeshIO(const std::string& format) {
+        if (format == "plot3d") {
+            _file = std::make_unique<Plot3DFile>();
+        } else {
+            throw std::runtime_error("Unsupported file format");
+        }
+    }
 
-//     // 3 vectors f (P0P3), b (P0P2), e (P0P1) to compute the area:
-//     // area = 0.5 * (||f x b|| + ||b x e||)
-//     // this formula might also work: area = || 0.5 * ( f x b + b x e ) ||
-//     const linalg::alias::float3 vec_f = v3 - v0;
-//     const linalg::alias::float3 vec_b = v2 - v0;
-//     const linalg::alias::float3 vec_e = v1 - v0;
+    SurfDims get_dims(const std::string& filename) const {
+        std::ifstream file(filename);
+        if (!file.is_open()) {
+            throw std::runtime_error("Failed to open mesh file");
+        }
+        auto dims = _file->get_dims(file);
+        std::cout << "Number of panels: " << dims.first * dims.second << "\n";
+        std::cout << "nc: " << dims.first << "\n";
+        std::cout << "ns: " << dims.second << "\n";
+        return dims;
+    }
 
-//     area[i] = 0.5f * (linalg::length(linalg::cross(vec_f, vec_b)) + linalg::length(linalg::cross(vec_b, vec_e)));
-    
-//     // collocation point (center of the quad)
-//     const linalg::alias::float3 colloc_pt = 0.25f * (v0 + v1 + v2 + v3);
-    
-//     colloc.x[i] = colloc_pt.x;
-//     colloc.y[i] = colloc_pt.y;
-//     colloc.z[i] = colloc_pt.z;
-// }
+    void read(const std::string& filename, f32* vertices) const {
+        std::ifstream file(filename);
+        if (!file.is_open()) {
+            throw std::runtime_error("Failed to open mesh file");
+        }
+        auto [nc, ns] = _file->read(file, vertices);
+        mesh_quarterchord(vertices, nc, ns);
+    }
 
-// void Mesh::compute_metrics_wing() {
-//     for (u64 i = 0; i < nb_panels_wing(); i++) {
-//         compute_metrics_i(i);
-//     }
-// }
+private:
+    std::unique_ptr<MeshFile> _file;
+};
 
-// void Mesh::compute_metrics_wake() {
-//     for (u64 i = nb_panels_wing(); i < nb_panels_total(); i++) {
-//         compute_metrics_i(i);
-//     }
-// }
+// // TODO: DEPRECATE ALL BELOW
 
-// // plot3d is chordwise major
-// void Mesh::io_read_plot3d_structured(std::ifstream& f) {
+// // Reads the Plot3D Structured file, allocates vertex buffer and fills it with the file data
+// void mesh_io_read_file_plot3d(std::ifstream& f, MeshGeom* mesh_geom) {
 //     std::cout << "reading plot3d mesh" << std::endl;
-//     u64 ni = 0; // number of vertices chordwise
-//     u64 nj = 0; // number of vertices spanwise
-//     u64 nk = 0;
-//     u64 blocks = 0;
+//     u64 ni, nj, nk, blocks;
 //     f32 x, y, z;
 //     f >> blocks;
 //     if (blocks != 1) {
@@ -428,30 +150,32 @@ void vlm::mesh_quarterchord(MeshGeom* mesh_geom) {
 //         throw std::runtime_error("Only 2D plot3d mesh is supported");
 //     }
 
-//     ns = nj - 1;
-//     nc = ni - 1;
-//     alloc();
+//     mesh_geom->ns = nj - 1;
+//     mesh_geom->nc = ni - 1;
+//     mesh_geom->vertices = new f32[ni*nj*3];
+//     const u64 nb_panels = mesh_geom->ns * mesh_geom->nc;
+//     const u64 nb_vertices = ni * nj;
 
-//     std::cout << "number of panels: " << nb_panels_wing() << std::endl;
-//     std::cout << "ns: " << ns << std::endl;
-//     std::cout << "nc: " << nc << std::endl;
+//     std::cout << "number of panels: " << nb_panels << std::endl;
+//     std::cout << "ns: " << mesh_geom->ns << std::endl;
+//     std::cout << "nc: " << mesh_geom->nc << std::endl;
     
 //     for (u64 j = 0; j < nj; j++) {
 //         for (u64 i = 0; i < ni; i++) {
 //             f >> x;
-//             v.x[nj*i + j] = x;
+//             mesh_geom->vertices[nb_vertices * 0 + nj*i + j] = x;
 //         }
 //     }
 //     for (u64 j = 0; j < nj; j++) {
 //         for (u64 i = 0; i < ni; i++) {
 //             f >> y;
-//             v.y[nj*i + j] = y;
+//             mesh_geom->vertices[nb_vertices * 1 + nj*i + j] = y;
 //         }
 //     }
 //     for (u64 j = 0; j < nj; j++) {
 //         for (u64 i = 0; i < ni; i++) {
 //             f >> z;
-//             v.z[nj*i + j] = z;
+//             mesh_geom->vertices[nb_vertices * 2 + nj*i + j] = z;
 //         }
 //     }
 
@@ -461,64 +185,63 @@ void vlm::mesh_quarterchord(MeshGeom* mesh_geom) {
 //     // if (std::abs(m.v.x[0]) != eps || std::abs(m.v.y[0]) != eps) {
 //     //     throw std::runtime_error("First vertex of plot3d mesh must be at origin");
 //     // }
-//     const f32 first_y = v.y[0];
+//     const f32 first_y = mesh_geom->vertices[nb_vertices * 1 + 0];
 //     for (u64 i = 1; i < ni; i++) {
-//         if (v.y[i * nj] != first_y) {
+//         if ( mesh_geom->vertices[nb_vertices * 1 + i*nj] != first_y) {
 //             throw std::runtime_error("Mesh vertices should be ordered in chordwise direction");
 //         }
 //     }
 // }
 
-// void Mesh::io_read(const std::string& filename) {
+// void vlm::mesh_io_read_file(const std::string& filename, MeshGeom* mesh_geom) {
 //     const std::filesystem::path path(filename);
 //     if (!std::filesystem::exists(path)) {
-//         throw std::runtime_error("Mesh file not found");
+//         throw std::runtime_error("Mesh file not found"); // TODO: consider not using exceptions anymore
 //     }
 
 //     std::ifstream f(path);
-//     if (f.is_open()) {
-//         if (path.extension() == ".x") {
-//             io_read_plot3d_structured(f);
+//     try {
+//         if (f.is_open()) {
+//             if (path.extension() == ".x") {
+//                 mesh_io_read_file_plot3d(f, mesh_geom);
+//             } else {
+//                 throw std::runtime_error("Only structured gridpro mesh format is supported");
+//             }
+//             f.close();
 //         } else {
-//             throw std::runtime_error("Only structured gridpro mesh format is supported");
+//             throw std::runtime_error("Failed to open mesh file");
 //         }
-//         f.close();
-//     } else {
-//         throw std::runtime_error("Failed to open mesh file");
+//     } catch (const std::exception& e) {
+//         std::cerr << e.what() << std::endl;
 //     }
 // }
 
-// void Mesh::move(const linalg::alias::float4x4& transform, const SoA_3D_t<f32>& origin_pos) {
-//     // const tiny::ScopedTimer t("Mesh::move");
-//     assert(current_nw < nw); // check if we have capacity
-    
-//     // Shed wake before moving    
-//     const u64 src_begin = nb_vertices_wing() - (ns + 1);
-//     const u64 src_end = nb_vertices_wing();
-//     const u64 dst_begin = (nc + nw - current_nw) * (ns + 1);
-//     std::copy(v.x.data() + src_begin, v.x.data() + src_end, v.x.data() + dst_begin);
-//     std::copy(v.y.data() + src_begin, v.y.data() + src_end, v.y.data() + dst_begin);
-//     std::copy(v.z.data() + src_begin, v.z.data() + src_end, v.z.data() + dst_begin);
-    
-//     // Perform the movement
-//     for (u64 i = 0; i < nb_vertices_wing(); i++) {
-//         const linalg::alias::float4 transformed_pt = linalg::mul(transform, linalg::alias::float4{origin_pos.x[i], origin_pos.y[i], origin_pos.z[i], 1.f});
-//         v.x[i] = transformed_pt.x;
-//         v.y[i] = transformed_pt.y;
-//         v.z[i] = transformed_pt.z;
-//     }
-
-//     compute_metrics_wing(); // compute the new wing panel metrics
+// void vlm::mesh_quarterchord(MeshGeom* mesh_geom) {
+//     const u64 nc = mesh_geom->nc;
+//     const u64 ns = mesh_geom->ns;
+//     f32* vx = mesh_geom->vertices + 0 * (nc+1) * (ns+1);
+//     f32* vy = mesh_geom->vertices + 1 * (nc+1) * (ns+1);
+//     f32* vz = mesh_geom->vertices + 2 * (nc+1) * (ns+1);
 
-//     // Copy new trailing edge vertices on the wake buffer
-//     std::copy(v.x.data() + src_begin, v.x.data() + src_end, v.x.data() + dst_begin - (ns + 1));
-//     std::copy(v.y.data() + src_begin, v.y.data() + src_end, v.y.data() + dst_begin - (ns + 1));
-//     std::copy(v.z.data() + src_begin, v.z.data() + src_end, v.z.data() + dst_begin - (ns + 1));
+//     for (u64 i = 0; i < nc; i++) {
+//         for (u64 j = 0; j < ns+1; j++) {
+//             const u64 v0 = (i+0) * (ns+1) + j;
+//             const u64 v3 = (i+1) * (ns+1) + j;
 
-//     current_nw++;
-// }
+//             vx[v0] = 0.75f * vx[v0] + 0.25f * vx[v3];
+//             vy[v0] = 0.75f * vy[v0] + 0.25f * vy[v3];
+//             vz[v0] = 0.75f * vz[v0] + 0.25f * vz[v3];
+//         }
+//     }
 
-// void Mesh::resize_wake(const u64 nw_) {
-//     nw = nw_;
-//     alloc(); // resizes the buffers
+//     // Trailing edge vertices
+//     const u64 i = nc-1;
+//     for (u64 j = 0; j < ns+1; j++) {
+//         const u64 v0 = (i+0) * (ns+1) + j;
+//         const u64 v3 = (i+1) * (ns+1) + j;
+        
+//         vx[v3] = (4.f/3.f) * vx[v3] - (1.f/3.f) * vx[v0];
+//         vy[v3] = (4.f/3.f) * vy[v3] - (1.f/3.f) * vy[v0];
+//         vz[v3] = (4.f/3.f) * vz[v3] - (1.f/3.f) * vz[v0];
+//     }
 // }