add ispc cd trefftz kernel

samayala22 · Feb 14, 2024 · 1f33f32 · 1f33f32
1 parent 655d7e7
commit 1f33f32
Show file tree

Hide file tree

Showing 3 changed files with 121 additions and 146 deletions.
diff --git a/headeronly/tinytimer.hpp b/headeronly/tinytimer.hpp
@@ -5,24 +5,24 @@
 
 namespace tiny {
 
-constexpr static long long us_in_s = 1'000'000LL;
-constexpr static long long us_in_ms = 1'000LL;
+constexpr static double us_in_s = 1'000'000;
+constexpr static double us_in_ms = 1'000;
 
 class ScopedTimer {
 public:
     ScopedTimer(const char* name) : m_name(name), m_start(std::chrono::high_resolution_clock::now()) {}
 
     ~ScopedTimer() {
         const auto m_end = std::chrono::high_resolution_clock::now();
-        const auto duration = static_cast<long long>(std::chrono::duration_cast<std::chrono::microseconds>(m_end - m_start).count());
+        const auto duration = static_cast<double>(std::chrono::duration_cast<std::chrono::microseconds>(m_end - m_start).count());
         std::printf("%s: ", m_name);
 
         if (duration > us_in_s) {
-            std::printf("%lld s\n", duration / us_in_s);
+            std::printf("%.2f s\n", duration / us_in_s);
         } else if (duration > us_in_ms) {
-            std::printf("%lld ms\n", duration / us_in_ms);
+            std::printf("%.0f ms\n", duration / us_in_ms);
         } else {
-            std::printf("%lld us\n", duration);
+            std::printf("%.0f us\n", duration);
         }
     }
 private:

diff --git a/vlm/backends/cpu/src/vlm_backend_cpu.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp
@@ -9,12 +9,9 @@
 #include "vlm_utils.hpp"
 #include "vlm_executor.hpp" // includes taskflow/taskflow.hpp
 
-#include <algorithm>
-#include <cstdio>
-#include <fstream>
-#include <immintrin.h>
+#include <algorithm> // std::fill
+#include <cstdio> // std::printf
 
-#include <limits>
 #include <taskflow/algorithm/for_each.hpp>
 
 #include <lapacke.h>
@@ -135,29 +132,29 @@ void BackendCPU::lu_solve() {
     LAPACKE_sgetrs(LAPACK_COL_MAJOR, 'N', n, 1, lhs.data(), n, ipiv.data(), gamma.data(), n);
 }
 
-f32 BackendCPU::compute_coefficient_cl(const FlowData& flow, const f32 area,
-    const u32 j, const u32 n) {
-    assert(n > 0);
-    assert(j >= 0 && j+n <= mesh.ns);
+// f32 BackendCPU::compute_coefficient_cl(const FlowData& flow, const f32 area,
+//     const u32 j, const u32 n) {
+//     assert(n > 0);
+//     assert(j >= 0 && j+n <= mesh.ns);
 
-    f32 cl = 0.0f;
-
-    for (u32 u = 0; u < mesh.nc; u++) {
-        for (u32 v = j; v < j + n; v++) {
-            const u32 li = u * mesh.ns + v; // linear index
-            const linalg::alias::float3 v0 = mesh.get_v0(li);
-            const linalg::alias::float3 v1 = mesh.get_v1(li);
-            // Leading edge vector pointing outward from wing root
-            const linalg::alias::float3 dl = v1 - v0;
-            // Distance from the center of leading edge to the reference point
-            const linalg::alias::float3 force = linalg::cross(flow.freestream, dl) * flow.rho * delta_gamma[li];
-            cl += linalg::dot(force, flow.lift_axis);
-        }
-    }
-    cl /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area;
-
-    return cl;
-}
+//     f32 cl = 0.0f;
+
+//     for (u32 u = 0; u < mesh.nc; u++) {
+//         for (u32 v = j; v < j + n; v++) {
+//             const u32 li = u * mesh.ns + v; // linear index
+//             const linalg::alias::float3 v0 = mesh.get_v0(li);
+//             const linalg::alias::float3 v1 = mesh.get_v1(li);
+//             // Leading edge vector pointing outward from wing root
+//             const linalg::alias::float3 dl = v1 - v0;
+//             // Distance from the center of leading edge to the reference point
+//             const linalg::alias::float3 force = linalg::cross(flow.freestream, dl) * flow.rho * delta_gamma[li];
+//             cl += linalg::dot(force, flow.lift_axis);
+//         }
+//     }
+//     cl /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area;
+
+//     return cl;
+// }
 
 linalg::alias::float3 BackendCPU::compute_coefficient_cm(
     const FlowData& flow,
@@ -188,75 +185,6 @@ linalg::alias::float3 BackendCPU::compute_coefficient_cm(
     return cm;
 }
 
-inline void kernel_biosavart(f32& vx, f32& vy, f32& vz, f32& x, f32& y, f32& z, f32& x1, f32& y1, f32& z1, f32& x2, f32& y2, f32& z2) {
-    static const f32 rcut = 1.0e-10f;
-    vx = 0.0f;
-    vy = 0.0f;
-    vz = 0.0f;
-
-    // Katz Plotkin, Low speed Aero | Eq 10.115
-    const f32 r1r2x =   (y-y1)*(z-z2) - (z-z1)*(y-y2);
-    const f32 r1r2y = -((x-x1)*(z-z2) - (z-z1)*(x-x2));
-    const f32 r1r2z =   (x-x1)*(y-y2) - (y-y1)*(x-x2);
-
-    const f32 r1 = std::sqrt(pow<2>(x-x1)+pow<2>(y-y1)+pow<2>(z-z1));
-    const f32 r2 = std::sqrt(pow<2>(x-x2)+pow<2>(y-y2)+pow<2>(z-z2));
-    const f32 square = pow<2>(r1r2x) + pow<2>(r1r2y) + pow<2>(r1r2z);
-    if ((r1<rcut) || (r2<rcut) || (square<rcut)) return;
-
-    const f32 r0r1 = (x2-x1)*(x-x1)+(y2-y1)*(y-y1)+(z2-z1)*(z-z1);
-    const f32 r0r2 = (x2-x1)*(x-x2)+(y2-y1)*(y-y2)+(z2-z1)*(z-z2);
-    const f32 coeff = 1.0f/(4.0f*PI_f*square) * (r0r1/r1 - r0r2/r2);
-
-    vx = coeff * r1r2x;
-    vy = coeff * r1r2y;
-    vz = coeff * r1r2z;
-}
-
-inline void kernel_symmetry(f32& inf_x, f32& inf_y, f32& inf_z, f32 x, f32 y, f32 z, f32 x1, f32 y1, f32 z1, f32 x2, f32 y2, f32 z2) {
-    f32 vx, vy, vz;
-    kernel_biosavart(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2);
-    inf_x += vx;
-    inf_y += vy;
-    inf_z += vz;
-    y = -y; // wing symmetry
-    kernel_biosavart(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2);
-    inf_x += vx;
-    inf_y -= vy;
-    inf_z += vz;
-}
-
-using namespace linalg::alias;
-
-inline float3 kernel_biosavart(const float3& colloc, const float3& vertex1, const float3& vertex2, const f32& sigma) {
-    const f32 rcut = 1e-10f; // highly tuned value
-    const float3 r0 = vertex2 - vertex1;
-    const float3 r1 = colloc - vertex1;
-    const float3 r2 = colloc - vertex2;
-    // Katz Plotkin, Low speed Aero | Eq 10.115
-    const float3 r1r2cross = linalg::cross(r1, r2);
-    const f32 r1_norm = linalg::length(r1);
-    const f32 r2_norm = linalg::length(r2);
-    const f32 square = linalg::length2(r1r2cross);
-    if ((r1_norm<rcut) || (r2_norm<rcut) || (square<rcut)) {
-        return float3{0.0f, 0.0f, 0.0f};
-    }
-
-    const f32 smoother = sigma*sigma*linalg::length2(r0);
-    const f32 coeff = (linalg::dot(r0,r1)/r1_norm - linalg::dot(r0,r2)/r2_norm) / (4.0f*PI_f*std::sqrt(square*square + smoother*smoother));
-    return r1r2cross * coeff;
-}
-
-inline void kernel_symmetry(float3& inf, float3 colloc, const float3& vertex0, const float3& vertex1, const f32 sigma) {
-    float3 induced_speed = kernel_biosavart(colloc, vertex0, vertex1, sigma);
-    inf += induced_speed;
-    colloc.y = -colloc.y; // wing symmetry
-    float3 induced_speed_sym = kernel_biosavart(colloc, vertex0, vertex1, sigma);
-    inf.x += induced_speed_sym.x;
-    inf.y -= induced_speed_sym.y;
-    inf.z += induced_speed_sym.z;
-}
-
 f32 BackendCPU::compute_coefficient_cd(
     const FlowData& flow,
     const f32 area,
@@ -267,42 +195,34 @@ f32 BackendCPU::compute_coefficient_cd(
     assert(n > 0);
     assert(j >= 0 && j+n <= mesh.ns);
     std::fill(trefftz_buffer.begin(), trefftz_buffer.end(), 0.0f);
-
-    f32 cd = 0.0f;
-    // Drag coefficent computed using Trefftz plane
-    const u32 begin = j + mesh.nb_panels_wing();
-    const u32 end = begin + n;
-    // parallel for
-    for (u32 ia = begin; ia < end; ia++) {
-        const u32 v0 = ia + ia / mesh.ns;
-        const u32 v1 = v0 + 1;
-        const u32 v3 = v0 + n+1;
-        const u32 v2 = v3 + 1;
-
-        const float3 vertex0{mesh.v.x[v0], mesh.v.y[v0], mesh.v.z[v0]};
-        const float3 vertex1{mesh.v.x[v1], mesh.v.y[v1], mesh.v.z[v1]};
-        const float3 vertex2{mesh.v.x[v2], mesh.v.y[v2], mesh.v.z[v2]};
-        const float3 vertex3{mesh.v.x[v3], mesh.v.y[v3], mesh.v.z[v3]};
-
-        const f32 gammaw = gamma[ia - mesh.ns];
-
-        for (u32 ia2 = begin; ia2 < end; ia2++) {
-            const float3 colloc(mesh.colloc.x[ia2], mesh.colloc.y[ia2], mesh.colloc.z[ia2]);
-            const float3 normal(mesh.normal.x[ia2], mesh.normal.y[ia2], mesh.normal.z[ia2]);
-            linalg::alias::float3 inf2(0.f, 0.f, 0.f);
-            // Influence from the streamwise vortex lines
-            kernel_symmetry(inf2, colloc, vertex1, vertex2, flow.sigma_vatistas);
-            kernel_symmetry(inf2, colloc, vertex3, vertex0, flow.sigma_vatistas);
-            // This is the induced velocity calculated with the vortex (gamma) calculated earlier (according to kutta condition)
-            trefftz_buffer[ia2 - begin] += gammaw * linalg::dot(inf2, normal);
-        }
-    }
 
-    for (u32 i = 0; i < mesh.ns; i++) {
-        const u32 li = (mesh.nc-1) * mesh.ns + i;
-        const f32 dl = mesh.strip_width(i);
-        cd -= gamma[li] * trefftz_buffer[i] * dl; // used to have 0.5f * flow.rho
-    }
+    Mesh& m = mesh;
+    ispc::MeshProxy mesh_proxy = {
+        m.ns, m.nc, m.nb_panels_wing(),
+        {m.v.x.data(), m.v.y.data(), m.v.z.data()}, 
+        {m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data()},
+        {m.normal.x.data(), m.normal.y.data(), m.normal.z.data()}
+    };
+    f32 cd = ispc::kernel_trefftz_cd(mesh_proxy, gamma.data(), trefftz_buffer.data(), j, n, flow.sigma_vatistas);
     cd /= linalg::length2(flow.freestream) * area;
     return cd;
-}
+}
+
+// Using Trefftz plane
+f32 BackendCPU::compute_coefficient_cl(
+    const FlowData& flow,
+    const f32 area,
+    const u32 j,
+    const u32 n) 
+{
+    Mesh& m = mesh;
+    ispc::MeshProxy mesh_proxy = {
+        m.ns, m.nc, m.nb_panels_wing(),
+        {m.v.x.data(), m.v.y.data(), m.v.z.data()}, 
+        {m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data()},
+        {m.normal.x.data(), m.normal.y.data(), m.normal.z.data()}
+    };
+    f32 cl = ispc::kernel_trefftz_cl(mesh_proxy, gamma.data(), j, n);
+    cl /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area;
+    return cl;
+}
diff --git a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
@@ -44,7 +44,8 @@ export struct MeshProxy {
 };
 
 // Bio-savart Kernel
-#define RCUT 1e-10f // highly tuned value
+#define RCUT 1e-10f
+#define RCUT2 1e-5f
 
 #define PI_f 3.141593f
 
@@ -58,7 +59,7 @@ inline float3 kernel_biosavart(float3& colloc, const uniform float3& vertex1, co
     float r2_norm = length(r2);
     float square = length2(r1r2cross);
 
-    if ((r1_norm<RCUT) || (r2_norm<RCUT) || (square<RCUT)) {
+    if ((square<RCUT) || (r1_norm<RCUT2) || (r2_norm<RCUT2)) {
         float3 res = {0.0f, 0.0f, 0.0f};
         return res;
     }
@@ -110,9 +111,63 @@ export void kernel_influence(
     }
 }
 
-// export uniform float kernel_trefftz_cd(
-//     uniform const MeshProxy& m,
-//     uniform float* uniform gamma,
-//     uniform unsigned int ia, uniform float sigma
-//     ) {
-// }
+export uniform float kernel_trefftz_cd(
+    uniform const MeshProxy& m,
+    uniform float* uniform gamma,
+    uniform float* uniform trefftz_buffer,
+    uniform unsigned int j, uniform unsigned int n, uniform float sigma
+    ) {
+    uniform unsigned int begin = m.nb_panels + j;
+    uniform unsigned int end = begin + n;
+    float cd = 0.0f;
+
+    // Compute the induced velocity of the streamwise wake vortex segments
+    for (uniform unsigned int ia = m.nb_panels; ia < m.nb_panels + m.ns; ia++) {
+        const uniform unsigned int v0 = ia + ia / m.ns;
+        const uniform unsigned int v1 = v0 + 1;
+        const uniform unsigned int v3 = v0 + m.ns+1;
+        const uniform unsigned int v2 = v3 + 1;
+
+        // Broadcast vertices
+        const uniform float3 vertex0 = {m.v.x[v0], m.v.y[v0], m.v.z[v0]};
+        const uniform float3 vertex1 = {m.v.x[v1], m.v.y[v1], m.v.z[v1]};
+        const uniform float3 vertex2 = {m.v.x[v2], m.v.y[v2], m.v.z[v2]};
+        const uniform float3 vertex3 = {m.v.x[v3], m.v.y[v3], m.v.z[v3]};
+
+        uniform float gammaw = gamma[ia - m.ns];
+        foreach(ia2 = begin ... end) {
+            const float3 colloc = {m.colloc.x[ia2], m.colloc.y[ia2], m.colloc.z[ia2]};
+            const float3 normal = {m.normal.x[ia2], m.normal.y[ia2], m.normal.z[ia2]};
+            float3 inf = {0.0f, 0.0f, 0.0f};
+
+            kernel_symmetry(inf, colloc, vertex1, vertex2, sigma);
+            kernel_symmetry(inf, colloc, vertex3, vertex0, sigma);
+            trefftz_buffer[ia2 - begin + j] += gammaw * dot(inf, normal); // store
+        }
+    }
+    // Perform the integration (Katz Plotkin, Low speed Aero | Eq 8.146)
+    foreach(i = j ... j + n) {
+        unsigned int li = (m.nc-1) * m.ns + i;
+        float3 v0 = {m.v.x[i], m.v.y[i], m.v.z[i]};
+        float3 v1 = {m.v.x[i+1], m.v.y[i+1], m.v.z[i+1]};
+        float dl = length(v1 - v0);
+        cd -= gamma[li] * trefftz_buffer[i] * dl; // used to have 0.5f * flow.rho
+    }
+    return reduce_add(cd);
+}
+
+export uniform float kernel_trefftz_cl(
+    uniform const MeshProxy& m,
+    uniform float* uniform gamma,
+    uniform unsigned int j, uniform unsigned int n
+    ) {
+    float cl = 0.0f;
+    foreach(i = j ... j + n) {
+        unsigned int li = (m.nc-1) * m.ns + i;
+        float3 v0 = {m.v.x[i], m.v.y[i], m.v.z[i]};
+        float3 v1 = {m.v.x[i+1], m.v.y[i+1], m.v.z[i+1]};
+        float dl = length(v1 - v0);
+        cl += gamma[li]* dl; // used to have 0.5f * flow.rho
+    }
+    return reduce_add(cl);
+}