From 0713929a744af7616a16e329c212b79e1db2a61f Mon Sep 17 00:00:00 2001 From: Samuel Ayala Date: Sun, 11 Feb 2024 19:10:39 -0500 Subject: [PATCH 1/6] add ispc kernel tests are failing tho --- config/elliptic.vlm | 2 +- config/swept.vlm | 2 +- headeronly/tinytimer.hpp | 6 +- tests/test_linear.cpp | 141 ++++++++++++++++ tests/test_non_linear.cpp | 20 +-- .../include/vlm_backend_cpu.hpp} | 6 +- .../cpu/include/vlm_backend_cpu_kernels.hpp | 12 ++ .../src/vlm_backend_cpu.cpp} | 152 +++++++++++++----- .../cpu/src/vlm_backend_cpu_kernels.cpp | 67 ++++++++ .../cpu/src/vlm_backend_cpu_kernels.ispc | 107 ++++++++++++ vlm/backends/{avx2 => cpu}/xmake.lua | 7 +- .../cuda/include/vlm_backend_cuda.hpp | 4 +- vlm/backends/cuda/src/vlm_backend_cuda.cu | 4 +- vlm/backends/cuda/xmake.lua | 2 +- vlm/dev/main.cpp | 4 +- vlm/include/vlm_executor.hpp | 1 - vlm/include/vlm_types.hpp | 3 + vlm/include/vlm_utils.hpp | 2 + vlm/src/vlm.cpp | 4 +- vlm/src/vlm_backend.cpp | 12 +- vlm/src/vlm_utils.cpp | 10 ++ xmake.lua | 4 +- 22 files changed, 493 insertions(+), 79 deletions(-) create mode 100644 tests/test_linear.cpp rename vlm/backends/{avx2/include/vlm_backend_avx2.hpp => cpu/include/vlm_backend_cpu.hpp} (90%) create mode 100644 vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp rename vlm/backends/{avx2/src/vlm_backend_avx2.cpp => cpu/src/vlm_backend_cpu.cpp} (73%) create mode 100644 vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp create mode 100644 vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc rename vlm/backends/{avx2 => cpu}/xmake.lua (68%) diff --git a/config/elliptic.vlm b/config/elliptic.vlm index 2b4232d..a899515 100644 --- a/config/elliptic.vlm +++ b/config/elliptic.vlm @@ -8,7 +8,7 @@ alphas = [5] s_ref = 3.92699 # half wing area c_ref = 0.85 sigma_vatistas = 0.0 -backend = avx2 +backend = cpu wake_included = false diff --git a/config/swept.vlm b/config/swept.vlm index abada90..c2b6521 100644 --- a/config/swept.vlm +++ b/config/swept.vlm @@ -7,7 +7,7 @@ ref_point = [0.25, 0.0, 0.0] alphas = [5] -backend = avx2 +backend = cpu wake_included = false \ No newline at end of file diff --git a/headeronly/tinytimer.hpp b/headeronly/tinytimer.hpp index 16c7a38..cdeac87 100644 --- a/headeronly/tinytimer.hpp +++ b/headeronly/tinytimer.hpp @@ -8,11 +8,11 @@ namespace tiny { constexpr static long long us_in_s = 1'000'000LL; constexpr static long long us_in_ms = 1'000LL; -class Timer { +class ScopedTimer { public: - Timer(const char* name) : m_name(name), m_start(std::chrono::high_resolution_clock::now()) {} + ScopedTimer(const char* name) : m_name(name), m_start(std::chrono::high_resolution_clock::now()) {} - ~Timer() { + ~ScopedTimer() { const auto m_end = std::chrono::high_resolution_clock::now(); const auto duration = static_cast(std::chrono::duration_cast(m_end - m_start).count()); std::printf("%s: ", m_name); diff --git a/tests/test_linear.cpp b/tests/test_linear.cpp new file mode 100644 index 0000000..c1cad6f --- /dev/null +++ b/tests/test_linear.cpp @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include + +#include "tinycombination.hpp" + +#include "vlm.hpp" +#include "vlm_utils.hpp" + +using namespace vlm; + +// area of whole wing +float s_ref(float a, float b) { + return 0.5f * vlm::PI_f * a * b; +} + +// wing aspect ratio +float aspect_ratio(float a, float b) { + return (2*b)*(2*b) / s_ref(a, b); +} + +float compute_analytical_cl(float alpha, float a, float b) { + return 2.0f * vlm::PI_f * alpha / (1.0f + 2.0f/aspect_ratio(a, b)); +} + +float compute_analytical_cd(float cl, float a, float b) { + return cl*cl / (vlm::PI_f * aspect_ratio(a, b)); +} + +float compute_analytical_gamma(float y, float a, float b, float alpha) { + const float gamma0 = compute_analytical_cl(alpha, a, b) * 1.0f * s_ref(a, b) / (vlm::PI_f * b); + const float ratio = y / b; + return gamma0 * std::sqrt(1.0f - ratio*ratio); +} + +// bool elliptic_convergence() { +// tiny::Config cfg("../../../../config/elliptic.vlm"); + +// vlm::VLM vlm(cfg); + +// std::vector dimensions = { +// 16, 32, 45, 64, 90, 128 +// }; + +// const float a = 1.0f; // wing chord root +// const float b = 5.0f; // half wing span + +// std::vector norm_l1; +// std::vector norm_l2; +// std::vector norm_linf; + +// const float alpha = 0.1f; // degrees + +// for (const auto& dim : dimensions) { +// std::string filename = std::format("../../../../mesh/elliptic_{}x{}.x", dim, dim); +// vlm.mesh.io_read(filename); +// vlm.init(); +// vlm::Solver solver(vlm.mesh, vlm.data, cfg); +// solver.run(alpha); + +// double l1 = 0.0f; +// double l2 = 0.0f; +// double linf = 0.0f; +// int begin = (vlm.mesh.nc - 1) * vlm.mesh.ns; +// int end = vlm.mesh.nc * vlm.mesh.ns; +// // loop over last row of panels +// for (int i = begin; i < end; i++) { +// const float y = vlm.mesh.colloc.y[i]; +// const float gamma = vlm.data.gamma[i]; +// const float gamma_analytical = analytical_gamma(y, a, b, alpha); +// const double error = std::abs((gamma - gamma_analytical) / (gamma_analytical + 1e-7f)); +// std::printf("y: %f, gamma: %f, gamma_analytical: %f, error: %f \n", y, gamma, gamma_analytical, error); + +// l1 += error; +// l2 += error * error; +// linf = std::max(linf, error); +// } +// l1 /= (end - begin); +// l2 = std::sqrt(l2) / (end - begin); +// std::printf("L1: %f, L2: %f, Linf: %f\n", l1, l2, linf); + +// norm_l1.push_back(l1); +// norm_l2.push_back(l2); +// norm_linf.push_back(linf); +// } + +// double order_l1 = 0.0f; +// double order_l2 = 0.0f; +// double order_linf = 0.0f; + +// auto order = [=](double norm0, double norm1, float dim0, float dim1) { +// return std::log(norm0 / norm1) / std::log((b/dim0)/(b/dim1)); +// }; + +// for (int i = 0; i < dimensions.size() - 1; i++) { +// order_l1 += order(norm_l1[i], norm_l1[i+1], dimensions[i], dimensions[i+1]); +// order_l2 += order(norm_l2[i], norm_l2[i+1], dimensions[i], dimensions[i+1]); +// order_linf += order(norm_linf[i], norm_linf[i+1], dimensions[i], dimensions[i+1]); +// } +// order_l1 /= (dimensions.size() - 1); +// order_l2 /= (dimensions.size() - 1); +// order_linf /= (dimensions.size() - 1); +// std::printf("Order L1: %f, Order L2: %f, Order Linf: %f\n", order_l1, order_l2, order_linf); +// return 0; +// } + +int main(int argc, char **argv) { + const float a = 1.0f; // wing chord root + const float b = 5.0f; // half wing span + + const std::vector meshes = {"../../../../mesh/elliptic_64x64.x"}; + const std::vector backends = {"cpu"}; + std::vector test_alphas = {0, 1, 2, 3, 4, 5, 10, 15}; + std::transform(test_alphas.begin(), test_alphas.end(), test_alphas.begin(), to_radians); + + auto solvers = tiny::make_combination(meshes, backends); + for (const auto& [mesh_name, backend_name] : solvers) { + LinearVLM solver{}; + solver.mesh = create_mesh(mesh_name); + solver.backend = create_backend(backend_name, *solver.mesh); + + for (u64 i = 0; i < test_alphas.size(); i++) { + const FlowData flow{test_alphas[i], 0.0f, 1.0f, 1.0f}; + const auto coeffs = solver.solve(flow); + const f32 analytical_cl = compute_analytical_cl(flow.alpha, a, b); + const f32 analytical_cd = compute_analytical_cd(analytical_cl, a, b); + const f32 cl_aerr = std::abs(coeffs.cl - analytical_cl); + const f32 cl_rerr = analytical_cl < EPS_f ? 0.f : cl_aerr / analytical_cl; + const f32 cd_aerr = std::abs(coeffs.cd - analytical_cd); + const f32 cd_rerr = analytical_cd < EPS_f ? 0.f : cd_aerr / analytical_cd; + std::printf(">>> Alpha: %.1f | CL = %.6f CD = %.6f CMx = %.6f CMy = %.6f CMz = %.6f\n", to_degrees(flow.alpha), coeffs.cl, coeffs.cd, coeffs.cm.x, coeffs.cm.y, coeffs.cm.z); + std::printf(">>> Analytical CL: %.6f | Abs Error: %.3E | Relative Error: %.5f%% \n", analytical_cl, cl_aerr, cl_rerr*100.f); + std::printf(">>> Analytical CD: %.6f | Abs Error: %.3E | Relative Error: %.5f%% \n", analytical_cd, cd_aerr, cd_rerr*100.f); + if (cl_rerr > 0.03f || cd_rerr > 0.01f) return 1; + } + } + + return 0; +} \ No newline at end of file diff --git a/tests/test_non_linear.cpp b/tests/test_non_linear.cpp index 95a9738..6348fdb 100644 --- a/tests/test_non_linear.cpp +++ b/tests/test_non_linear.cpp @@ -7,9 +7,11 @@ #include #include "tinyinterpolate.hpp" -#include "vlm.hpp" #include "tinycombination.hpp" +#include "vlm.hpp" +#include "vlm_utils.hpp" + using namespace vlm; struct LiftCurveFunctor { @@ -41,14 +43,6 @@ void linspace(T start, T end, u32 n, std::vector& out) { } } -f32 to_radians(f32 degrees) { - return degrees * PI_f / 180.0f; -} - -f32 to_degrees(f32 radians) { - return radians * 180.0f / PI_f; -} - template void write_vector_pair(const std::string& filename, const std::vector& vec1, const std::vector& vec2) { assert(vec1.size() == vec2.size()); @@ -67,7 +61,7 @@ void write_vector_pair(const std::string& filename, const std::vector& vec1, int main(int argc, char** argv) { const std::vector meshes = {"../../../../mesh/infinite_rectangular_5x200.x"}; - const std::vector backends = {"avx2"}; + const std::vector backends = {"cpu"}; std::vector>> lift_curves; lift_curves.emplace_back(std::make_pair("spallart1", std::make_unique(1.2f, 0.28f, 0.02f, 2.f*PI_f, 2.f*PI_f))); lift_curves.emplace_back(std::make_pair("spallart2", std::make_unique(0.72f, 0.28f, 0.04f, 2.f*PI_f, 1.5f*PI_f))); @@ -113,9 +107,9 @@ int main(int argc, char** argv) { std::printf(">>> Alpha: %.1f | CL = %.6f CD = %.6f CMx = %.6f CMy = %.6f CMz = %.6f\n", to_degrees(test_alphas[i]), coeffs.cl, coeffs.cd, coeffs.cm.x, coeffs.cm.y, coeffs.cm.z); const f32 analytical_cl = (*lift_curve.second)(flow.alpha); const f32 abs_error = std::abs(coeffs.cl - analytical_cl); - const f32 rel_error = 100.0f * abs_error / (analytical_cl + std::numeric_limits::epsilon()); - std::printf(">>> Analytical: %.6f | Abs Error: %.3E | Relative Error: %.5f%% \n", analytical_cl, abs_error, rel_error); - if (rel_error > 1.f) return 1; // Failure + const f32 rel_error = abs_error / (analytical_cl + std::numeric_limits::epsilon()); + std::printf(">>> Analytical: %.6f | Abs Error: %.3E | Relative Error: %.5f%% \n", analytical_cl, abs_error, rel_error*100.f); + if (rel_error > 0.01f) return 1; // Failure } write_vector_pair(lift_curve.first + "_nonlinear_cl", test_alphas, test_cl); } diff --git a/vlm/backends/avx2/include/vlm_backend_avx2.hpp b/vlm/backends/cpu/include/vlm_backend_cpu.hpp similarity index 90% rename from vlm/backends/avx2/include/vlm_backend_avx2.hpp rename to vlm/backends/cpu/include/vlm_backend_cpu.hpp index 732c598..81b0227 100644 --- a/vlm/backends/avx2/include/vlm_backend_avx2.hpp +++ b/vlm/backends/cpu/include/vlm_backend_cpu.hpp @@ -5,7 +5,7 @@ namespace vlm { -class BackendAVX2 : public Backend { +class BackendGeneric : public Backend { public: std::vector lhs; std::vector rhs; @@ -13,8 +13,8 @@ class BackendAVX2 : public Backend { std::vector gamma; std::vector delta_gamma; - BackendAVX2(Mesh& mesh); - ~BackendAVX2(); + BackendGeneric(Mesh& mesh); + ~BackendGeneric(); void reset() override; void compute_lhs(const FlowData& flow) override; void compute_rhs(const FlowData& flow) override; diff --git a/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp b/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp new file mode 100644 index 0000000..2b8be98 --- /dev/null +++ b/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp @@ -0,0 +1,12 @@ +#include "vlm_types.hpp" + +namespace vlm { +void kernel_influence( + u32 m, u32 n, + f32 lhs[], + f32 vx[], f32 vy[], f32 vz[], + f32 collocx[], f32 collocy[], f32 collocz[], + f32 normalx[], f32 normaly[], f32 normalz[], + u32 ia, u32 lidx, f32 sigma_p4 + ); +} \ No newline at end of file diff --git a/vlm/backends/avx2/src/vlm_backend_avx2.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp similarity index 73% rename from vlm/backends/avx2/src/vlm_backend_avx2.cpp rename to vlm/backends/cpu/src/vlm_backend_cpu.cpp index 22c95f5..a9d8444 100644 --- a/vlm/backends/avx2/src/vlm_backend_avx2.cpp +++ b/vlm/backends/cpu/src/vlm_backend_cpu.cpp @@ -1,4 +1,6 @@ -#include "vlm_backend_avx2.hpp" +#include "vlm_backend_cpu.hpp" +#include "vlm_backend_cpu_kernels.hpp" +#include "vlm_backend_cpu_kernels_ispc.h" #include "linalg.h" #include "simpletimer.hpp" @@ -12,6 +14,7 @@ #include #include +#include #include #include @@ -19,9 +22,9 @@ using namespace vlm; -BackendAVX2::~BackendAVX2() = default; // Destructor definition +BackendGeneric::~BackendGeneric() = default; // Destructor definition -BackendAVX2::BackendAVX2(Mesh& mesh) : Backend(mesh) { +BackendGeneric::BackendGeneric(Mesh& mesh) : Backend(mesh) { lhs.resize((u64)mesh.nb_panels_wing() * (u64)mesh.nb_panels_wing()); rhs.resize(mesh.nb_panels_wing()); ipiv.resize(mesh.nb_panels_wing()); @@ -29,13 +32,13 @@ BackendAVX2::BackendAVX2(Mesh& mesh) : Backend(mesh) { delta_gamma.resize(mesh.nb_panels_wing()); } -void BackendAVX2::reset() { +void BackendGeneric::reset() { std::fill(gamma.begin(), gamma.end(), 0.0f); std::fill(lhs.begin(), lhs.end(), 0.0f); std::fill(rhs.begin(), rhs.end(), 0.0f); } -void BackendAVX2::compute_delta_gamma() { +void BackendGeneric::compute_delta_gamma() { std::copy(gamma.begin(), gamma.begin()+mesh.ns, delta_gamma.begin()); // note: this is efficient as the memory is contiguous @@ -46,14 +49,13 @@ void BackendAVX2::compute_delta_gamma() { } } -inline void micro_kernel_influence_scalar(f32& vx, f32& vy, f32& vz, f32& x, f32& y, f32& z, f32& x1, f32& y1, f32& z1, f32& x2, f32& y2, f32& z2) { - static const f32 rcut = 1.0e-12f; +inline void kernel_biosavart(f32& vx, f32& vy, f32& vz, f32& x, f32& y, f32& z, f32& x1, f32& y1, f32& z1, f32& x2, f32& y2, f32& z2) { + static const f32 rcut = 1.0e-10f; vx = 0.0f; vy = 0.0f; vz = 0.0f; // Katz Plotkin, Low speed Aero | Eq 10.115 - const f32 r1r2x = (y-y1)*(z-z2) - (z-z1)*(y-y2); const f32 r1r2y = -((x-x1)*(z-z2) - (z-z1)*(x-x2)); const f32 r1r2z = (x-x1)*(y-y2) - (y-y1)*(x-x2); @@ -72,14 +74,14 @@ inline void micro_kernel_influence_scalar(f32& vx, f32& vy, f32& vz, f32& x, f32 vz = coeff * r1r2z; } -inline void kernel_influence_scalar(f32& inf_x, f32& inf_y, f32& inf_z, f32 x, f32 y, f32 z, f32 x1, f32 y1, f32 z1, f32 x2, f32 y2, f32 z2) { +inline void kernel_symmetry(f32& inf_x, f32& inf_y, f32& inf_z, f32 x, f32 y, f32 z, f32 x1, f32 y1, f32 z1, f32 x2, f32 y2, f32 z2) { f32 vx, vy, vz; - micro_kernel_influence_scalar(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2); + kernel_biosavart(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2); inf_x += vx; inf_y += vy; inf_z += vz; y = -y; // wing symmetry - micro_kernel_influence_scalar(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2); + kernel_biosavart(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2); inf_x += vx; inf_y -= vy; inf_z += vz; @@ -118,10 +120,56 @@ inline void macro_kernel_remainder_scalar(Mesh& m, std::vector& lhs, u32 ia f32 inf_x = 0.0f; f32 inf_y = 0.0f; f32 inf_z = 0.0f; - kernel_influence_scalar(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v0x, v0y, v0z, v1x, v1y, v1z); - kernel_influence_scalar(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v1x, v1y, v1z, v2x, v2y, v2z); - kernel_influence_scalar(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v2x, v2y, v2z, v3x, v3y, v3z); - kernel_influence_scalar(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v3x, v3y, v3z, v0x, v0y, v0z); + kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v0x, v0y, v0z, v1x, v1y, v1z); + kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v1x, v1y, v1z, v2x, v2y, v2z); + kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v2x, v2y, v2z, v3x, v3y, v3z); + kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v3x, v3y, v3z, v0x, v0y, v0z); + f32 nx = m.normal.x[ia2]; + f32 ny = m.normal.y[ia2]; + f32 nz = m.normal.z[ia2]; + f32 ring_inf = inf_x * nx + inf_y * ny + inf_z * nz; + // store in col major order + if (Overwrite) { + lhs[ia * m.nb_panels_wing() + ia2] = ring_inf; + } else { + lhs[ia * m.nb_panels_wing() + ia2] += ring_inf; + } + } +} + +template +inline void macro_kernel_scalar(Mesh& m, std::vector& lhs, u32 ia, u32 lidx) { + const u32 v0 = lidx + lidx / m.ns; + const u32 v1 = v0 + 1; + const u32 v3 = v0 + m.ns+1; + const u32 v2 = v3 + 1; + + f32 v0x = m.v.x[v0]; + f32 v0y = m.v.y[v0]; + f32 v0z = m.v.z[v0]; + f32 v1x = m.v.x[v1]; + f32 v1y = m.v.y[v1]; + f32 v1z = m.v.z[v1]; + f32 v2x = m.v.x[v2]; + f32 v2y = m.v.y[v2]; + f32 v2z = m.v.z[v2]; + f32 v3x = m.v.x[v3]; + f32 v3y = m.v.y[v3]; + f32 v3z = m.v.z[v3]; + + for (u32 ia2 = 0; ia2 < m.nb_panels_wing(); ia2++) { + const f32 colloc_x = m.colloc.x[ia2]; + const f32 colloc_y = m.colloc.y[ia2]; + const f32 colloc_z = m.colloc.z[ia2]; + + // 3 regs to store induced velocity + f32 inf_x = 0.0f; + f32 inf_y = 0.0f; + f32 inf_z = 0.0f; + kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v0x, v0y, v0z, v1x, v1y, v1z); + kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v1x, v1y, v1z, v2x, v2y, v2z); + kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v2x, v2y, v2z, v3x, v3y, v3z); + kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v3x, v3y, v3z, v0x, v0y, v0z); f32 nx = m.normal.x[ia2]; f32 ny = m.normal.y[ia2]; f32 nz = m.normal.z[ia2]; @@ -136,7 +184,7 @@ inline void macro_kernel_remainder_scalar(Mesh& m, std::vector& lhs, u32 ia } inline void micro_kernel_influence_avx2(__m256& vx, __m256& vy, __m256& vz, __m256& x, __m256& y, __m256& z, __m256& x1, __m256& y1, __m256& z1, __m256& x2, __m256& y2, __m256& z2, f32 sigma_p4) { - static const __m256 threshold = _mm256_set1_ps(1.0e-10f); + static const __m256 threshold = _mm256_set1_ps(1e-10f); static const __m256 four_pi = _mm256_set1_ps(4.0f * PI_f); static const __m256 zero = _mm256_set1_ps(0.0f); @@ -178,16 +226,16 @@ inline void micro_kernel_influence_avx2(__m256& vx, __m256& vy, __m256& vz, __m2 __m256 four_pi_r1_mag_r2_mag = _mm256_mul_ps(four_pi, _mm256_mul_ps(r1_mag, r2_mag)); // 4*pi*|r1|*|r2| __m256 denominator; - if (sigma_p4 == 0.0f) { - // Singular Bio-Savart - denominator = _mm256_mul_ps(four_pi_r1_mag_r2_mag, r1_x_r2_mag_p2); - } else { + // if (sigma_p4 < 1e-6f) { + // // Singular Bio-Savart + // denominator = _mm256_mul_ps(four_pi_r1_mag_r2_mag, r1_x_r2_mag_p2); + // } else { // Vatistas smoothing kernel (n=2) (https://doi.org/10.3390/fluids7020081) __m256 r1_x_r2_mag_p4 = _mm256_mul_ps(r1_x_r2_mag_p2, r1_x_r2_mag_p2); // ^2n __m256 r0_mag_p2 = _mm256_fmadd_ps(r0x, r0x, _mm256_fmadd_ps(r0y, r0y, _mm256_mul_ps(r0z, r0z))); __m256 r0_mag_p4 = _mm256_mul_ps(r0_mag_p2, r0_mag_p2); // ^2n denominator = _mm256_mul_ps(four_pi_r1_mag_r2_mag, _mm256_sqrt_ps(_mm256_fmadd_ps(_mm256_set1_ps(sigma_p4), r0_mag_p4, r1_x_r2_mag_p4))); - } + //} __m256 coeff = _mm256_div_ps(numerator, denominator); @@ -266,14 +314,26 @@ inline void macro_kernel_avx2(Mesh& m, std::vector& lhs, u32 ia, u32 lidx, lhs_ia = _mm256_add_ps(lhs_ia, ring_inf); _mm256_storeu_ps(&lhs[ia * m.nb_panels_wing() + ia2], lhs_ia); } + if (ia == 0 && ia2 == 0) { + for (u32 iii = 0; iii < 8; iii++) { + std::printf("%f ", lhs[iii]); + } + std::printf("\n"); + } } } -void BackendAVX2::compute_lhs(const FlowData& flow) { +void BackendGeneric::compute_lhs(const FlowData& flow) { SimpleTimer timer("LHS"); Mesh& m = mesh; - const f32 sigma_p4 = pow<4>(flow.sigma_vatistas); // Vatistas coeffcient (^2n with n=2) - + ispc::MeshProxy mesh_proxy = { + m.nc, m.ns, m.nb_panels_wing(), + {m.v.x.data(), m.v.y.data(), m.v.z.data()}, + {m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data()}, + {m.normal.x.data(), m.normal.y.data(), m.normal.z.data()} + }; + // const f32 sigma_p4 = pow<4>(flow.sigma_vatistas); // Vatistas coeffcient (^2n with n=2) + const u32 start_wing = 0; const u32 end_wing = (m.nc - 1) * m.ns; @@ -281,8 +341,16 @@ void BackendAVX2::compute_lhs(const FlowData& flow) { auto init = taskflow.placeholder(); auto wing_pass = taskflow.for_each_index(start_wing, end_wing, [&] (u32 i) { - macro_kernel_avx2(m, lhs, i, i, sigma_p4); - macro_kernel_remainder_scalar(m, lhs, i, i); + // macro_kernel_avx2(m, lhs, i, i, flow.sigma_vatistas); + // macro_kernel_remainder_scalar(m, lhs, i, i); + // kernel_influence(m.nc, m.ns, + // lhs.data(), + // m.v.x.data(), m.v.y.data(), m.v.z.data(), + // m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data(), + // m.normal.x.data(), m.normal.y.data(), m.normal.z.data(), + // i, i, flow.sigma_vatistas); + //macro_kernel_scalar(m, lhs, i, i); + ispc::kernel_influence(mesh_proxy, lhs.data(), i, i, flow.sigma_vatistas); }); u32 idx = m.nc - 1; @@ -292,8 +360,14 @@ void BackendAVX2::compute_lhs(const FlowData& flow) { auto wake_pass = taskflow.for_each_index(0u, m.ns, [&] (u32 j) { const u32 ia = (m.nc - 1) * m.ns + j; const u32 lidx = idx * m.ns + j; - macro_kernel_avx2(m, lhs, ia, lidx, sigma_p4); - macro_kernel_remainder_scalar(m, lhs, idx, idx); + kernel_influence(m.nc, m.ns, + lhs.data(), + m.v.x.data(), m.v.y.data(), m.v.z.data(), + m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data(), + m.normal.x.data(), m.normal.y.data(), m.normal.z.data(), + ia, lidx, flow.sigma_vatistas); + // macro_kernel_avx2(m, lhs, ia, lidx, flow.sigma_vatistas); + // macro_kernel_remainder_scalar(m, lhs, idx, idx); }); auto back = taskflow.emplace([&]{ idx++; @@ -310,20 +384,20 @@ void BackendAVX2::compute_lhs(const FlowData& flow) { Executor::get().run(taskflow).wait(); } -void kernel_generic_rhs(u32 n, const float normal_x[], const float normal_y[], const float normal_z[], float freestream_x, float freestream_y, float freestream_z, float rhs[]) { +void kernel_cpu_rhs(u32 n, const float normal_x[], const float normal_y[], const float normal_z[], float freestream_x, float freestream_y, float freestream_z, float rhs[]) { for (u32 i = 0; i < n; i++) { rhs[i] = - (freestream_x * normal_x[i] + freestream_y * normal_y[i] + freestream_z * normal_z[i]); } } -void BackendAVX2::compute_rhs(const FlowData& flow) { +void BackendGeneric::compute_rhs(const FlowData& flow) { SimpleTimer timer("RHS"); const Mesh& m = mesh; - kernel_generic_rhs(m.nb_panels_wing(), m.normal.x.data(), m.normal.y.data(), m.normal.z.data(), flow.freestream.x, flow.freestream.y, flow.freestream.z, rhs.data()); + kernel_cpu_rhs(m.nb_panels_wing(), m.normal.x.data(), m.normal.y.data(), m.normal.z.data(), flow.freestream.x, flow.freestream.y, flow.freestream.z, rhs.data()); } -void BackendAVX2::compute_rhs(const FlowData& flow, const std::vector& section_alphas) { +void BackendGeneric::compute_rhs(const FlowData& flow, const std::vector& section_alphas) { SimpleTimer timer("Rebuild RHS"); assert(section_alphas.size() == mesh.ns); const Mesh& m = mesh; @@ -336,13 +410,13 @@ void BackendAVX2::compute_rhs(const FlowData& flow, const std::vector& sect } } -void BackendAVX2::lu_factor() { +void BackendGeneric::lu_factor() { SimpleTimer timer("Factor"); const u32 n = mesh.nb_panels_wing(); LAPACKE_sgetrf(LAPACK_COL_MAJOR, n, n, lhs.data(), n, ipiv.data()); } -void BackendAVX2::lu_solve() { +void BackendGeneric::lu_solve() { SimpleTimer timer("Solve"); const u32 n = mesh.nb_panels_wing(); std::copy(rhs.begin(), rhs.end(), gamma.begin()); @@ -350,7 +424,7 @@ void BackendAVX2::lu_solve() { LAPACKE_sgetrs(LAPACK_COL_MAJOR, 'N', n, 1, lhs.data(), n, ipiv.data(), gamma.data(), n); } -f32 BackendAVX2::compute_coefficient_cl(const FlowData& flow, const f32 area, +f32 BackendGeneric::compute_coefficient_cl(const FlowData& flow, const f32 area, const u32 j, const u32 n) { assert(n > 0); assert(j >= 0 and j+n <= mesh.ns); @@ -374,7 +448,7 @@ f32 BackendAVX2::compute_coefficient_cl(const FlowData& flow, const f32 area, return cl; } -linalg::alias::float3 BackendAVX2::compute_coefficient_cm( +linalg::alias::float3 BackendGeneric::compute_coefficient_cm( const FlowData& flow, const f32 area, const f32 chord, @@ -403,7 +477,7 @@ linalg::alias::float3 BackendAVX2::compute_coefficient_cm( return cm; } -f32 BackendAVX2::compute_coefficient_cd( +f32 BackendGeneric::compute_coefficient_cd( const FlowData& flow, const f32 area, const u32 j, @@ -429,8 +503,8 @@ f32 BackendAVX2::compute_coefficient_cd( linalg::alias::float3 v2 = mesh.get_v2(ia2); linalg::alias::float3 v3 = mesh.get_v3(ia2); // Influence from the streamwise vortex lines - kernel_influence_scalar(inf2.x, inf2.y, inf2.z, colloc_x, colloc_y, colloc_z, v1.x, v1.y, v1.z, v2.x, v2.y, v2.z); - kernel_influence_scalar(inf2.x, inf2.y, inf2.z, colloc_x, colloc_y, colloc_z, v3.x, v3.y, v3.z, v0.x, v0.y, v0.z); + kernel_symmetry(inf2.x, inf2.y, inf2.z, colloc_x, colloc_y, colloc_z, v1.x, v1.y, v1.z, v2.x, v2.y, v2.z); + kernel_symmetry(inf2.x, inf2.y, inf2.z, colloc_x, colloc_y, colloc_z, v3.x, v3.y, v3.z, v0.x, v0.y, v0.z); f32 gamma_w = gamma[(mesh.nc-1)*mesh.ns + ia2 % mesh.ns]; // This is the induced velocity calculated with the vortex (gamma) calculated earlier (according to kutta condition) inf += gamma_w * inf2; diff --git a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp new file mode 100644 index 0000000..af2a0a7 --- /dev/null +++ b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp @@ -0,0 +1,67 @@ +#include "vlm_backend_cpu_kernels.hpp" +#include "linalg.h" +#include "vlm_types.hpp" +#include + +using namespace vlm; +using namespace linalg::alias; + +inline float3 kernel_biosavart(const float3& colloc, const float3& vertex1, const float3& vertex2, const f32& sigma) { + const f32 rcut = 1e-10f; // highly tuned value + const float3 r0 = vertex2 - vertex1; + const float3 r1 = colloc - vertex1; + const float3 r2 = colloc - vertex2; + // Katz Plotkin, Low speed Aero | Eq 10.115 + const float3 r1r2cross = linalg::cross(r1, r2); + const f32 r1_norm = linalg::length(r1); + const f32 r2_norm = linalg::length(r2); + const f32 square = linalg::length2(r1r2cross); + if ((r1_norm float3; + +#define DOT(v0, v1) (v0.x*v1.x + v0.y*v1.y + v0.z*v1.z) + +#define CROSS(v0, v1, res) \ + res.x = v0.y*v1.z - v0.z*v1.y; \ + res.y = v0.z*v1.x - v0.x*v1.z; \ + res.z = v0.x*v1.y - v0.y*v1.x; + +#define LENGTH2(v) (v.x*v.x + v.y*v.y + v.z*v.z) + +#define LENGTH(v) (sqrt(LENGTH2(v))) + +inline float dot(const float3& v0, const float3& v1) {return DOT(v0, v1);} +inline float dot(const float3& v0, const uniform float3& v1) {return DOT(v0, v1);} +inline float dot(const uniform float3& v0, const float3& v1) {return DOT(v0, v1);} +inline uniform float dot(const uniform float3& v0, const uniform float3& v1) {return DOT(v0, v1);} + +inline float3 cross(const float3& v0, const float3& v1) {float3 res; CROSS(v0, v1, res); return res;} +inline float3 cross(const float3& v0, const uniform float3& v1) {float3 res; CROSS(v0, v1, res); return res;} +inline float3 cross(const uniform float3& v0, const float3& v1) {float3 res; CROSS(v0, v1, res); return res;} +inline uniform float3 cross(const uniform float3& v0, const uniform float3& v1) {uniform float3 res; CROSS(v0, v1, res); return res;} + +inline float length2(const float3& v) {return LENGTH2(v);} +inline uniform float length2(const uniform float3& v) {return LENGTH2(v);} + +inline float length(const float3& v) {return LENGTH(v);} +inline uniform float length(const uniform float3& v) {return LENGTH(v);} + +// Aggregate Structures + +export struct SoA3D { + uniform float* uniform x; + uniform float* uniform y; + uniform float* uniform z; +}; + +export struct MeshProxy { + uniform unsigned int ns; + uniform unsigned int nc; + uniform unsigned int nb_panels; + uniform SoA3D v; // vertices + uniform SoA3D colloc; // collocation points + uniform SoA3D normal; // normals +}; + +// Bio-savart Kernel +#define RCUT 1e-10f // highly tuned value +#define PI_f 3.14159274f + +inline float3 kernel_biosavart(float3& colloc, const uniform float3& vertex1, const uniform float3& vertex2, const uniform float& sigma) { + uniform float3 r0 = vertex2 - vertex1; + float3 r1 = colloc - vertex1; + float3 r2 = colloc - vertex2; + // Katz Plotkin, Low speed Aero | Eq 10.115 + float3 r1r2cross = cross(r1, r2); + float r1_norm = length(r1); + float r2_norm = length(r2); + float square = length2(r1r2cross); + if ((r1_norm #include #include @@ -65,7 +67,7 @@ int main(int argc, char **argv) { for (auto alpha : alphas) { FlowData flow(alpha, 0.0f, 1.0f, 1.0f); auto coeffs = solver.solve(flow); - std::printf(">>> Alpha: %.1f | CL = %.6f CD = %.6f CMx = %.6f CMy = %.6f CMz = %.6f\n", flow.alpha, coeffs.cl, coeffs.cd, coeffs.cm.x, coeffs.cm.y, coeffs.cm.z); + std::printf(">>> Alpha: %.1f | CL = %.6f CD = %.6f CMx = %.6f CMy = %.6f CMz = %.6f\n", vlm::to_degrees(flow.alpha), coeffs.cl, coeffs.cd, coeffs.cm.x, coeffs.cm.y, coeffs.cm.z); } // Pause for memory reading diff --git a/vlm/include/vlm_executor.hpp b/vlm/include/vlm_executor.hpp index c6e7e2d..64b51ca 100644 --- a/vlm/include/vlm_executor.hpp +++ b/vlm/include/vlm_executor.hpp @@ -1,7 +1,6 @@ // This header should not be included by another header file, only by source files. #pragma once -#include #include // includes , , // Taskflow executor singleton wrapper (not thread-safe by design) diff --git a/vlm/include/vlm_types.hpp b/vlm/include/vlm_types.hpp index 0098f50..8de7396 100644 --- a/vlm/include/vlm_types.hpp +++ b/vlm/include/vlm_types.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "linalg.h" #include "tinyconfig.hpp" @@ -27,6 +28,8 @@ using f64 = double; constexpr f64 PI_d = 3.14159265358979; constexpr f32 PI_f = 3.141593f; +constexpr f64 EPS_d = std::numeric_limits::epsilon(); +constexpr f32 EPS_f = std::numeric_limits::epsilon(); // Optimized n power function that uses template folding optimisations // to generate a series of mul instructions diff --git a/vlm/include/vlm_utils.hpp b/vlm/include/vlm_utils.hpp index f3a1376..9604cc6 100644 --- a/vlm/include/vlm_utils.hpp +++ b/vlm/include/vlm_utils.hpp @@ -9,4 +9,6 @@ linalg::alias::float3 compute_freestream(const f32 u_inf, const f32 alpha, const linalg::alias::float3 compute_lift_axis(const linalg::alias::float3& freestream_); linalg::alias::float3 compute_stream_axis(const linalg::alias::float3& freestream_); +f32 to_degrees(f32 radians); +f32 to_radians(f32 degrees); } \ No newline at end of file diff --git a/vlm/src/vlm.cpp b/vlm/src/vlm.cpp index 7d22a5f..f82e599 100644 --- a/vlm/src/vlm.cpp +++ b/vlm/src/vlm.cpp @@ -14,7 +14,7 @@ using namespace vlm; Solver::Solver(const tiny::Config& cfg) { - std::string backend_name = cfg().section("solver").get("backend", "avx2"); + std::string backend_name = cfg().section("solver").get("backend", "cpu"); mesh = std::make_unique(cfg); backend = create_backend(backend_name, *mesh); } @@ -55,7 +55,7 @@ AeroCoefficients NonLinearVLM::solve(const FlowData& flow, const Database& db) { // parallel reduce // loop over the chordwise strips and apply Van Dam algorithm { - const tiny::Timer timer("Strip correction"); + const tiny::ScopedTimer timer("Strip correction"); for (u32 j = 0; j < mesh->ns; j++) { const f32 strip_area = mesh->panels_area(0, j, mesh->nc, 1); const FlowData strip_flow = {strip_alphas[j], flow.beta, flow.u_inf, flow.rho}; diff --git a/vlm/src/vlm_backend.cpp b/vlm/src/vlm_backend.cpp index af16f5f..195e943 100644 --- a/vlm/src/vlm_backend.cpp +++ b/vlm/src/vlm_backend.cpp @@ -4,8 +4,8 @@ #include -#ifdef VLM_AVX2 -#include "vlm_backend_avx2.hpp" +#ifdef VLM_CPU +#include "vlm_backend_cpu.hpp" #endif #ifdef VLM_CUDA #include "vlm_backend_cuda.hpp" @@ -27,11 +27,11 @@ f32 Backend::compute_coefficient_cd(const FlowData& flow) { std::unique_ptr vlm::create_backend(const std::string& backend_name, Mesh& mesh) { tiny::CPUID cpuid; - cpuid.print_info(); + //cpuid.print_info(); - #ifdef VLM_AVX2 - if (backend_name == "avx2" && cpuid.has("AVX2")) { - return std::make_unique(mesh); + #ifdef VLM_CPU + if (backend_name == "cpu") { + return std::make_unique(mesh); } #endif #ifdef VLM_CUDA diff --git a/vlm/src/vlm_utils.cpp b/vlm/src/vlm_utils.cpp index 55c327b..38393e9 100644 --- a/vlm/src/vlm_utils.cpp +++ b/vlm/src/vlm_utils.cpp @@ -1,5 +1,7 @@ #include "vlm_utils.hpp" +using namespace vlm; + linalg::alias::float3 vlm::compute_freestream(const f32 u_inf, const f32 alpha, const f32 beta) { return linalg::alias::float3{ u_inf * std::cos(alpha) * std::cos(beta), @@ -14,4 +16,12 @@ linalg::alias::float3 vlm::compute_lift_axis(const linalg::alias::float3& freest linalg::alias::float3 vlm::compute_stream_axis(const linalg::alias::float3& freestream_) { return linalg::normalize(freestream_); +} + +f32 vlm::to_degrees(f32 radians) { + return radians * 180.0f / PI_f; +} + +f32 vlm::to_radians(f32 degrees) { + return degrees * PI_f / 180.0f; } \ No newline at end of file diff --git a/xmake.lua b/xmake.lua index 6e05785..9b6f440 100644 --- a/xmake.lua +++ b/xmake.lua @@ -18,7 +18,7 @@ set_languages("c++17", "c99") set_runtimes("MD") -- msvc runtime library (MD/MT/MDd/MTd) -- Define backends and helper functions -backends = {"cuda", "avx2"} +backends = {"cuda", "cpu"} backend_includes = function(name) return "vlm/backends/" .. name .. "/xmake.lua" end backend_defines = function(name) return "VLM_" .. name:upper() end backend_deps = function(name) return "backend-" .. name end @@ -31,7 +31,7 @@ add_includedirs("headeronly", {public = true}) -- must be set before options for _,name in ipairs(backends) do -- Create option option(backend_option(name)) - set_default(name == "avx2") -- set default to true for avx2, false otherwise + set_default(name == "cpu") -- set default to true for cpu, false otherwise set_showmenu(true) option_end() -- Add option includes From 655d7e756f56f462d5bcad25b4f183775834f588 Mon Sep 17 00:00:00 2001 From: Samuel Ayala Date: Tue, 13 Feb 2024 22:19:27 -0500 Subject: [PATCH 2/6] fixed ispc & removed AVX2 kernels still need to implement the trefftz cd kernel rip avx2 --- mesh/infinite_rectangular_2x8.x | 23 + tests/test_non_linear.cpp | 4 +- vlm/backends/cpu/include/vlm_backend_cpu.hpp | 7 +- .../cpu/include/vlm_backend_cpu_kernels.hpp | 3 +- vlm/backends/cpu/src/vlm_backend_cpu.cpp | 456 +++++------------- .../cpu/src/vlm_backend_cpu_kernels.cpp | 4 +- .../cpu/src/vlm_backend_cpu_kernels.ispc | 23 +- vlm/backends/cpu/xmake.lua | 2 +- .../cuda/include/vlm_backend_cuda.hpp | 2 +- vlm/dev/main.cpp | 2 +- vlm/include/vlm_mesh.hpp | 1 + vlm/src/vlm_backend.cpp | 2 +- vlm/src/vlm_mesh.cpp | 7 + 13 files changed, 184 insertions(+), 352 deletions(-) create mode 100644 mesh/infinite_rectangular_2x8.x diff --git a/mesh/infinite_rectangular_2x8.x b/mesh/infinite_rectangular_2x8.x new file mode 100644 index 0000000..c7e459c --- /dev/null +++ b/mesh/infinite_rectangular_2x8.x @@ -0,0 +1,23 @@ + 1 + 3 9 1 + 0.0000000000000000 0.50000000000000000 1.0000000000000000 0.0000000000000000 + 0.50000000000000000 1.0000000000000000 0.0000000000000000 0.50000000000000000 + 1.0000000000000000 0.0000000000000000 0.50000000000000000 1.0000000000000000 + 0.0000000000000000 0.50000000000000000 1.0000000000000000 0.0000000000000000 + 0.50000000000000000 1.0000000000000000 0.0000000000000000 0.50000000000000000 + 1.0000000000000000 0.0000000000000000 0.50000000000000000 1.0000000000000000 + 0.0000000000000000 0.50000000000000000 1.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 3750.0000000000000 + 3750.0000000000005 3750.0000000000000 7500.0000000000000 7500.0000000000009 + 7500.0000000000000 11250.000000000000 11250.000000000000 11250.000000000000 + 15000.000000000000 15000.000000000002 15000.000000000000 18750.000000000000 + 18750.000000000000 18750.000000000000 17500.000000000000 17500.000000000000 + 17500.000000000000 13750.000000000000 13750.000000000000 13750.000000000000 + 10000.000000000000 10000.000000000000 10000.000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 diff --git a/tests/test_non_linear.cpp b/tests/test_non_linear.cpp index 6348fdb..240ccc6 100644 --- a/tests/test_non_linear.cpp +++ b/tests/test_non_linear.cpp @@ -64,8 +64,8 @@ int main(int argc, char** argv) { const std::vector backends = {"cpu"}; std::vector>> lift_curves; lift_curves.emplace_back(std::make_pair("spallart1", std::make_unique(1.2f, 0.28f, 0.02f, 2.f*PI_f, 2.f*PI_f))); - lift_curves.emplace_back(std::make_pair("spallart2", std::make_unique(0.72f, 0.28f, 0.04f, 2.f*PI_f, 1.5f*PI_f))); - lift_curves.emplace_back(std::make_pair("polar", std::make_unique())); + // lift_curves.emplace_back(std::make_pair("spallart2", std::make_unique(0.72f, 0.28f, 0.04f, 2.f*PI_f, 1.5f*PI_f))); + // lift_curves.emplace_back(std::make_pair("polar", std::make_unique())); std::vector test_alphas = {0, 5, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; std::transform(test_alphas.begin(), test_alphas.end(), test_alphas.begin(), to_radians); diff --git a/vlm/backends/cpu/include/vlm_backend_cpu.hpp b/vlm/backends/cpu/include/vlm_backend_cpu.hpp index 81b0227..28d9ca2 100644 --- a/vlm/backends/cpu/include/vlm_backend_cpu.hpp +++ b/vlm/backends/cpu/include/vlm_backend_cpu.hpp @@ -5,16 +5,17 @@ namespace vlm { -class BackendGeneric : public Backend { +class BackendCPU : public Backend { public: std::vector lhs; std::vector rhs; std::vector ipiv; std::vector gamma; std::vector delta_gamma; + std::vector trefftz_buffer; - BackendGeneric(Mesh& mesh); - ~BackendGeneric(); + BackendCPU(Mesh& mesh); + ~BackendCPU(); void reset() override; void compute_lhs(const FlowData& flow) override; void compute_rhs(const FlowData& flow) override; diff --git a/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp b/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp index 2b8be98..7ff4093 100644 --- a/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp +++ b/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp @@ -9,4 +9,5 @@ void kernel_influence( f32 normalx[], f32 normaly[], f32 normalz[], u32 ia, u32 lidx, f32 sigma_p4 ); -} \ No newline at end of file +} + diff --git a/vlm/backends/cpu/src/vlm_backend_cpu.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp index a9d8444..33aea8c 100644 --- a/vlm/backends/cpu/src/vlm_backend_cpu.cpp +++ b/vlm/backends/cpu/src/vlm_backend_cpu.cpp @@ -3,7 +3,7 @@ #include "vlm_backend_cpu_kernels_ispc.h" #include "linalg.h" -#include "simpletimer.hpp" +#include "tinytimer.hpp" #include "vlm_mesh.hpp" #include "vlm_data.hpp" #include "vlm_utils.hpp" @@ -22,23 +22,24 @@ using namespace vlm; -BackendGeneric::~BackendGeneric() = default; // Destructor definition +BackendCPU::~BackendCPU() = default; // Destructor definition -BackendGeneric::BackendGeneric(Mesh& mesh) : Backend(mesh) { +BackendCPU::BackendCPU(Mesh& mesh) : Backend(mesh) { lhs.resize((u64)mesh.nb_panels_wing() * (u64)mesh.nb_panels_wing()); rhs.resize(mesh.nb_panels_wing()); ipiv.resize(mesh.nb_panels_wing()); gamma.resize(mesh.nb_panels_wing()); delta_gamma.resize(mesh.nb_panels_wing()); + trefftz_buffer.resize(mesh.ns); } -void BackendGeneric::reset() { +void BackendCPU::reset() { std::fill(gamma.begin(), gamma.end(), 0.0f); std::fill(lhs.begin(), lhs.end(), 0.0f); std::fill(rhs.begin(), rhs.end(), 0.0f); } -void BackendGeneric::compute_delta_gamma() { +void BackendCPU::compute_delta_gamma() { std::copy(gamma.begin(), gamma.begin()+mesh.ns, delta_gamma.begin()); // note: this is efficient as the memory is contiguous @@ -49,290 +50,16 @@ void BackendGeneric::compute_delta_gamma() { } } -inline void kernel_biosavart(f32& vx, f32& vy, f32& vz, f32& x, f32& y, f32& z, f32& x1, f32& y1, f32& z1, f32& x2, f32& y2, f32& z2) { - static const f32 rcut = 1.0e-10f; - vx = 0.0f; - vy = 0.0f; - vz = 0.0f; - - // Katz Plotkin, Low speed Aero | Eq 10.115 - const f32 r1r2x = (y-y1)*(z-z2) - (z-z1)*(y-y2); - const f32 r1r2y = -((x-x1)*(z-z2) - (z-z1)*(x-x2)); - const f32 r1r2z = (x-x1)*(y-y2) - (y-y1)*(x-x2); - - const f32 r1 = std::sqrt(pow<2>(x-x1)+pow<2>(y-y1)+pow<2>(z-z1)); - const f32 r2 = std::sqrt(pow<2>(x-x2)+pow<2>(y-y2)+pow<2>(z-z2)); - const f32 square = pow<2>(r1r2x) + pow<2>(r1r2y) + pow<2>(r1r2z); - if ((r1 -inline void macro_kernel_remainder_scalar(Mesh& m, std::vector& lhs, u32 ia, u32 lidx) { - // quick return - const u32 remainder = m.nb_panels_wing() % 8; - if (remainder == 0) return; - - const u32 v0 = lidx + lidx / m.ns; - const u32 v1 = v0 + 1; - const u32 v3 = v0 + m.ns+1; - const u32 v2 = v3 + 1; - - f32 v0x = m.v.x[v0]; - f32 v0y = m.v.y[v0]; - f32 v0z = m.v.z[v0]; - f32 v1x = m.v.x[v1]; - f32 v1y = m.v.y[v1]; - f32 v1z = m.v.z[v1]; - f32 v2x = m.v.x[v2]; - f32 v2y = m.v.y[v2]; - f32 v2z = m.v.z[v2]; - f32 v3x = m.v.x[v3]; - f32 v3y = m.v.y[v3]; - f32 v3z = m.v.z[v3]; - - for (u32 ia2 = m.nb_panels_wing() - remainder; ia2 < m.nb_panels_wing(); ia2++) { - const f32 colloc_x = m.colloc.x[ia2]; - const f32 colloc_y = m.colloc.y[ia2]; - const f32 colloc_z = m.colloc.z[ia2]; - - // 3 regs to store induced velocity - f32 inf_x = 0.0f; - f32 inf_y = 0.0f; - f32 inf_z = 0.0f; - kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v0x, v0y, v0z, v1x, v1y, v1z); - kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v1x, v1y, v1z, v2x, v2y, v2z); - kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v2x, v2y, v2z, v3x, v3y, v3z); - kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v3x, v3y, v3z, v0x, v0y, v0z); - f32 nx = m.normal.x[ia2]; - f32 ny = m.normal.y[ia2]; - f32 nz = m.normal.z[ia2]; - f32 ring_inf = inf_x * nx + inf_y * ny + inf_z * nz; - // store in col major order - if (Overwrite) { - lhs[ia * m.nb_panels_wing() + ia2] = ring_inf; - } else { - lhs[ia * m.nb_panels_wing() + ia2] += ring_inf; - } - } -} - -template -inline void macro_kernel_scalar(Mesh& m, std::vector& lhs, u32 ia, u32 lidx) { - const u32 v0 = lidx + lidx / m.ns; - const u32 v1 = v0 + 1; - const u32 v3 = v0 + m.ns+1; - const u32 v2 = v3 + 1; - - f32 v0x = m.v.x[v0]; - f32 v0y = m.v.y[v0]; - f32 v0z = m.v.z[v0]; - f32 v1x = m.v.x[v1]; - f32 v1y = m.v.y[v1]; - f32 v1z = m.v.z[v1]; - f32 v2x = m.v.x[v2]; - f32 v2y = m.v.y[v2]; - f32 v2z = m.v.z[v2]; - f32 v3x = m.v.x[v3]; - f32 v3y = m.v.y[v3]; - f32 v3z = m.v.z[v3]; - - for (u32 ia2 = 0; ia2 < m.nb_panels_wing(); ia2++) { - const f32 colloc_x = m.colloc.x[ia2]; - const f32 colloc_y = m.colloc.y[ia2]; - const f32 colloc_z = m.colloc.z[ia2]; - - // 3 regs to store induced velocity - f32 inf_x = 0.0f; - f32 inf_y = 0.0f; - f32 inf_z = 0.0f; - kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v0x, v0y, v0z, v1x, v1y, v1z); - kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v1x, v1y, v1z, v2x, v2y, v2z); - kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v2x, v2y, v2z, v3x, v3y, v3z); - kernel_symmetry(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v3x, v3y, v3z, v0x, v0y, v0z); - f32 nx = m.normal.x[ia2]; - f32 ny = m.normal.y[ia2]; - f32 nz = m.normal.z[ia2]; - f32 ring_inf = inf_x * nx + inf_y * ny + inf_z * nz; - // store in col major order - if (Overwrite) { - lhs[ia * m.nb_panels_wing() + ia2] = ring_inf; - } else { - lhs[ia * m.nb_panels_wing() + ia2] += ring_inf; - } - } -} - -inline void micro_kernel_influence_avx2(__m256& vx, __m256& vy, __m256& vz, __m256& x, __m256& y, __m256& z, __m256& x1, __m256& y1, __m256& z1, __m256& x2, __m256& y2, __m256& z2, f32 sigma_p4) { - static const __m256 threshold = _mm256_set1_ps(1e-10f); - static const __m256 four_pi = _mm256_set1_ps(4.0f * PI_f); - static const __m256 zero = _mm256_set1_ps(0.0f); - - vx = zero; - vy = zero; - vz = zero; - // define vectors - __m256 r1x = _mm256_sub_ps(x, x1); - __m256 r1y = _mm256_sub_ps(y, y1); - __m256 r1z = _mm256_sub_ps(z, z1); - __m256 r2x = _mm256_sub_ps(x, x2); - __m256 r2y = _mm256_sub_ps(y, y2); - __m256 r2z = _mm256_sub_ps(z, z2); - - // crossproduct - // (v0y*v1z - v0z*v1y); - // (v0z*v1x - v0x*v1z); - // (v0x*v1y - v0y*v1x); - __m256 r1_x_r2_x = _mm256_fmsub_ps(r1y, r2z, _mm256_mul_ps(r1z, r2y)); - __m256 r1_x_r2_y = _mm256_fmsub_ps(r1z, r2x, _mm256_mul_ps(r1x, r2z)); - __m256 r1_x_r2_z = _mm256_fmsub_ps(r1x, r2y, _mm256_mul_ps(r1y, r2x)); - - // magnitude & mag squared of crossproduct - __m256 r1_x_r2_mag_p2 = _mm256_fmadd_ps(r1_x_r2_x, r1_x_r2_x, _mm256_fmadd_ps(r1_x_r2_y, r1_x_r2_y, _mm256_mul_ps(r1_x_r2_z, r1_x_r2_z))); - __m256 r1_mag = _mm256_sqrt_ps(_mm256_fmadd_ps(r1x, r1x, _mm256_fmadd_ps(r1y, r1y, _mm256_mul_ps(r1z, r1z)))); - __m256 r2_mag = _mm256_sqrt_ps(_mm256_fmadd_ps(r2x, r2x, _mm256_fmadd_ps(r2y, r2y, _mm256_mul_ps(r2z, r2z)))); - - // vector from point 1 to point 2 of segment - __m256 r0x = _mm256_sub_ps(x2, x1); - __m256 r0y = _mm256_sub_ps(y2, y1); - __m256 r0z = _mm256_sub_ps(z2, z1); - - // dot product r0.r1 and r0.r2 - __m256 r0_dot_r1 = _mm256_fmadd_ps(r0x, r1x, _mm256_fmadd_ps(r0y, r1y, _mm256_mul_ps(r0z, r1z))); - __m256 r0_dot_r2 = _mm256_fmadd_ps(r0x, r2x, _mm256_fmadd_ps(r0y, r2y, _mm256_mul_ps(r0z, r2z))); - - __m256 numerator = _mm256_fmsub_ps(r0_dot_r1, r2_mag, _mm256_mul_ps(r0_dot_r2, r1_mag)); - - __m256 four_pi_r1_mag_r2_mag = _mm256_mul_ps(four_pi, _mm256_mul_ps(r1_mag, r2_mag)); // 4*pi*|r1|*|r2| - __m256 denominator; - - // if (sigma_p4 < 1e-6f) { - // // Singular Bio-Savart - // denominator = _mm256_mul_ps(four_pi_r1_mag_r2_mag, r1_x_r2_mag_p2); - // } else { - // Vatistas smoothing kernel (n=2) (https://doi.org/10.3390/fluids7020081) - __m256 r1_x_r2_mag_p4 = _mm256_mul_ps(r1_x_r2_mag_p2, r1_x_r2_mag_p2); // ^2n - __m256 r0_mag_p2 = _mm256_fmadd_ps(r0x, r0x, _mm256_fmadd_ps(r0y, r0y, _mm256_mul_ps(r0z, r0z))); - __m256 r0_mag_p4 = _mm256_mul_ps(r0_mag_p2, r0_mag_p2); // ^2n - denominator = _mm256_mul_ps(four_pi_r1_mag_r2_mag, _mm256_sqrt_ps(_mm256_fmadd_ps(_mm256_set1_ps(sigma_p4), r0_mag_p4, r1_x_r2_mag_p4))); - //} - - __m256 coeff = _mm256_div_ps(numerator, denominator); - - // add the influence and blend with mask - // the masks should be done independently for optimal ILP but if compiler smart he can do it - __m256 mask = _mm256_cmp_ps(r1_mag, threshold, _CMP_LT_OS); - mask = _mm256_or_ps(mask, _mm256_cmp_ps(r2_mag, threshold, _CMP_LT_OS)); - mask = _mm256_or_ps(mask, _mm256_cmp_ps(r1_x_r2_mag_p2, threshold, _CMP_LT_OS)); - - vx = _mm256_blendv_ps(_mm256_mul_ps(r1_x_r2_x, coeff), zero, mask); - vy = _mm256_blendv_ps(_mm256_mul_ps(r1_x_r2_y, coeff), zero, mask); - vz = _mm256_blendv_ps(_mm256_mul_ps(r1_x_r2_z, coeff), zero, mask); -} - -inline void kernel_influence_avx2(__m256& inf_x, __m256& inf_y, __m256& inf_z, __m256 x, __m256 y, __m256 z, __m256 x1, __m256 y1, __m256 z1, __m256 x2, __m256 y2, __m256 z2, f32 sigma_p4) { - __m256 vx, vy, vz; - micro_kernel_influence_avx2(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2, sigma_p4); - inf_x = _mm256_add_ps(inf_x, vx); - inf_y = _mm256_add_ps(inf_y, vy); - inf_z = _mm256_add_ps(inf_z, vz); - y = _mm256_xor_ps(y, _mm256_set1_ps(-0.0f)); // wing symmetry - micro_kernel_influence_avx2(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2, sigma_p4); - inf_x = _mm256_add_ps(inf_x, vx); - inf_y = _mm256_sub_ps(inf_y, vy); // HERE IS SUB INSTEAD OF ADD ! - inf_z = _mm256_add_ps(inf_z, vz); -} - -// Fill a column of the LHS matrix (influence of a single panel on all the others) -template -inline void macro_kernel_avx2(Mesh& m, std::vector& lhs, u32 ia, u32 lidx, f32 sigma_p4) { - const u32 v0 = lidx + lidx / m.ns; - const u32 v1 = v0 + 1; - const u32 v3 = v0 + m.ns+1; - const u32 v2 = v3 + 1; - // in reality only load v1 & v2 and reuse data from previous v1 & v2 to be new v0 & v3 respectively - // 12 regs (6 loads + 6 reuse) -> these will get spilled once we get in the influence function - __m256 v0x = _mm256_broadcast_ss(&m.v.x[v0]); - __m256 v0y = _mm256_broadcast_ss(&m.v.y[v0]); - __m256 v0z = _mm256_broadcast_ss(&m.v.z[v0]); - __m256 v1x = _mm256_broadcast_ss(&m.v.x[v1]); - __m256 v1y = _mm256_broadcast_ss(&m.v.y[v1]); - __m256 v1z = _mm256_broadcast_ss(&m.v.z[v1]); - __m256 v2x = _mm256_broadcast_ss(&m.v.x[v2]); - __m256 v2y = _mm256_broadcast_ss(&m.v.y[v2]); - __m256 v2z = _mm256_broadcast_ss(&m.v.z[v2]); - __m256 v3x = _mm256_broadcast_ss(&m.v.x[v3]); - __m256 v3y = _mm256_broadcast_ss(&m.v.y[v3]); - __m256 v3z = _mm256_broadcast_ss(&m.v.z[v3]); - - for (u32 ia2 = 0; ia2 <= m.nb_panels_wing()-8; ia2+=8) { - // loads (3 regs) - __m256 colloc_x = _mm256_loadu_ps(&m.colloc.x[ia2]); - __m256 colloc_y = _mm256_loadu_ps(&m.colloc.y[ia2]); - __m256 colloc_z = _mm256_loadu_ps(&m.colloc.z[ia2]); - // 3 regs to store induced velocity - __m256 inf_x = _mm256_setzero_ps(); - __m256 inf_y = _mm256_setzero_ps(); - __m256 inf_z = _mm256_setzero_ps(); - - kernel_influence_avx2(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v0x, v0y, v0z, v1x, v1y, v1z, sigma_p4); - kernel_influence_avx2(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v1x, v1y, v1z, v2x, v2y, v2z, sigma_p4); - kernel_influence_avx2(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v2x, v2y, v2z, v3x, v3y, v3z, sigma_p4); - kernel_influence_avx2(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v3x, v3y, v3z, v0x, v0y, v0z, sigma_p4); - - // dot product - __m256 nx = _mm256_loadu_ps(&m.normal.x[ia2]); - __m256 ny = _mm256_loadu_ps(&m.normal.y[ia2]); - __m256 nz = _mm256_loadu_ps(&m.normal.z[ia2]); - __m256 ring_inf = _mm256_fmadd_ps(inf_x, nx, _mm256_fmadd_ps(inf_y, ny, _mm256_mul_ps(inf_z, nz))); - - // store in col major order - if (Overwrite) { - _mm256_storeu_ps(&lhs[ia * m.nb_panels_wing() + ia2], ring_inf); - } else { - __m256 lhs_ia = _mm256_loadu_ps(&lhs[ia * m.nb_panels_wing() + ia2]); - lhs_ia = _mm256_add_ps(lhs_ia, ring_inf); - _mm256_storeu_ps(&lhs[ia * m.nb_panels_wing() + ia2], lhs_ia); - } - if (ia == 0 && ia2 == 0) { - for (u32 iii = 0; iii < 8; iii++) { - std::printf("%f ", lhs[iii]); - } - std::printf("\n"); - } - } -} - -void BackendGeneric::compute_lhs(const FlowData& flow) { - SimpleTimer timer("LHS"); +void BackendCPU::compute_lhs(const FlowData& flow) { + tiny::ScopedTimer timer("LHS"); Mesh& m = mesh; + ispc::MeshProxy mesh_proxy = { - m.nc, m.ns, m.nb_panels_wing(), + m.ns, m.nc, m.nb_panels_wing(), {m.v.x.data(), m.v.y.data(), m.v.z.data()}, {m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data()}, {m.normal.x.data(), m.normal.y.data(), m.normal.z.data()} }; - // const f32 sigma_p4 = pow<4>(flow.sigma_vatistas); // Vatistas coeffcient (^2n with n=2) const u32 start_wing = 0; const u32 end_wing = (m.nc - 1) * m.ns; @@ -341,15 +68,6 @@ void BackendGeneric::compute_lhs(const FlowData& flow) { auto init = taskflow.placeholder(); auto wing_pass = taskflow.for_each_index(start_wing, end_wing, [&] (u32 i) { - // macro_kernel_avx2(m, lhs, i, i, flow.sigma_vatistas); - // macro_kernel_remainder_scalar(m, lhs, i, i); - // kernel_influence(m.nc, m.ns, - // lhs.data(), - // m.v.x.data(), m.v.y.data(), m.v.z.data(), - // m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data(), - // m.normal.x.data(), m.normal.y.data(), m.normal.z.data(), - // i, i, flow.sigma_vatistas); - //macro_kernel_scalar(m, lhs, i, i); ispc::kernel_influence(mesh_proxy, lhs.data(), i, i, flow.sigma_vatistas); }); @@ -360,14 +78,7 @@ void BackendGeneric::compute_lhs(const FlowData& flow) { auto wake_pass = taskflow.for_each_index(0u, m.ns, [&] (u32 j) { const u32 ia = (m.nc - 1) * m.ns + j; const u32 lidx = idx * m.ns + j; - kernel_influence(m.nc, m.ns, - lhs.data(), - m.v.x.data(), m.v.y.data(), m.v.z.data(), - m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data(), - m.normal.x.data(), m.normal.y.data(), m.normal.z.data(), - ia, lidx, flow.sigma_vatistas); - // macro_kernel_avx2(m, lhs, ia, lidx, flow.sigma_vatistas); - // macro_kernel_remainder_scalar(m, lhs, idx, idx); + ispc::kernel_influence(mesh_proxy, lhs.data(), ia, lidx, flow.sigma_vatistas); }); auto back = taskflow.emplace([&]{ idx++; @@ -390,15 +101,15 @@ void kernel_cpu_rhs(u32 n, const float normal_x[], const float normal_y[], const } } -void BackendGeneric::compute_rhs(const FlowData& flow) { - SimpleTimer timer("RHS"); +void BackendCPU::compute_rhs(const FlowData& flow) { + tiny::ScopedTimer timer("RHS"); const Mesh& m = mesh; kernel_cpu_rhs(m.nb_panels_wing(), m.normal.x.data(), m.normal.y.data(), m.normal.z.data(), flow.freestream.x, flow.freestream.y, flow.freestream.z, rhs.data()); } -void BackendGeneric::compute_rhs(const FlowData& flow, const std::vector& section_alphas) { - SimpleTimer timer("Rebuild RHS"); +void BackendCPU::compute_rhs(const FlowData& flow, const std::vector& section_alphas) { + tiny::ScopedTimer timer("Rebuild RHS"); assert(section_alphas.size() == mesh.ns); const Mesh& m = mesh; for (u32 i = 0; i < mesh.nc; i++) { @@ -410,24 +121,24 @@ void BackendGeneric::compute_rhs(const FlowData& flow, const std::vector& s } } -void BackendGeneric::lu_factor() { - SimpleTimer timer("Factor"); +void BackendCPU::lu_factor() { + tiny::ScopedTimer timer("Factor"); const u32 n = mesh.nb_panels_wing(); LAPACKE_sgetrf(LAPACK_COL_MAJOR, n, n, lhs.data(), n, ipiv.data()); } -void BackendGeneric::lu_solve() { - SimpleTimer timer("Solve"); +void BackendCPU::lu_solve() { + tiny::ScopedTimer timer("Solve"); const u32 n = mesh.nb_panels_wing(); std::copy(rhs.begin(), rhs.end(), gamma.begin()); LAPACKE_sgetrs(LAPACK_COL_MAJOR, 'N', n, 1, lhs.data(), n, ipiv.data(), gamma.data(), n); } -f32 BackendGeneric::compute_coefficient_cl(const FlowData& flow, const f32 area, +f32 BackendCPU::compute_coefficient_cl(const FlowData& flow, const f32 area, const u32 j, const u32 n) { assert(n > 0); - assert(j >= 0 and j+n <= mesh.ns); + assert(j >= 0 && j+n <= mesh.ns); f32 cl = 0.0f; @@ -448,7 +159,7 @@ f32 BackendGeneric::compute_coefficient_cl(const FlowData& flow, const f32 area, return cl; } -linalg::alias::float3 BackendGeneric::compute_coefficient_cm( +linalg::alias::float3 BackendCPU::compute_coefficient_cm( const FlowData& flow, const f32 area, const f32 chord, @@ -456,7 +167,7 @@ linalg::alias::float3 BackendGeneric::compute_coefficient_cm( const u32 n) { assert(n > 0); - assert(j >= 0 and j+n <= mesh.ns); + assert(j >= 0 && j+n <= mesh.ns); linalg::alias::float3 cm(0.f, 0.f, 0.f); for (u32 u = 0; u < mesh.nc; u++) { @@ -477,46 +188,121 @@ linalg::alias::float3 BackendGeneric::compute_coefficient_cm( return cm; } -f32 BackendGeneric::compute_coefficient_cd( +inline void kernel_biosavart(f32& vx, f32& vy, f32& vz, f32& x, f32& y, f32& z, f32& x1, f32& y1, f32& z1, f32& x2, f32& y2, f32& z2) { + static const f32 rcut = 1.0e-10f; + vx = 0.0f; + vy = 0.0f; + vz = 0.0f; + + // Katz Plotkin, Low speed Aero | Eq 10.115 + const f32 r1r2x = (y-y1)*(z-z2) - (z-z1)*(y-y2); + const f32 r1r2y = -((x-x1)*(z-z2) - (z-z1)*(x-x2)); + const f32 r1r2z = (x-x1)*(y-y2) - (y-y1)*(x-x2); + + const f32 r1 = std::sqrt(pow<2>(x-x1)+pow<2>(y-y1)+pow<2>(z-z1)); + const f32 r2 = std::sqrt(pow<2>(x-x2)+pow<2>(y-y2)+pow<2>(z-z2)); + const f32 square = pow<2>(r1r2x) + pow<2>(r1r2y) + pow<2>(r1r2z); + if ((r1 0); - assert(j >= 0 and j+n <= mesh.ns); - + assert(j >= 0 && j+n <= mesh.ns); + std::fill(trefftz_buffer.begin(), trefftz_buffer.end(), 0.0f); + f32 cd = 0.0f; // Drag coefficent computed using Trefftz plane const u32 begin = j + mesh.nb_panels_wing(); const u32 end = begin + n; // parallel for for (u32 ia = begin; ia < end; ia++) { - const f32 colloc_x = mesh.colloc.x[ia]; - const f32 colloc_y = mesh.colloc.y[ia]; - const f32 colloc_z = mesh.colloc.z[ia]; - linalg::alias::float3 inf(0.f, 0.f, 0.f); + const u32 v0 = ia + ia / mesh.ns; + const u32 v1 = v0 + 1; + const u32 v3 = v0 + n+1; + const u32 v2 = v3 + 1; + + const float3 vertex0{mesh.v.x[v0], mesh.v.y[v0], mesh.v.z[v0]}; + const float3 vertex1{mesh.v.x[v1], mesh.v.y[v1], mesh.v.z[v1]}; + const float3 vertex2{mesh.v.x[v2], mesh.v.y[v2], mesh.v.z[v2]}; + const float3 vertex3{mesh.v.x[v3], mesh.v.y[v3], mesh.v.z[v3]}; + + const f32 gammaw = gamma[ia - mesh.ns]; + for (u32 ia2 = begin; ia2 < end; ia2++) { + const float3 colloc(mesh.colloc.x[ia2], mesh.colloc.y[ia2], mesh.colloc.z[ia2]); + const float3 normal(mesh.normal.x[ia2], mesh.normal.y[ia2], mesh.normal.z[ia2]); linalg::alias::float3 inf2(0.f, 0.f, 0.f); - linalg::alias::float3 v0 = mesh.get_v0(ia2); - linalg::alias::float3 v1 = mesh.get_v1(ia2); - linalg::alias::float3 v2 = mesh.get_v2(ia2); - linalg::alias::float3 v3 = mesh.get_v3(ia2); // Influence from the streamwise vortex lines - kernel_symmetry(inf2.x, inf2.y, inf2.z, colloc_x, colloc_y, colloc_z, v1.x, v1.y, v1.z, v2.x, v2.y, v2.z); - kernel_symmetry(inf2.x, inf2.y, inf2.z, colloc_x, colloc_y, colloc_z, v3.x, v3.y, v3.z, v0.x, v0.y, v0.z); - f32 gamma_w = gamma[(mesh.nc-1)*mesh.ns + ia2 % mesh.ns]; + kernel_symmetry(inf2, colloc, vertex1, vertex2, flow.sigma_vatistas); + kernel_symmetry(inf2, colloc, vertex3, vertex0, flow.sigma_vatistas); // This is the induced velocity calculated with the vortex (gamma) calculated earlier (according to kutta condition) - inf += gamma_w * inf2; + trefftz_buffer[ia2 - begin] += gammaw * linalg::dot(inf2, normal); } - const linalg::alias::float3 normal{mesh.normal.x[ia], mesh.normal.y[ia], mesh.normal.z[ia]}; - const f32 w_ind = linalg::dot(inf, normal); - const u32 col = ia % mesh.ns; - linalg::alias::float3 v0 = mesh.get_v0(ia); - linalg::alias::float3 v1 = mesh.get_v1(ia); - const f32 dl = linalg::length(v1 - v0); - cd -= 0.5f * flow.rho * gamma[(mesh.nc-1)*mesh.ns + col] * w_ind * dl; } - cd /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area; + + for (u32 i = 0; i < mesh.ns; i++) { + const u32 li = (mesh.nc-1) * mesh.ns + i; + const f32 dl = mesh.strip_width(i); + cd -= gamma[li] * trefftz_buffer[i] * dl; // used to have 0.5f * flow.rho + } + cd /= linalg::length2(flow.freestream) * area; return cd; } \ No newline at end of file diff --git a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp index af2a0a7..6d6e8d1 100644 --- a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp +++ b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp @@ -16,7 +16,9 @@ inline float3 kernel_biosavart(const float3& colloc, const float3& vertex1, cons const f32 r1_norm = linalg::length(r1); const f32 r2_norm = linalg::length(r2); const f32 square = linalg::length2(r1r2cross); - if ((r1_norm alphas = cfg().section("solver").get_vector("alphas", {0.0f}); std::transform(alphas.begin(), alphas.end(), alphas.begin(), diff --git a/vlm/include/vlm_mesh.hpp b/vlm/include/vlm_mesh.hpp index 48ff59d..e75c450 100644 --- a/vlm/include/vlm_mesh.hpp +++ b/vlm/include/vlm_mesh.hpp @@ -56,6 +56,7 @@ class Mesh { f32 panels_area(const u32 i, const u32 j, const u32 m, const u32 n) const; f32 panels_area_xy(const u32 i, const u32 j, const u32 m, const u32 n) const; f32 panel_width_y(const u32 i, const u32 j) const; + f32 strip_width(const u32 j) const; f32 chord_length(const u32 j) const; f32 chord_mean(const u32 j, const u32 n) const; diff --git a/vlm/src/vlm_backend.cpp b/vlm/src/vlm_backend.cpp index 195e943..a905638 100644 --- a/vlm/src/vlm_backend.cpp +++ b/vlm/src/vlm_backend.cpp @@ -31,7 +31,7 @@ std::unique_ptr vlm::create_backend(const std::string& backend_name, Me #ifdef VLM_CPU if (backend_name == "cpu") { - return std::make_unique(mesh); + return std::make_unique(mesh); } #endif #ifdef VLM_CUDA diff --git a/vlm/src/vlm_mesh.cpp b/vlm/src/vlm_mesh.cpp index fbd234f..87d7c85 100644 --- a/vlm/src/vlm_mesh.cpp +++ b/vlm/src/vlm_mesh.cpp @@ -149,6 +149,13 @@ f32 Mesh::panel_width_y(const u32 i, const u32 j) const { return v.y[j + 1 + i * ld] - v.y[j + i * ld]; } +f32 Mesh::strip_width(const u32 j) const { + assert(j < ns); + const linalg::alias::float3 v0 = get_v0(j); + const linalg::alias::float3 v1 = get_v1(j); + return linalg::length(v1 - v0); +} + /// @brief Computes the chord length of a chordwise segment /// @details Since the mesh follows the camber line, the chord length is computed /// as the distance between the first and last vertex of a chordwise segment From 1f33f32a7d3cc1ec20b796b8b38c777fe8f3aa8b Mon Sep 17 00:00:00 2001 From: Samuel Ayala Date: Wed, 14 Feb 2024 16:24:18 -0500 Subject: [PATCH 3/6] add ispc cd trefftz kernel --- headeronly/tinytimer.hpp | 12 +- vlm/backends/cpu/src/vlm_backend_cpu.cpp | 184 +++++------------- .../cpu/src/vlm_backend_cpu_kernels.ispc | 71 ++++++- 3 files changed, 121 insertions(+), 146 deletions(-) diff --git a/headeronly/tinytimer.hpp b/headeronly/tinytimer.hpp index cdeac87..9b12caf 100644 --- a/headeronly/tinytimer.hpp +++ b/headeronly/tinytimer.hpp @@ -5,8 +5,8 @@ namespace tiny { -constexpr static long long us_in_s = 1'000'000LL; -constexpr static long long us_in_ms = 1'000LL; +constexpr static double us_in_s = 1'000'000; +constexpr static double us_in_ms = 1'000; class ScopedTimer { public: @@ -14,15 +14,15 @@ class ScopedTimer { ~ScopedTimer() { const auto m_end = std::chrono::high_resolution_clock::now(); - const auto duration = static_cast(std::chrono::duration_cast(m_end - m_start).count()); + const auto duration = static_cast(std::chrono::duration_cast(m_end - m_start).count()); std::printf("%s: ", m_name); if (duration > us_in_s) { - std::printf("%lld s\n", duration / us_in_s); + std::printf("%.2f s\n", duration / us_in_s); } else if (duration > us_in_ms) { - std::printf("%lld ms\n", duration / us_in_ms); + std::printf("%.0f ms\n", duration / us_in_ms); } else { - std::printf("%lld us\n", duration); + std::printf("%.0f us\n", duration); } } private: diff --git a/vlm/backends/cpu/src/vlm_backend_cpu.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp index 33aea8c..acc261d 100644 --- a/vlm/backends/cpu/src/vlm_backend_cpu.cpp +++ b/vlm/backends/cpu/src/vlm_backend_cpu.cpp @@ -9,12 +9,9 @@ #include "vlm_utils.hpp" #include "vlm_executor.hpp" // includes taskflow/taskflow.hpp -#include -#include -#include -#include +#include // std::fill +#include // std::printf -#include #include #include @@ -135,29 +132,29 @@ void BackendCPU::lu_solve() { LAPACKE_sgetrs(LAPACK_COL_MAJOR, 'N', n, 1, lhs.data(), n, ipiv.data(), gamma.data(), n); } -f32 BackendCPU::compute_coefficient_cl(const FlowData& flow, const f32 area, - const u32 j, const u32 n) { - assert(n > 0); - assert(j >= 0 && j+n <= mesh.ns); +// f32 BackendCPU::compute_coefficient_cl(const FlowData& flow, const f32 area, +// const u32 j, const u32 n) { +// assert(n > 0); +// assert(j >= 0 && j+n <= mesh.ns); - f32 cl = 0.0f; - - for (u32 u = 0; u < mesh.nc; u++) { - for (u32 v = j; v < j + n; v++) { - const u32 li = u * mesh.ns + v; // linear index - const linalg::alias::float3 v0 = mesh.get_v0(li); - const linalg::alias::float3 v1 = mesh.get_v1(li); - // Leading edge vector pointing outward from wing root - const linalg::alias::float3 dl = v1 - v0; - // Distance from the center of leading edge to the reference point - const linalg::alias::float3 force = linalg::cross(flow.freestream, dl) * flow.rho * delta_gamma[li]; - cl += linalg::dot(force, flow.lift_axis); - } - } - cl /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area; - - return cl; -} +// f32 cl = 0.0f; + +// for (u32 u = 0; u < mesh.nc; u++) { +// for (u32 v = j; v < j + n; v++) { +// const u32 li = u * mesh.ns + v; // linear index +// const linalg::alias::float3 v0 = mesh.get_v0(li); +// const linalg::alias::float3 v1 = mesh.get_v1(li); +// // Leading edge vector pointing outward from wing root +// const linalg::alias::float3 dl = v1 - v0; +// // Distance from the center of leading edge to the reference point +// const linalg::alias::float3 force = linalg::cross(flow.freestream, dl) * flow.rho * delta_gamma[li]; +// cl += linalg::dot(force, flow.lift_axis); +// } +// } +// cl /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area; + +// return cl; +// } linalg::alias::float3 BackendCPU::compute_coefficient_cm( const FlowData& flow, @@ -188,75 +185,6 @@ linalg::alias::float3 BackendCPU::compute_coefficient_cm( return cm; } -inline void kernel_biosavart(f32& vx, f32& vy, f32& vz, f32& x, f32& y, f32& z, f32& x1, f32& y1, f32& z1, f32& x2, f32& y2, f32& z2) { - static const f32 rcut = 1.0e-10f; - vx = 0.0f; - vy = 0.0f; - vz = 0.0f; - - // Katz Plotkin, Low speed Aero | Eq 10.115 - const f32 r1r2x = (y-y1)*(z-z2) - (z-z1)*(y-y2); - const f32 r1r2y = -((x-x1)*(z-z2) - (z-z1)*(x-x2)); - const f32 r1r2z = (x-x1)*(y-y2) - (y-y1)*(x-x2); - - const f32 r1 = std::sqrt(pow<2>(x-x1)+pow<2>(y-y1)+pow<2>(z-z1)); - const f32 r2 = std::sqrt(pow<2>(x-x2)+pow<2>(y-y2)+pow<2>(z-z2)); - const f32 square = pow<2>(r1r2x) + pow<2>(r1r2y) + pow<2>(r1r2z); - if ((r1 0); assert(j >= 0 && j+n <= mesh.ns); std::fill(trefftz_buffer.begin(), trefftz_buffer.end(), 0.0f); - - f32 cd = 0.0f; - // Drag coefficent computed using Trefftz plane - const u32 begin = j + mesh.nb_panels_wing(); - const u32 end = begin + n; - // parallel for - for (u32 ia = begin; ia < end; ia++) { - const u32 v0 = ia + ia / mesh.ns; - const u32 v1 = v0 + 1; - const u32 v3 = v0 + n+1; - const u32 v2 = v3 + 1; - - const float3 vertex0{mesh.v.x[v0], mesh.v.y[v0], mesh.v.z[v0]}; - const float3 vertex1{mesh.v.x[v1], mesh.v.y[v1], mesh.v.z[v1]}; - const float3 vertex2{mesh.v.x[v2], mesh.v.y[v2], mesh.v.z[v2]}; - const float3 vertex3{mesh.v.x[v3], mesh.v.y[v3], mesh.v.z[v3]}; - - const f32 gammaw = gamma[ia - mesh.ns]; - - for (u32 ia2 = begin; ia2 < end; ia2++) { - const float3 colloc(mesh.colloc.x[ia2], mesh.colloc.y[ia2], mesh.colloc.z[ia2]); - const float3 normal(mesh.normal.x[ia2], mesh.normal.y[ia2], mesh.normal.z[ia2]); - linalg::alias::float3 inf2(0.f, 0.f, 0.f); - // Influence from the streamwise vortex lines - kernel_symmetry(inf2, colloc, vertex1, vertex2, flow.sigma_vatistas); - kernel_symmetry(inf2, colloc, vertex3, vertex0, flow.sigma_vatistas); - // This is the induced velocity calculated with the vortex (gamma) calculated earlier (according to kutta condition) - trefftz_buffer[ia2 - begin] += gammaw * linalg::dot(inf2, normal); - } - } - for (u32 i = 0; i < mesh.ns; i++) { - const u32 li = (mesh.nc-1) * mesh.ns + i; - const f32 dl = mesh.strip_width(i); - cd -= gamma[li] * trefftz_buffer[i] * dl; // used to have 0.5f * flow.rho - } + Mesh& m = mesh; + ispc::MeshProxy mesh_proxy = { + m.ns, m.nc, m.nb_panels_wing(), + {m.v.x.data(), m.v.y.data(), m.v.z.data()}, + {m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data()}, + {m.normal.x.data(), m.normal.y.data(), m.normal.z.data()} + }; + f32 cd = ispc::kernel_trefftz_cd(mesh_proxy, gamma.data(), trefftz_buffer.data(), j, n, flow.sigma_vatistas); cd /= linalg::length2(flow.freestream) * area; return cd; -} \ No newline at end of file +} + +// Using Trefftz plane +f32 BackendCPU::compute_coefficient_cl( + const FlowData& flow, + const f32 area, + const u32 j, + const u32 n) +{ + Mesh& m = mesh; + ispc::MeshProxy mesh_proxy = { + m.ns, m.nc, m.nb_panels_wing(), + {m.v.x.data(), m.v.y.data(), m.v.z.data()}, + {m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data()}, + {m.normal.x.data(), m.normal.y.data(), m.normal.z.data()} + }; + f32 cl = ispc::kernel_trefftz_cl(mesh_proxy, gamma.data(), j, n); + cl /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area; + return cl; +} diff --git a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc index 41632a8..fb90d63 100644 --- a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc +++ b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc @@ -44,7 +44,8 @@ export struct MeshProxy { }; // Bio-savart Kernel -#define RCUT 1e-10f // highly tuned value +#define RCUT 1e-10f +#define RCUT2 1e-5f #define PI_f 3.141593f @@ -58,7 +59,7 @@ inline float3 kernel_biosavart(float3& colloc, const uniform float3& vertex1, co float r2_norm = length(r2); float square = length2(r1r2cross); - if ((r1_norm Date: Wed, 14 Feb 2024 16:26:06 -0500 Subject: [PATCH 4/6] add ISPC to CI --- .github/workflows/linux.yaml | 6 ++++++ .github/workflows/windows.yaml | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 128f35b..98453ad 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -25,6 +25,12 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Download ISPC + run: | + curl -L "https://github.com/ispc/ispc/releases/download/v1.22.0/ispc-v1.22.0-linux.tar.gz" -o ispc.tar.gz + tar -xzf ispc.tar.gz + echo "${PWD}/ispc-v1.22.0-linux/bin" >> $GITHUB_PATH + # Install system dependencies (opengl) - name: Install system dependencies run: | diff --git a/.github/workflows/windows.yaml b/.github/workflows/windows.yaml index 7bb6b82..d67c8d1 100644 --- a/.github/workflows/windows.yaml +++ b/.github/workflows/windows.yaml @@ -25,6 +25,12 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Download ISPC + run: | + Invoke-WebRequest -Uri "https://github.com/ispc/ispc/releases/download/v1.22.0/ispc-v1.22.0-windows.zip" -OutFile "ispc.zip" + 7z x ispc.zip + "ispc-v1.22.0-windows/bin" >> $env:GITHUB_PATH + # Force xmake to a specific folder (for cache) - name: Set xmake env run: echo "XMAKE_GLOBALDIR=${{ runner.workspace }}/xmake-global" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append From 27f83cc177f5cc4df1edf7200338f89e7a226aae Mon Sep 17 00:00:00 2001 From: Samuel Ayala Date: Wed, 14 Feb 2024 18:33:38 -0500 Subject: [PATCH 5/6] change to u64 for dimensions --- tests/test_non_linear.cpp | 6 +- vlm/backends/cpu/include/vlm_backend_cpu.hpp | 6 +- .../cpu/include/vlm_backend_cpu_kernels.hpp | 4 +- vlm/backends/cpu/src/vlm_backend_cpu.cpp | 61 ++++++------ .../cpu/src/vlm_backend_cpu_kernels.cpp | 16 +-- .../cpu/src/vlm_backend_cpu_kernels.ispc | 38 +++---- vlm/backends/cpu/xmake.lua | 2 +- .../cuda/include/vlm_backend_cuda.hpp | 6 +- vlm/backends/cuda/src/vlm_backend_cuda.cu | 12 +-- vlm/include/vlm.hpp | 6 +- vlm/include/vlm_backend.hpp | 6 +- vlm/include/vlm_mesh.hpp | 40 ++++---- vlm/include/vlm_types.hpp | 6 +- vlm/src/vlm.cpp | 4 +- vlm/src/vlm_mesh.cpp | 98 +++++++++---------- 15 files changed, 156 insertions(+), 155 deletions(-) diff --git a/tests/test_non_linear.cpp b/tests/test_non_linear.cpp index 240ccc6..3ebd027 100644 --- a/tests/test_non_linear.cpp +++ b/tests/test_non_linear.cpp @@ -35,10 +35,10 @@ struct ThinAirfoilPolarLiftCurve : public LiftCurveFunctor{ }; template -void linspace(T start, T end, u32 n, std::vector& out) { +void linspace(T start, T end, u64 n, std::vector& out) { out.resize(n); T step = (end - start) / (n - 1); - for (u32 i = 0; i < n; i++) { + for (u64 i = 0; i < n; i++) { out[i] = start + i * step; } } @@ -100,7 +100,7 @@ int main(int argc, char** argv) { ); db.profiles_pos.emplace_back(0.0f); - for (u32 i = 0; i < test_alphas.size(); i++) { + for (u64 i = 0; i < test_alphas.size(); i++) { const FlowData flow{test_alphas[i], 0.0f, 1.0f, 1.0f}; auto coeffs = solver.solve(flow, db); test_cl[i] = coeffs.cl; diff --git a/vlm/backends/cpu/include/vlm_backend_cpu.hpp b/vlm/backends/cpu/include/vlm_backend_cpu.hpp index 28d9ca2..595bf46 100644 --- a/vlm/backends/cpu/include/vlm_backend_cpu.hpp +++ b/vlm/backends/cpu/include/vlm_backend_cpu.hpp @@ -22,9 +22,9 @@ class BackendCPU : public Backend { void compute_rhs(const FlowData& flow, const std::vector& section_alphas) override; void lu_factor() override; void lu_solve() override; - f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u32 j, const u32 n) override; - linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u32 j, const u32 n) override; - f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u32 j, const u32 n) override; + f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u64 j, const u64 n) override; + linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u64 j, const u64 n) override; + f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u64 j, const u64 n) override; void compute_delta_gamma() override; }; diff --git a/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp b/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp index 7ff4093..e879f37 100644 --- a/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp +++ b/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp @@ -2,12 +2,12 @@ namespace vlm { void kernel_influence( - u32 m, u32 n, + u64 m, u64 n, f32 lhs[], f32 vx[], f32 vy[], f32 vz[], f32 collocx[], f32 collocy[], f32 collocz[], f32 normalx[], f32 normaly[], f32 normalz[], - u32 ia, u32 lidx, f32 sigma_p4 + u64 ia, u64 lidx, f32 sigma_p4 ); } diff --git a/vlm/backends/cpu/src/vlm_backend_cpu.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp index acc261d..4ee618a 100644 --- a/vlm/backends/cpu/src/vlm_backend_cpu.cpp +++ b/vlm/backends/cpu/src/vlm_backend_cpu.cpp @@ -12,6 +12,7 @@ #include // std::fill #include // std::printf +#include #include #include @@ -22,7 +23,7 @@ using namespace vlm; BackendCPU::~BackendCPU() = default; // Destructor definition BackendCPU::BackendCPU(Mesh& mesh) : Backend(mesh) { - lhs.resize((u64)mesh.nb_panels_wing() * (u64)mesh.nb_panels_wing()); + lhs.resize(mesh.nb_panels_wing() * mesh.nb_panels_wing()); rhs.resize(mesh.nb_panels_wing()); ipiv.resize(mesh.nb_panels_wing()); gamma.resize(mesh.nb_panels_wing()); @@ -40,8 +41,8 @@ void BackendCPU::compute_delta_gamma() { std::copy(gamma.begin(), gamma.begin()+mesh.ns, delta_gamma.begin()); // note: this is efficient as the memory is contiguous - for (u32 i = 1; i < mesh.nc; i++) { - for (u32 j = 0; j < mesh.ns; j++) { + for (u64 i = 1; i < mesh.nc; i++) { + for (u64 j = 0; j < mesh.ns; j++) { delta_gamma[i*mesh.ns + j] = gamma[i*mesh.ns + j] - gamma[(i-1)*mesh.ns + j]; } } @@ -58,23 +59,23 @@ void BackendCPU::compute_lhs(const FlowData& flow) { {m.normal.x.data(), m.normal.y.data(), m.normal.z.data()} }; - const u32 start_wing = 0; - const u32 end_wing = (m.nc - 1) * m.ns; + const u64 start_wing = 0; + const u64 end_wing = (m.nc - 1) * m.ns; tf::Taskflow taskflow; auto init = taskflow.placeholder(); - auto wing_pass = taskflow.for_each_index(start_wing, end_wing, [&] (u32 i) { + auto wing_pass = taskflow.for_each_index(start_wing, end_wing, [&] (u64 i) { ispc::kernel_influence(mesh_proxy, lhs.data(), i, i, flow.sigma_vatistas); }); - u32 idx = m.nc - 1; + u64 idx = m.nc - 1; auto cond = taskflow.emplace([&]{ return idx < m.nc + m.nw ? 0 : 1; // 0 means continue, 1 means break }); - auto wake_pass = taskflow.for_each_index(0u, m.ns, [&] (u32 j) { - const u32 ia = (m.nc - 1) * m.ns + j; - const u32 lidx = idx * m.ns + j; + auto wake_pass = taskflow.for_each_index(0ull, m.ns, [&] (u64 j) { + const u64 ia = (m.nc - 1) * m.ns + j; + const u64 lidx = idx * m.ns + j; ispc::kernel_influence(mesh_proxy, lhs.data(), ia, lidx, flow.sigma_vatistas); }); auto back = taskflow.emplace([&]{ @@ -92,8 +93,8 @@ void BackendCPU::compute_lhs(const FlowData& flow) { Executor::get().run(taskflow).wait(); } -void kernel_cpu_rhs(u32 n, const float normal_x[], const float normal_y[], const float normal_z[], float freestream_x, float freestream_y, float freestream_z, float rhs[]) { - for (u32 i = 0; i < n; i++) { +void kernel_cpu_rhs(u64 n, const float normal_x[], const float normal_y[], const float normal_z[], float freestream_x, float freestream_y, float freestream_z, float rhs[]) { + for (u64 i = 0; i < n; i++) { rhs[i] = - (freestream_x * normal_x[i] + freestream_y * normal_y[i] + freestream_z * normal_z[i]); } } @@ -109,9 +110,9 @@ void BackendCPU::compute_rhs(const FlowData& flow, const std::vector& secti tiny::ScopedTimer timer("Rebuild RHS"); assert(section_alphas.size() == mesh.ns); const Mesh& m = mesh; - for (u32 i = 0; i < mesh.nc; i++) { - for (u32 j = 0; j < mesh.ns; j++) { - const u32 li = i * mesh.ns + j; // linear index + for (u64 i = 0; i < mesh.nc; i++) { + for (u64 j = 0; j < mesh.ns; j++) { + const u64 li = i * mesh.ns + j; // linear index const linalg::alias::float3 freestream = compute_freestream(flow.u_inf, section_alphas[j], flow.beta); rhs[li] = - (freestream.x * m.normal.x[li] + freestream.y * m.normal.y[li] + freestream.z * m.normal.z[li]); } @@ -120,28 +121,28 @@ void BackendCPU::compute_rhs(const FlowData& flow, const std::vector& secti void BackendCPU::lu_factor() { tiny::ScopedTimer timer("Factor"); - const u32 n = mesh.nb_panels_wing(); + const int32_t n = static_cast(mesh.nb_panels_wing()); LAPACKE_sgetrf(LAPACK_COL_MAJOR, n, n, lhs.data(), n, ipiv.data()); } void BackendCPU::lu_solve() { tiny::ScopedTimer timer("Solve"); - const u32 n = mesh.nb_panels_wing(); + const int32_t n = static_cast(mesh.nb_panels_wing()); std::copy(rhs.begin(), rhs.end(), gamma.begin()); LAPACKE_sgetrs(LAPACK_COL_MAJOR, 'N', n, 1, lhs.data(), n, ipiv.data(), gamma.data(), n); } // f32 BackendCPU::compute_coefficient_cl(const FlowData& flow, const f32 area, -// const u32 j, const u32 n) { +// const u64 j, const u64 n) { // assert(n > 0); // assert(j >= 0 && j+n <= mesh.ns); // f32 cl = 0.0f; -// for (u32 u = 0; u < mesh.nc; u++) { -// for (u32 v = j; v < j + n; v++) { -// const u32 li = u * mesh.ns + v; // linear index +// for (u64 u = 0; u < mesh.nc; u++) { +// for (u64 v = j; v < j + n; v++) { +// const u64 li = u * mesh.ns + v; // linear index // const linalg::alias::float3 v0 = mesh.get_v0(li); // const linalg::alias::float3 v1 = mesh.get_v1(li); // // Leading edge vector pointing outward from wing root @@ -160,16 +161,16 @@ linalg::alias::float3 BackendCPU::compute_coefficient_cm( const FlowData& flow, const f32 area, const f32 chord, - const u32 j, - const u32 n) + const u64 j, + const u64 n) { assert(n > 0); assert(j >= 0 && j+n <= mesh.ns); linalg::alias::float3 cm(0.f, 0.f, 0.f); - for (u32 u = 0; u < mesh.nc; u++) { - for (u32 v = j; v < j + n; v++) { - const u32 li = u * mesh.ns + v; // linear index + for (u64 u = 0; u < mesh.nc; u++) { + for (u64 v = j; v < j + n; v++) { + const u64 li = u * mesh.ns + v; // linear index const linalg::alias::float3 v0 = mesh.get_v0(li); const linalg::alias::float3 v1 = mesh.get_v1(li); // Leading edge vector pointing outward from wing root @@ -188,8 +189,8 @@ linalg::alias::float3 BackendCPU::compute_coefficient_cm( f32 BackendCPU::compute_coefficient_cd( const FlowData& flow, const f32 area, - const u32 j, - const u32 n) + const u64 j, + const u64 n) { tiny::ScopedTimer timer("Compute CD"); assert(n > 0); @@ -212,8 +213,8 @@ f32 BackendCPU::compute_coefficient_cd( f32 BackendCPU::compute_coefficient_cl( const FlowData& flow, const f32 area, - const u32 j, - const u32 n) + const u64 j, + const u64 n) { Mesh& m = mesh; ispc::MeshProxy mesh_proxy = { diff --git a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp index 6d6e8d1..fa7268a 100644 --- a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp +++ b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp @@ -36,25 +36,25 @@ inline void kernel_symmetry(float3& inf, float3 colloc, const float3& vertex0, c } void vlm::kernel_influence( - u32 m, u32 n, + u64 m, u64 n, f32 lhs[], f32 vx[], f32 vy[], f32 vz[], f32 collocx[], f32 collocy[], f32 collocz[], f32 normalx[], f32 normaly[], f32 normalz[], - u32 ia, u32 lidx, f32 sigma + u64 ia, u64 lidx, f32 sigma ) { - const u32 nb_panels = m * n; - const u32 v0 = lidx + lidx / n; - const u32 v1 = v0 + 1; - const u32 v3 = v0 + n+1; - const u32 v2 = v3 + 1; + const u64 nb_panels = m * n; + const u64 v0 = lidx + lidx / n; + const u64 v1 = v0 + 1; + const u64 v3 = v0 + n+1; + const u64 v2 = v3 + 1; float3 vertex0{vx[v0], vy[v0], vz[v0]}; float3 vertex1{vx[v1], vy[v1], vz[v1]}; float3 vertex2{vx[v2], vy[v2], vz[v2]}; float3 vertex3{vx[v3], vy[v3], vz[v3]}; - for (u32 ia2 = 0; ia2 < nb_panels; ia2++) { + for (u64 ia2 = 0; ia2 < nb_panels; ia2++) { const float3 colloc(collocx[ia2], collocy[ia2], collocz[ia2]); float3 inf(0.0f, 0.0f, 0.0f); const float3 normal(normalx[ia2], normaly[ia2], normalz[ia2]); diff --git a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc index fb90d63..7142ae8 100644 --- a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc +++ b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc @@ -35,9 +35,9 @@ export struct SoA3D { }; export struct MeshProxy { - uniform unsigned int ns; - uniform unsigned int nc; - uniform unsigned int nb_panels; + uniform unsigned int64 ns; + uniform unsigned int64 nc; + uniform unsigned int64 nb_panels; uniform SoA3D v; // vertices uniform SoA3D colloc; // collocation points uniform SoA3D normal; // normals @@ -85,12 +85,12 @@ inline void kernel_symmetry(float3& inf, float3 colloc, const uniform float3& ve export void kernel_influence( uniform const MeshProxy& m, uniform float* uniform lhs, - uniform unsigned int ia, uniform unsigned int lidx, uniform float sigma + uniform unsigned int64 ia, uniform unsigned int64 lidx, uniform float sigma ) { - const uniform unsigned int v0 = lidx + lidx / m.ns; - const uniform unsigned int v1 = v0 + 1; - const uniform unsigned int v3 = v0 + m.ns+1; - const uniform unsigned int v2 = v3 + 1; + const uniform unsigned int64 v0 = lidx + lidx / m.ns; + const uniform unsigned int64 v1 = v0 + 1; + const uniform unsigned int64 v3 = v0 + m.ns+1; + const uniform unsigned int64 v2 = v3 + 1; // Broadcast vertices uniform float3 vertex0 = {m.v.x[v0], m.v.y[v0], m.v.z[v0]}; @@ -115,18 +115,18 @@ export uniform float kernel_trefftz_cd( uniform const MeshProxy& m, uniform float* uniform gamma, uniform float* uniform trefftz_buffer, - uniform unsigned int j, uniform unsigned int n, uniform float sigma + uniform unsigned int64 j, uniform unsigned int64 n, uniform float sigma ) { - uniform unsigned int begin = m.nb_panels + j; - uniform unsigned int end = begin + n; + uniform unsigned int64 begin = m.nb_panels + j; + uniform unsigned int64 end = begin + n; float cd = 0.0f; // Compute the induced velocity of the streamwise wake vortex segments - for (uniform unsigned int ia = m.nb_panels; ia < m.nb_panels + m.ns; ia++) { - const uniform unsigned int v0 = ia + ia / m.ns; - const uniform unsigned int v1 = v0 + 1; - const uniform unsigned int v3 = v0 + m.ns+1; - const uniform unsigned int v2 = v3 + 1; + for (uniform unsigned int64 ia = m.nb_panels; ia < m.nb_panels + m.ns; ia++) { + const uniform unsigned int64 v0 = ia + ia / m.ns; + const uniform unsigned int64 v1 = v0 + 1; + const uniform unsigned int64 v3 = v0 + m.ns+1; + const uniform unsigned int64 v2 = v3 + 1; // Broadcast vertices const uniform float3 vertex0 = {m.v.x[v0], m.v.y[v0], m.v.z[v0]}; @@ -147,7 +147,7 @@ export uniform float kernel_trefftz_cd( } // Perform the integration (Katz Plotkin, Low speed Aero | Eq 8.146) foreach(i = j ... j + n) { - unsigned int li = (m.nc-1) * m.ns + i; + unsigned int64 li = (m.nc-1) * m.ns + i; float3 v0 = {m.v.x[i], m.v.y[i], m.v.z[i]}; float3 v1 = {m.v.x[i+1], m.v.y[i+1], m.v.z[i+1]}; float dl = length(v1 - v0); @@ -159,11 +159,11 @@ export uniform float kernel_trefftz_cd( export uniform float kernel_trefftz_cl( uniform const MeshProxy& m, uniform float* uniform gamma, - uniform unsigned int j, uniform unsigned int n + uniform unsigned int64 j, uniform unsigned int64 n ) { float cl = 0.0f; foreach(i = j ... j + n) { - unsigned int li = (m.nc-1) * m.ns + i; + unsigned int64 li = (m.nc-1) * m.ns + i; float3 v0 = {m.v.x[i], m.v.y[i], m.v.z[i]}; float3 v1 = {m.v.x[i+1], m.v.y[i+1], m.v.z[i+1]}; float dl = length(v1 - v0); diff --git a/vlm/backends/cpu/xmake.lua b/vlm/backends/cpu/xmake.lua index 0c39656..8fbdc57 100644 --- a/vlm/backends/cpu/xmake.lua +++ b/vlm/backends/cpu/xmake.lua @@ -9,7 +9,7 @@ target("backend-cpu") add_packages("openblas_custom", { public = true }) add_rules("utils.ispc", {header_extension = "_ispc.h"}) - set_values("ispc.flags", {"--target=host", "-O1"}) + set_values("ispc.flags", {"--target=host", "--addressing=64"}) add_files("src/*.ispc") add_includedirs("../../include") diff --git a/vlm/backends/cuda/include/vlm_backend_cuda.hpp b/vlm/backends/cuda/include/vlm_backend_cuda.hpp index 892ab15..ae882ad 100644 --- a/vlm/backends/cuda/include/vlm_backend_cuda.hpp +++ b/vlm/backends/cuda/include/vlm_backend_cuda.hpp @@ -26,9 +26,9 @@ class BackendCUDA : public Backend { void compute_rhs(const FlowData& flow, const std::vector& section_alphas) override; void lu_factor() override; void lu_solve() override; - f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u32 j, const u32 n) override; - linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u32 j, const u32 n) override; - f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u32 j, const u32 n) override; + f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u64 j, const u64 n) override; + linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u64 j, const u64 n) override; + f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u64 j, const u64 n) override; void compute_delta_gamma() override; }; diff --git a/vlm/backends/cuda/src/vlm_backend_cuda.cu b/vlm/backends/cuda/src/vlm_backend_cuda.cu index 73faf6d..eb7e08f 100644 --- a/vlm/backends/cuda/src/vlm_backend_cuda.cu +++ b/vlm/backends/cuda/src/vlm_backend_cuda.cu @@ -213,16 +213,16 @@ void BackendCUDA::lu_solve() { f32 BackendCUDA::compute_coefficient_cl( const FlowData& flow, const f32 area, - const u32 j, - const u32 n) { + const u64 j, + const u64 n) { return default_backend.compute_coefficient_cl(flow, area, j, n); } f32 BackendCUDA::compute_coefficient_cd( const FlowData& flow, const f32 area, - const u32 j, - const u32 n) { + const u64 j, + const u64 n) { return default_backend.compute_coefficient_cd(flow, area, j, n); } @@ -230,8 +230,8 @@ linalg::alias::float3 BackendCUDA::compute_coefficient_cm( const FlowData& flow, const f32 area, const f32 chord, - const u32 j, - const u32 n) { + const u64 j, + const u64 n) { return default_backend.compute_coefficient_cm(flow, area, chord, j, n); } diff --git a/vlm/include/vlm.hpp b/vlm/include/vlm.hpp index 942b582..768c5e7 100644 --- a/vlm/include/vlm.hpp +++ b/vlm/include/vlm.hpp @@ -19,19 +19,19 @@ class Solver { class NonLinearVLM final: public Solver { public: const f64 tol; - const u32 max_iter; + const u64 max_iter; std::vector strip_alphas; NonLinearVLM( const tiny::Config& cfg, const f64 tol_= 1e-4f, - const u32 max_iter_ = 100 + const u64 max_iter_ = 100 ): Solver(cfg), tol(tol_), max_iter(max_iter_) {} - NonLinearVLM(const f64 tol_= 1e-4f, const u32 max_iter_ = 100) : tol(tol_), max_iter(max_iter_) {} + NonLinearVLM(const f64 tol_= 1e-4f, const u64 max_iter_ = 100) : tol(tol_), max_iter(max_iter_) {} ~NonLinearVLM() = default; AeroCoefficients solve(const FlowData& flow, const Database& db); }; diff --git a/vlm/include/vlm_backend.hpp b/vlm/include/vlm_backend.hpp index aea8091..b74a382 100644 --- a/vlm/include/vlm_backend.hpp +++ b/vlm/include/vlm_backend.hpp @@ -16,11 +16,11 @@ class Backend { virtual void compute_rhs(const FlowData& flow, const std::vector& section_alphas) = 0; virtual void lu_factor() = 0; virtual void lu_solve() = 0; - virtual f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u32 j, const u32 n) = 0; + virtual f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u64 j, const u64 n) = 0; f32 compute_coefficient_cl(const FlowData& flow); - virtual linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u32 j, const u32 n) = 0; + virtual linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u64 j, const u64 n) = 0; linalg::alias::float3 compute_coefficient_cm(const FlowData& flow); - virtual f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u32 j, const u32 n) = 0; + virtual f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u64 j, const u64 n) = 0; f32 compute_coefficient_cd(const FlowData& flow); virtual void compute_delta_gamma() = 0; virtual ~Backend() = default; diff --git a/vlm/include/vlm_mesh.hpp b/vlm/include/vlm_mesh.hpp index e75c450..91888c6 100644 --- a/vlm/include/vlm_mesh.hpp +++ b/vlm/include/vlm_mesh.hpp @@ -17,10 +17,10 @@ class Mesh { // (stored in span major order) SoA_3D_t v; // size (ncw+1)*(ns+1) // Offsets for indexing in connectivity array for each panel - std::vector offsets = {}; // size nc*ns + 1 + std::vector offsets = {}; // size nc*ns + 1 // Panel-vertex connectivity // vertices indices of a panel: connectivity[offsets[i]] to connectivity[offsets[i+1]] - std::vector connectivity = {}; // size 4*(nc*ns) + std::vector connectivity = {}; // size 4*(nc*ns) // Panels metrics (stored span major order) // --------------------- @@ -33,9 +33,9 @@ class Mesh { // Structured dimensions // --------------------- - u32 nc = 1; // number of panels chordwise - u32 ns = 1; // number of panels spanwise - const u32 nw = 1; // number of panels in chordwise wake + u64 nc = 1; // number of panels chordwise + u64 ns = 1; // number of panels spanwise + const u64 nw = 1; // number of panels in chordwise wake // Analytical quanities when provided // --------------------- @@ -48,23 +48,23 @@ class Mesh { void compute_connectivity(); // fill offsets, connectivity void compute_metrics_wing(); // fill colloc, normal, area void compute_metrics_wake(); - void compute_metrics_i(u32 i); - u32 nb_panels_wing() const; - u32 nb_panels_total() const; - u32 nb_vertices_wing() const; - u32 nb_vertices_total() const; - f32 panels_area(const u32 i, const u32 j, const u32 m, const u32 n) const; - f32 panels_area_xy(const u32 i, const u32 j, const u32 m, const u32 n) const; - f32 panel_width_y(const u32 i, const u32 j) const; - f32 strip_width(const u32 j) const; - f32 chord_length(const u32 j) const; - f32 chord_mean(const u32 j, const u32 n) const; + void compute_metrics_i(u64 i); + u64 nb_panels_wing() const; + u64 nb_panels_total() const; + u64 nb_vertices_wing() const; + u64 nb_vertices_total() const; + f32 panels_area(const u64 i, const u64 j, const u64 m, const u64 n) const; + f32 panels_area_xy(const u64 i, const u64 j, const u64 m, const u64 n) const; + f32 panel_width_y(const u64 i, const u64 j) const; + f32 strip_width(const u64 j) const; + f32 chord_length(const u64 j) const; + f32 chord_mean(const u64 j, const u64 n) const; // i panel vertices coordinates - linalg::alias::float3 get_v0(u32 i) const; // upper left - linalg::alias::float3 get_v1(u32 i) const; // upper right - linalg::alias::float3 get_v2(u32 i) const; // lower right - linalg::alias::float3 get_v3(u32 i) const; // lower left + linalg::alias::float3 get_v0(u64 i) const; // upper left + linalg::alias::float3 get_v1(u64 i) const; // upper right + linalg::alias::float3 get_v2(u64 i) const; // lower right + linalg::alias::float3 get_v3(u64 i) const; // lower left void io_read(const std::string& filename); Mesh(const tiny::Config& cfg); diff --git a/vlm/include/vlm_types.hpp b/vlm/include/vlm_types.hpp index 8de7396..2d0a931 100644 --- a/vlm/include/vlm_types.hpp +++ b/vlm/include/vlm_types.hpp @@ -47,16 +47,16 @@ struct SoA_3D_t { std::vector x = {}; std::vector y = {}; std::vector z = {}; - u32 size = 0; + u64 size = 0; - void resize(u32 size_) { + void resize(u64 size_) { size = size_; x.resize(size); y.resize(size); z.resize(size); } - void reserve(u32 size_) { + void reserve(u64 size_) { size = size_; x.reserve(size); y.reserve(size); diff --git a/vlm/src/vlm.cpp b/vlm/src/vlm.cpp index f82e599..aec90c7 100644 --- a/vlm/src/vlm.cpp +++ b/vlm/src/vlm.cpp @@ -46,7 +46,7 @@ AeroCoefficients NonLinearVLM::solve(const FlowData& flow, const Database& db) { backend->compute_lhs(flow); // Create influence matrix backend->lu_factor(); // Factorize the influence matrix into LU form - for (u32 iter = 0; iter < max_iter && err > tol; iter++) { + for (u64 iter = 0; iter < max_iter && err > tol; iter++) { err = 0.0; // reset l1 error backend->compute_rhs(flow, strip_alphas); // Compute RHS using strip alphas backend->lu_solve(); // Solve for the gammas @@ -56,7 +56,7 @@ AeroCoefficients NonLinearVLM::solve(const FlowData& flow, const Database& db) { // loop over the chordwise strips and apply Van Dam algorithm { const tiny::ScopedTimer timer("Strip correction"); - for (u32 j = 0; j < mesh->ns; j++) { + for (u64 j = 0; j < mesh->ns; j++) { const f32 strip_area = mesh->panels_area(0, j, mesh->nc, 1); const FlowData strip_flow = {strip_alphas[j], flow.beta, flow.u_inf, flow.rho}; const f32 strip_cl = backend->compute_coefficient_cl(strip_flow, strip_area, j, 1); diff --git a/vlm/src/vlm_mesh.cpp b/vlm/src/vlm_mesh.cpp index 87d7c85..09b3f92 100644 --- a/vlm/src/vlm_mesh.cpp +++ b/vlm/src/vlm_mesh.cpp @@ -49,7 +49,7 @@ void Mesh::init() { } void Mesh::alloc() { - const u32 ncw = nc + nw; + const u64 ncw = nc + nw; v.resize((ncw + 1) * (ns + 1)); offsets.resize(nc * ns + 1); connectivity.resize(4 * nc * ns); @@ -59,10 +59,10 @@ void Mesh::alloc() { area.resize(ncw * ns); } -u32 Mesh::nb_panels_wing() const { return nc * ns; }; -u32 Mesh::nb_panels_total() const { return (nc+nw) * ns; }; -u32 Mesh::nb_vertices_wing() const { return (nc + 1) * (ns + 1); }; -u32 Mesh::nb_vertices_total() const { return (nc + nw + 1) * (ns + 1); }; +u64 Mesh::nb_panels_wing() const { return nc * ns; }; +u64 Mesh::nb_panels_total() const { return (nc+nw) * ns; }; +u64 Mesh::nb_vertices_wing() const { return (nc + 1) * (ns + 1); }; +u64 Mesh::nb_vertices_total() const { return (nc + nw + 1) * (ns + 1); }; /// @brief Computes the mean chord of a set of panels /// @details @@ -72,13 +72,13 @@ u32 Mesh::nb_vertices_total() const { return (nc + nw + 1) * (ns + 1); }; /// @param j first panel index chordwise /// @param n number of panels spanwise /// @return mean chord of the set of panels -f32 Mesh::chord_mean(const u32 j, const u32 n) const { +f32 Mesh::chord_mean(const u64 j, const u64 n) const { assert(j + n <= ns+1); // spanwise range assert(n > 1); // minimum 2 chords f32 mac = 0.0f; // loop over panel chordwise sections in spanwise direction // Note: can be done optimally with vertical fused simd - for (u32 v = 0; v < n - 1; v++) { + for (u64 v = 0; v < n - 1; v++) { const f32 c0 = chord_length(j + v); const f32 c1 = chord_length(j + v + 1); mac += 0.5f * (c0 * c0 + c1 * c1) * panel_width_y(0, j + v); @@ -93,16 +93,16 @@ f32 Mesh::chord_mean(const u32 j, const u32 n) const { /// @param m number of panels chordwise /// @param n number of panels spanwise /// @return sum of the areas of the panels -f32 Mesh::panels_area(const u32 i, const u32 j, const u32 m, const u32 n) const { +f32 Mesh::panels_area(const u64 i, const u64 j, const u64 m, const u64 n) const { assert(i + m <= nc); assert(j + n <= ns); - const u32 ld = ns; + const u64 ld = ns; const f32* areas = &area[j + i * ld]; f32 total_area = 0.0f; - for (u32 u = 0; u < m; u++) { - for (u32 v = 0; v < n; v++) { + for (u64 u = 0; u < m; u++) { + for (u64 v = 0; v < n; v++) { total_area += areas[v + u * ld]; } } @@ -115,7 +115,7 @@ f32 Mesh::panels_area(const u32 i, const u32 j, const u32 m, const u32 n) const /// @param m number of panels chordwise /// @param n number of panels spanwise /// @return sum of the areas of the panels projected on the xy plane -f32 Mesh::panels_area_xy(const u32 i, const u32 j, const u32 m, const u32 n) const { +f32 Mesh::panels_area_xy(const u64 i, const u64 j, const u64 m, const u64 n) const { assert(i + m <= nc); assert(j + n <= ns); @@ -123,9 +123,9 @@ f32 Mesh::panels_area_xy(const u32 i, const u32 j, const u32 m, const u32 n) con // 0.5 * || d1 x d2 || where d1 and d2 are the diagonals of the quad f32 total_area = 0.0f; // Note: this is highly inefficient as vertex coordinates should be loaded with simd - for (u32 u = 0; u < m; u++) { - for (u32 v = 0; v < n; v++) { - const u32 idx = j + v + (i+u) * ns; + for (u64 u = 0; u < m; u++) { + for (u64 v = 0; v < n; v++) { + const u64 idx = j + v + (i+u) * ns; const linalg::alias::float3 d1 = get_v0(idx) - get_v2(idx); const linalg::alias::float3 d2 = get_v1(idx) - get_v3(idx); const linalg::alias::float3 cross = linalg::cross(d1, d2); @@ -141,15 +141,15 @@ f32 Mesh::panels_area_xy(const u32 i, const u32 j, const u32 m, const u32 n) con /// @brief Computes the width of a single panel in pure y direction /// @param i panel index chordwise /// @param j panel index spanwise -f32 Mesh::panel_width_y(const u32 i, const u32 j) const { +f32 Mesh::panel_width_y(const u64 i, const u64 j) const { // Since chordwise segments are always parallel, we can simply measure the width of the first panel assert(i < nc); assert(j < ns); - const u32 ld = ns + 1; + const u64 ld = ns + 1; return v.y[j + 1 + i * ld] - v.y[j + i * ld]; } -f32 Mesh::strip_width(const u32 j) const { +f32 Mesh::strip_width(const u64 j) const { assert(j < ns); const linalg::alias::float3 v0 = get_v0(j); const linalg::alias::float3 v1 = get_v1(j); @@ -160,7 +160,7 @@ f32 Mesh::strip_width(const u32 j) const { /// @details Since the mesh follows the camber line, the chord length is computed /// as the distance between the first and last vertex of a chordwise segment /// @param j chordwise segment index -f32 Mesh::chord_length(const u32 j) const { +f32 Mesh::chord_length(const u64 j) const { assert(j <= ns); // spanwise vertex range const f32 dx = v.x[j + nc * (ns+1)] - v.x[j]; const f32 dy = 0.0f; // chordwise segments are parallel to the x axis @@ -169,23 +169,23 @@ f32 Mesh::chord_length(const u32 j) const { return std::sqrt(dx * dx + dy * dy + dz * dz); } -linalg::alias::float3 Mesh::get_v0(u32 i) const { - const u32 idx = i + i / ns; +linalg::alias::float3 Mesh::get_v0(u64 i) const { + const u64 idx = i + i / ns; return {v.x[idx], v.y[idx], v.z[idx]}; } -linalg::alias::float3 Mesh::get_v1(u32 i) const { - const u32 idx = i + i / ns + 1; +linalg::alias::float3 Mesh::get_v1(u64 i) const { + const u64 idx = i + i / ns + 1; return {v.x[idx], v.y[idx], v.z[idx]}; } -linalg::alias::float3 Mesh::get_v2(u32 i) const { - const u32 idx = i + i / ns + ns + 2; +linalg::alias::float3 Mesh::get_v2(u64 i) const { + const u64 idx = i + i / ns + ns + 2; return {v.x[idx], v.y[idx], v.z[idx]}; } -linalg::alias::float3 Mesh::get_v3(u32 i) const { - const u32 idx = i + i / ns + ns + 1; +linalg::alias::float3 Mesh::get_v3(u64 i) const { + const u64 idx = i + i / ns + ns + 1; return {v.x[idx], v.y[idx], v.z[idx]}; } @@ -195,12 +195,12 @@ void Mesh::update_wake(const linalg::alias::float3& freestream) { const f32 off_y = freestream.y * 100.0f * chord_root; const f32 off_z = freestream.z * 100.0f * chord_root; - const u32 v_ns = ns + 1; - const u32 begin_trailing_edge = nb_vertices_wing()-v_ns; - const u32 end_trailing_edge = nb_vertices_wing(); + const u64 v_ns = ns + 1; + const u64 begin_trailing_edge = nb_vertices_wing()-v_ns; + const u64 end_trailing_edge = nb_vertices_wing(); // Add one layer of wake vertices // this can be parallelized (careful to false sharing tho) - for (u32 i = begin_trailing_edge; i < end_trailing_edge; ++i) { + for (u64 i = begin_trailing_edge; i < end_trailing_edge; ++i) { v.x[i + v_ns] = v.x[i] + off_x; v.y[i + v_ns] = v.y[i] + off_y; v.z[i + v_ns] = v.z[i] + off_z; @@ -212,7 +212,7 @@ void Mesh::update_wake(const linalg::alias::float3& freestream) { void Mesh::correction_high_aoa(f32 alpha_rad) { const f32 factor = 0.5f * alpha_rad / (std::sin(alpha_rad) + EPS); // correction factor // Note: this can be vectorized and parallelized - for (u32 i = 0; i < nb_panels_total(); i++) { + for (u64 i = 0; i < nb_panels_total(); i++) { // "chord vector" from center of leading line (v0-v1) to trailing line (v3-v2) const linalg::alias::float3 chord_vec = 0.5f * (get_v2(i) + get_v3(i) - get_v0(i) - get_v1(i)); const linalg::alias::float3 colloc_pt = 0.5f * (get_v0(i) + get_v1(i)) + factor * chord_vec; @@ -229,13 +229,13 @@ void Mesh::compute_connectivity() { // chord(x) 0--1 // | | | // \/ 3--2 - std::array quad = {}; + std::array quad = {}; // Note: with some modifications this loop can be parallelized - for (u32 i = 0; i < nb_panels_wing(); i++) { + for (u64 i = 0; i < nb_panels_wing(); i++) { // extra offset occuring from the fact that there is 1 more vertex // per row than surfaces - u32 chord_idx = i / nc; + u64 chord_idx = i / nc; quad[0] = i + chord_idx; quad[1] = quad[0] + 1; quad[2] = quad[1] + ns; @@ -245,7 +245,7 @@ void Mesh::compute_connectivity() { } } -void Mesh::compute_metrics_i(u32 i ) { +void Mesh::compute_metrics_i(u64 i ) { const linalg::alias::float3 v0 = get_v0(i); const linalg::alias::float3 v1 = get_v1(i); const linalg::alias::float3 v2 = get_v2(i); @@ -281,13 +281,13 @@ void Mesh::compute_metrics_i(u32 i ) { } void Mesh::compute_metrics_wing() { - for (u32 i = 0; i < nb_panels_wing(); i++) { + for (u64 i = 0; i < nb_panels_wing(); i++) { compute_metrics_i(i); } } void Mesh::compute_metrics_wake() { - for (u32 i = nb_panels_wing(); i < nb_panels_total(); i++) { + for (u64 i = nb_panels_wing(); i < nb_panels_total(); i++) { compute_metrics_i(i); } } @@ -295,10 +295,10 @@ void Mesh::compute_metrics_wake() { // plot3d is chordwise major void Mesh::io_read_plot3d_structured(std::ifstream& f) { std::cout << "reading plot3d mesh" << std::endl; - u32 ni = 0; // number of vertices chordwise - u32 nj = 0; // number of vertices spanwise - u32 nk = 0; - u32 blocks = 0; + u64 ni = 0; // number of vertices chordwise + u64 nj = 0; // number of vertices spanwise + u64 nk = 0; + u64 blocks = 0; f32 x, y, z; f >> blocks; if (blocks != 1) { @@ -317,20 +317,20 @@ void Mesh::io_read_plot3d_structured(std::ifstream& f) { std::cout << "ns: " << ns << std::endl; std::cout << "nc: " << nc << std::endl; - for (u32 j = 0; j < nj; j++) { - for (u32 i = 0; i < ni; i++) { + for (u64 j = 0; j < nj; j++) { + for (u64 i = 0; i < ni; i++) { f >> x; v.x[nj*i + j] = x; } } - for (u32 j = 0; j < nj; j++) { - for (u32 i = 0; i < ni; i++) { + for (u64 j = 0; j < nj; j++) { + for (u64 i = 0; i < ni; i++) { f >> y; v.y[nj*i + j] = y; } } - for (u32 j = 0; j < nj; j++) { - for (u32 i = 0; i < ni; i++) { + for (u64 j = 0; j < nj; j++) { + for (u64 i = 0; i < ni; i++) { f >> z; v.z[nj*i + j] = z; } @@ -343,7 +343,7 @@ void Mesh::io_read_plot3d_structured(std::ifstream& f) { // throw std::runtime_error("First vertex of plot3d mesh must be at origin"); // } f32 first_y = v.y[0]; - for (u32 i = 1; i < ni; i++) { + for (u64 i = 1; i < ni; i++) { if (v.y[i * nj] != first_y) { throw std::runtime_error("Mesh vertices should be ordered in chordwise direction"); } From 1167945db0123bd955ffadab2bbafd6f8ee887ec Mon Sep 17 00:00:00 2001 From: Samuel Ayala Date: Wed, 14 Feb 2024 18:55:36 -0500 Subject: [PATCH 6/6] reenable all nonlinear curves and fix ci --- tests/test_non_linear.cpp | 4 ++-- vlm/backends/cpu/src/vlm_backend_cpu.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_non_linear.cpp b/tests/test_non_linear.cpp index 3ebd027..a97d731 100644 --- a/tests/test_non_linear.cpp +++ b/tests/test_non_linear.cpp @@ -64,8 +64,8 @@ int main(int argc, char** argv) { const std::vector backends = {"cpu"}; std::vector>> lift_curves; lift_curves.emplace_back(std::make_pair("spallart1", std::make_unique(1.2f, 0.28f, 0.02f, 2.f*PI_f, 2.f*PI_f))); - // lift_curves.emplace_back(std::make_pair("spallart2", std::make_unique(0.72f, 0.28f, 0.04f, 2.f*PI_f, 1.5f*PI_f))); - // lift_curves.emplace_back(std::make_pair("polar", std::make_unique())); + lift_curves.emplace_back(std::make_pair("spallart2", std::make_unique(0.72f, 0.28f, 0.04f, 2.f*PI_f, 1.5f*PI_f))); + lift_curves.emplace_back(std::make_pair("polar", std::make_unique())); std::vector test_alphas = {0, 5, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; std::transform(test_alphas.begin(), test_alphas.end(), test_alphas.begin(), to_radians); diff --git a/vlm/backends/cpu/src/vlm_backend_cpu.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp index 4ee618a..986cec6 100644 --- a/vlm/backends/cpu/src/vlm_backend_cpu.cpp +++ b/vlm/backends/cpu/src/vlm_backend_cpu.cpp @@ -59,13 +59,13 @@ void BackendCPU::compute_lhs(const FlowData& flow) { {m.normal.x.data(), m.normal.y.data(), m.normal.z.data()} }; - const u64 start_wing = 0; + const u64 zero = 0; const u64 end_wing = (m.nc - 1) * m.ns; tf::Taskflow taskflow; auto init = taskflow.placeholder(); - auto wing_pass = taskflow.for_each_index(start_wing, end_wing, [&] (u64 i) { + auto wing_pass = taskflow.for_each_index(zero, end_wing, [&] (u64 i) { ispc::kernel_influence(mesh_proxy, lhs.data(), i, i, flow.sigma_vatistas); }); @@ -73,7 +73,7 @@ void BackendCPU::compute_lhs(const FlowData& flow) { auto cond = taskflow.emplace([&]{ return idx < m.nc + m.nw ? 0 : 1; // 0 means continue, 1 means break }); - auto wake_pass = taskflow.for_each_index(0ull, m.ns, [&] (u64 j) { + auto wake_pass = taskflow.for_each_index(zero, m.ns, [&] (u64 j) { const u64 ia = (m.nc - 1) * m.ns + j; const u64 lidx = idx * m.ns + j; ispc::kernel_influence(mesh_proxy, lhs.data(), ia, lidx, flow.sigma_vatistas);