diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 128f35b..98453ad 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -25,6 +25,12 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Download ISPC + run: | + curl -L "https://github.com/ispc/ispc/releases/download/v1.22.0/ispc-v1.22.0-linux.tar.gz" -o ispc.tar.gz + tar -xzf ispc.tar.gz + echo "${PWD}/ispc-v1.22.0-linux/bin" >> $GITHUB_PATH + # Install system dependencies (opengl) - name: Install system dependencies run: | diff --git a/.github/workflows/windows.yaml b/.github/workflows/windows.yaml index 7bb6b82..d67c8d1 100644 --- a/.github/workflows/windows.yaml +++ b/.github/workflows/windows.yaml @@ -25,6 +25,12 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Download ISPC + run: | + Invoke-WebRequest -Uri "https://github.com/ispc/ispc/releases/download/v1.22.0/ispc-v1.22.0-windows.zip" -OutFile "ispc.zip" + 7z x ispc.zip + "ispc-v1.22.0-windows/bin" >> $env:GITHUB_PATH + # Force xmake to a specific folder (for cache) - name: Set xmake env run: echo "XMAKE_GLOBALDIR=${{ runner.workspace }}/xmake-global" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append diff --git a/config/elliptic.vlm b/config/elliptic.vlm index 2b4232d..a899515 100644 --- a/config/elliptic.vlm +++ b/config/elliptic.vlm @@ -8,7 +8,7 @@ alphas = [5] s_ref = 3.92699 # half wing area c_ref = 0.85 sigma_vatistas = 0.0 -backend = avx2 +backend = cpu wake_included = false diff --git a/config/swept.vlm b/config/swept.vlm index abada90..c2b6521 100644 --- a/config/swept.vlm +++ b/config/swept.vlm @@ -7,7 +7,7 @@ ref_point = [0.25, 0.0, 0.0] alphas = [5] -backend = avx2 +backend = cpu wake_included = false \ No newline at end of file diff --git a/headeronly/tinytimer.hpp b/headeronly/tinytimer.hpp index 16c7a38..9b12caf 100644 --- a/headeronly/tinytimer.hpp +++ b/headeronly/tinytimer.hpp @@ -5,24 +5,24 @@ namespace tiny { -constexpr static long long us_in_s = 1'000'000LL; -constexpr static long long us_in_ms = 1'000LL; +constexpr static double us_in_s = 1'000'000; +constexpr static double us_in_ms = 1'000; -class Timer { +class ScopedTimer { public: - Timer(const char* name) : m_name(name), m_start(std::chrono::high_resolution_clock::now()) {} + ScopedTimer(const char* name) : m_name(name), m_start(std::chrono::high_resolution_clock::now()) {} - ~Timer() { + ~ScopedTimer() { const auto m_end = std::chrono::high_resolution_clock::now(); - const auto duration = static_cast(std::chrono::duration_cast(m_end - m_start).count()); + const auto duration = static_cast(std::chrono::duration_cast(m_end - m_start).count()); std::printf("%s: ", m_name); if (duration > us_in_s) { - std::printf("%lld s\n", duration / us_in_s); + std::printf("%.2f s\n", duration / us_in_s); } else if (duration > us_in_ms) { - std::printf("%lld ms\n", duration / us_in_ms); + std::printf("%.0f ms\n", duration / us_in_ms); } else { - std::printf("%lld us\n", duration); + std::printf("%.0f us\n", duration); } } private: diff --git a/mesh/infinite_rectangular_2x8.x b/mesh/infinite_rectangular_2x8.x new file mode 100644 index 0000000..c7e459c --- /dev/null +++ b/mesh/infinite_rectangular_2x8.x @@ -0,0 +1,23 @@ + 1 + 3 9 1 + 0.0000000000000000 0.50000000000000000 1.0000000000000000 0.0000000000000000 + 0.50000000000000000 1.0000000000000000 0.0000000000000000 0.50000000000000000 + 1.0000000000000000 0.0000000000000000 0.50000000000000000 1.0000000000000000 + 0.0000000000000000 0.50000000000000000 1.0000000000000000 0.0000000000000000 + 0.50000000000000000 1.0000000000000000 0.0000000000000000 0.50000000000000000 + 1.0000000000000000 0.0000000000000000 0.50000000000000000 1.0000000000000000 + 0.0000000000000000 0.50000000000000000 1.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 3750.0000000000000 + 3750.0000000000005 3750.0000000000000 7500.0000000000000 7500.0000000000009 + 7500.0000000000000 11250.000000000000 11250.000000000000 11250.000000000000 + 15000.000000000000 15000.000000000002 15000.000000000000 18750.000000000000 + 18750.000000000000 18750.000000000000 17500.000000000000 17500.000000000000 + 17500.000000000000 13750.000000000000 13750.000000000000 13750.000000000000 + 10000.000000000000 10000.000000000000 10000.000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.0000000000000000 + 0.0000000000000000 0.0000000000000000 0.0000000000000000 diff --git a/tests/test_linear.cpp b/tests/test_linear.cpp new file mode 100644 index 0000000..c1cad6f --- /dev/null +++ b/tests/test_linear.cpp @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include + +#include "tinycombination.hpp" + +#include "vlm.hpp" +#include "vlm_utils.hpp" + +using namespace vlm; + +// area of whole wing +float s_ref(float a, float b) { + return 0.5f * vlm::PI_f * a * b; +} + +// wing aspect ratio +float aspect_ratio(float a, float b) { + return (2*b)*(2*b) / s_ref(a, b); +} + +float compute_analytical_cl(float alpha, float a, float b) { + return 2.0f * vlm::PI_f * alpha / (1.0f + 2.0f/aspect_ratio(a, b)); +} + +float compute_analytical_cd(float cl, float a, float b) { + return cl*cl / (vlm::PI_f * aspect_ratio(a, b)); +} + +float compute_analytical_gamma(float y, float a, float b, float alpha) { + const float gamma0 = compute_analytical_cl(alpha, a, b) * 1.0f * s_ref(a, b) / (vlm::PI_f * b); + const float ratio = y / b; + return gamma0 * std::sqrt(1.0f - ratio*ratio); +} + +// bool elliptic_convergence() { +// tiny::Config cfg("../../../../config/elliptic.vlm"); + +// vlm::VLM vlm(cfg); + +// std::vector dimensions = { +// 16, 32, 45, 64, 90, 128 +// }; + +// const float a = 1.0f; // wing chord root +// const float b = 5.0f; // half wing span + +// std::vector norm_l1; +// std::vector norm_l2; +// std::vector norm_linf; + +// const float alpha = 0.1f; // degrees + +// for (const auto& dim : dimensions) { +// std::string filename = std::format("../../../../mesh/elliptic_{}x{}.x", dim, dim); +// vlm.mesh.io_read(filename); +// vlm.init(); +// vlm::Solver solver(vlm.mesh, vlm.data, cfg); +// solver.run(alpha); + +// double l1 = 0.0f; +// double l2 = 0.0f; +// double linf = 0.0f; +// int begin = (vlm.mesh.nc - 1) * vlm.mesh.ns; +// int end = vlm.mesh.nc * vlm.mesh.ns; +// // loop over last row of panels +// for (int i = begin; i < end; i++) { +// const float y = vlm.mesh.colloc.y[i]; +// const float gamma = vlm.data.gamma[i]; +// const float gamma_analytical = analytical_gamma(y, a, b, alpha); +// const double error = std::abs((gamma - gamma_analytical) / (gamma_analytical + 1e-7f)); +// std::printf("y: %f, gamma: %f, gamma_analytical: %f, error: %f \n", y, gamma, gamma_analytical, error); + +// l1 += error; +// l2 += error * error; +// linf = std::max(linf, error); +// } +// l1 /= (end - begin); +// l2 = std::sqrt(l2) / (end - begin); +// std::printf("L1: %f, L2: %f, Linf: %f\n", l1, l2, linf); + +// norm_l1.push_back(l1); +// norm_l2.push_back(l2); +// norm_linf.push_back(linf); +// } + +// double order_l1 = 0.0f; +// double order_l2 = 0.0f; +// double order_linf = 0.0f; + +// auto order = [=](double norm0, double norm1, float dim0, float dim1) { +// return std::log(norm0 / norm1) / std::log((b/dim0)/(b/dim1)); +// }; + +// for (int i = 0; i < dimensions.size() - 1; i++) { +// order_l1 += order(norm_l1[i], norm_l1[i+1], dimensions[i], dimensions[i+1]); +// order_l2 += order(norm_l2[i], norm_l2[i+1], dimensions[i], dimensions[i+1]); +// order_linf += order(norm_linf[i], norm_linf[i+1], dimensions[i], dimensions[i+1]); +// } +// order_l1 /= (dimensions.size() - 1); +// order_l2 /= (dimensions.size() - 1); +// order_linf /= (dimensions.size() - 1); +// std::printf("Order L1: %f, Order L2: %f, Order Linf: %f\n", order_l1, order_l2, order_linf); +// return 0; +// } + +int main(int argc, char **argv) { + const float a = 1.0f; // wing chord root + const float b = 5.0f; // half wing span + + const std::vector meshes = {"../../../../mesh/elliptic_64x64.x"}; + const std::vector backends = {"cpu"}; + std::vector test_alphas = {0, 1, 2, 3, 4, 5, 10, 15}; + std::transform(test_alphas.begin(), test_alphas.end(), test_alphas.begin(), to_radians); + + auto solvers = tiny::make_combination(meshes, backends); + for (const auto& [mesh_name, backend_name] : solvers) { + LinearVLM solver{}; + solver.mesh = create_mesh(mesh_name); + solver.backend = create_backend(backend_name, *solver.mesh); + + for (u64 i = 0; i < test_alphas.size(); i++) { + const FlowData flow{test_alphas[i], 0.0f, 1.0f, 1.0f}; + const auto coeffs = solver.solve(flow); + const f32 analytical_cl = compute_analytical_cl(flow.alpha, a, b); + const f32 analytical_cd = compute_analytical_cd(analytical_cl, a, b); + const f32 cl_aerr = std::abs(coeffs.cl - analytical_cl); + const f32 cl_rerr = analytical_cl < EPS_f ? 0.f : cl_aerr / analytical_cl; + const f32 cd_aerr = std::abs(coeffs.cd - analytical_cd); + const f32 cd_rerr = analytical_cd < EPS_f ? 0.f : cd_aerr / analytical_cd; + std::printf(">>> Alpha: %.1f | CL = %.6f CD = %.6f CMx = %.6f CMy = %.6f CMz = %.6f\n", to_degrees(flow.alpha), coeffs.cl, coeffs.cd, coeffs.cm.x, coeffs.cm.y, coeffs.cm.z); + std::printf(">>> Analytical CL: %.6f | Abs Error: %.3E | Relative Error: %.5f%% \n", analytical_cl, cl_aerr, cl_rerr*100.f); + std::printf(">>> Analytical CD: %.6f | Abs Error: %.3E | Relative Error: %.5f%% \n", analytical_cd, cd_aerr, cd_rerr*100.f); + if (cl_rerr > 0.03f || cd_rerr > 0.01f) return 1; + } + } + + return 0; +} \ No newline at end of file diff --git a/tests/test_non_linear.cpp b/tests/test_non_linear.cpp index 95a9738..a97d731 100644 --- a/tests/test_non_linear.cpp +++ b/tests/test_non_linear.cpp @@ -7,9 +7,11 @@ #include #include "tinyinterpolate.hpp" -#include "vlm.hpp" #include "tinycombination.hpp" +#include "vlm.hpp" +#include "vlm_utils.hpp" + using namespace vlm; struct LiftCurveFunctor { @@ -33,22 +35,14 @@ struct ThinAirfoilPolarLiftCurve : public LiftCurveFunctor{ }; template -void linspace(T start, T end, u32 n, std::vector& out) { +void linspace(T start, T end, u64 n, std::vector& out) { out.resize(n); T step = (end - start) / (n - 1); - for (u32 i = 0; i < n; i++) { + for (u64 i = 0; i < n; i++) { out[i] = start + i * step; } } -f32 to_radians(f32 degrees) { - return degrees * PI_f / 180.0f; -} - -f32 to_degrees(f32 radians) { - return radians * 180.0f / PI_f; -} - template void write_vector_pair(const std::string& filename, const std::vector& vec1, const std::vector& vec2) { assert(vec1.size() == vec2.size()); @@ -67,7 +61,7 @@ void write_vector_pair(const std::string& filename, const std::vector& vec1, int main(int argc, char** argv) { const std::vector meshes = {"../../../../mesh/infinite_rectangular_5x200.x"}; - const std::vector backends = {"avx2"}; + const std::vector backends = {"cpu"}; std::vector>> lift_curves; lift_curves.emplace_back(std::make_pair("spallart1", std::make_unique(1.2f, 0.28f, 0.02f, 2.f*PI_f, 2.f*PI_f))); lift_curves.emplace_back(std::make_pair("spallart2", std::make_unique(0.72f, 0.28f, 0.04f, 2.f*PI_f, 1.5f*PI_f))); @@ -106,16 +100,16 @@ int main(int argc, char** argv) { ); db.profiles_pos.emplace_back(0.0f); - for (u32 i = 0; i < test_alphas.size(); i++) { + for (u64 i = 0; i < test_alphas.size(); i++) { const FlowData flow{test_alphas[i], 0.0f, 1.0f, 1.0f}; auto coeffs = solver.solve(flow, db); test_cl[i] = coeffs.cl; std::printf(">>> Alpha: %.1f | CL = %.6f CD = %.6f CMx = %.6f CMy = %.6f CMz = %.6f\n", to_degrees(test_alphas[i]), coeffs.cl, coeffs.cd, coeffs.cm.x, coeffs.cm.y, coeffs.cm.z); const f32 analytical_cl = (*lift_curve.second)(flow.alpha); const f32 abs_error = std::abs(coeffs.cl - analytical_cl); - const f32 rel_error = 100.0f * abs_error / (analytical_cl + std::numeric_limits::epsilon()); - std::printf(">>> Analytical: %.6f | Abs Error: %.3E | Relative Error: %.5f%% \n", analytical_cl, abs_error, rel_error); - if (rel_error > 1.f) return 1; // Failure + const f32 rel_error = abs_error / (analytical_cl + std::numeric_limits::epsilon()); + std::printf(">>> Analytical: %.6f | Abs Error: %.3E | Relative Error: %.5f%% \n", analytical_cl, abs_error, rel_error*100.f); + if (rel_error > 0.01f) return 1; // Failure } write_vector_pair(lift_curve.first + "_nonlinear_cl", test_alphas, test_cl); } diff --git a/vlm/backends/avx2/src/vlm_backend_avx2.cpp b/vlm/backends/avx2/src/vlm_backend_avx2.cpp deleted file mode 100644 index 22c95f5..0000000 --- a/vlm/backends/avx2/src/vlm_backend_avx2.cpp +++ /dev/null @@ -1,448 +0,0 @@ -#include "vlm_backend_avx2.hpp" - -#include "linalg.h" -#include "simpletimer.hpp" -#include "vlm_mesh.hpp" -#include "vlm_data.hpp" -#include "vlm_utils.hpp" -#include "vlm_executor.hpp" // includes taskflow/taskflow.hpp - -#include -#include -#include -#include - -#include - -#include -#include - -using namespace vlm; - -BackendAVX2::~BackendAVX2() = default; // Destructor definition - -BackendAVX2::BackendAVX2(Mesh& mesh) : Backend(mesh) { - lhs.resize((u64)mesh.nb_panels_wing() * (u64)mesh.nb_panels_wing()); - rhs.resize(mesh.nb_panels_wing()); - ipiv.resize(mesh.nb_panels_wing()); - gamma.resize(mesh.nb_panels_wing()); - delta_gamma.resize(mesh.nb_panels_wing()); -} - -void BackendAVX2::reset() { - std::fill(gamma.begin(), gamma.end(), 0.0f); - std::fill(lhs.begin(), lhs.end(), 0.0f); - std::fill(rhs.begin(), rhs.end(), 0.0f); -} - -void BackendAVX2::compute_delta_gamma() { - std::copy(gamma.begin(), gamma.begin()+mesh.ns, delta_gamma.begin()); - - // note: this is efficient as the memory is contiguous - for (u32 i = 1; i < mesh.nc; i++) { - for (u32 j = 0; j < mesh.ns; j++) { - delta_gamma[i*mesh.ns + j] = gamma[i*mesh.ns + j] - gamma[(i-1)*mesh.ns + j]; - } - } -} - -inline void micro_kernel_influence_scalar(f32& vx, f32& vy, f32& vz, f32& x, f32& y, f32& z, f32& x1, f32& y1, f32& z1, f32& x2, f32& y2, f32& z2) { - static const f32 rcut = 1.0e-12f; - vx = 0.0f; - vy = 0.0f; - vz = 0.0f; - - // Katz Plotkin, Low speed Aero | Eq 10.115 - - const f32 r1r2x = (y-y1)*(z-z2) - (z-z1)*(y-y2); - const f32 r1r2y = -((x-x1)*(z-z2) - (z-z1)*(x-x2)); - const f32 r1r2z = (x-x1)*(y-y2) - (y-y1)*(x-x2); - - const f32 r1 = std::sqrt(pow<2>(x-x1)+pow<2>(y-y1)+pow<2>(z-z1)); - const f32 r2 = std::sqrt(pow<2>(x-x2)+pow<2>(y-y2)+pow<2>(z-z2)); - const f32 square = pow<2>(r1r2x) + pow<2>(r1r2y) + pow<2>(r1r2z); - if ((r1 -inline void macro_kernel_remainder_scalar(Mesh& m, std::vector& lhs, u32 ia, u32 lidx) { - // quick return - const u32 remainder = m.nb_panels_wing() % 8; - if (remainder == 0) return; - - const u32 v0 = lidx + lidx / m.ns; - const u32 v1 = v0 + 1; - const u32 v3 = v0 + m.ns+1; - const u32 v2 = v3 + 1; - - f32 v0x = m.v.x[v0]; - f32 v0y = m.v.y[v0]; - f32 v0z = m.v.z[v0]; - f32 v1x = m.v.x[v1]; - f32 v1y = m.v.y[v1]; - f32 v1z = m.v.z[v1]; - f32 v2x = m.v.x[v2]; - f32 v2y = m.v.y[v2]; - f32 v2z = m.v.z[v2]; - f32 v3x = m.v.x[v3]; - f32 v3y = m.v.y[v3]; - f32 v3z = m.v.z[v3]; - - for (u32 ia2 = m.nb_panels_wing() - remainder; ia2 < m.nb_panels_wing(); ia2++) { - const f32 colloc_x = m.colloc.x[ia2]; - const f32 colloc_y = m.colloc.y[ia2]; - const f32 colloc_z = m.colloc.z[ia2]; - - // 3 regs to store induced velocity - f32 inf_x = 0.0f; - f32 inf_y = 0.0f; - f32 inf_z = 0.0f; - kernel_influence_scalar(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v0x, v0y, v0z, v1x, v1y, v1z); - kernel_influence_scalar(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v1x, v1y, v1z, v2x, v2y, v2z); - kernel_influence_scalar(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v2x, v2y, v2z, v3x, v3y, v3z); - kernel_influence_scalar(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v3x, v3y, v3z, v0x, v0y, v0z); - f32 nx = m.normal.x[ia2]; - f32 ny = m.normal.y[ia2]; - f32 nz = m.normal.z[ia2]; - f32 ring_inf = inf_x * nx + inf_y * ny + inf_z * nz; - // store in col major order - if (Overwrite) { - lhs[ia * m.nb_panels_wing() + ia2] = ring_inf; - } else { - lhs[ia * m.nb_panels_wing() + ia2] += ring_inf; - } - } -} - -inline void micro_kernel_influence_avx2(__m256& vx, __m256& vy, __m256& vz, __m256& x, __m256& y, __m256& z, __m256& x1, __m256& y1, __m256& z1, __m256& x2, __m256& y2, __m256& z2, f32 sigma_p4) { - static const __m256 threshold = _mm256_set1_ps(1.0e-10f); - static const __m256 four_pi = _mm256_set1_ps(4.0f * PI_f); - static const __m256 zero = _mm256_set1_ps(0.0f); - - vx = zero; - vy = zero; - vz = zero; - // define vectors - __m256 r1x = _mm256_sub_ps(x, x1); - __m256 r1y = _mm256_sub_ps(y, y1); - __m256 r1z = _mm256_sub_ps(z, z1); - __m256 r2x = _mm256_sub_ps(x, x2); - __m256 r2y = _mm256_sub_ps(y, y2); - __m256 r2z = _mm256_sub_ps(z, z2); - - // crossproduct - // (v0y*v1z - v0z*v1y); - // (v0z*v1x - v0x*v1z); - // (v0x*v1y - v0y*v1x); - __m256 r1_x_r2_x = _mm256_fmsub_ps(r1y, r2z, _mm256_mul_ps(r1z, r2y)); - __m256 r1_x_r2_y = _mm256_fmsub_ps(r1z, r2x, _mm256_mul_ps(r1x, r2z)); - __m256 r1_x_r2_z = _mm256_fmsub_ps(r1x, r2y, _mm256_mul_ps(r1y, r2x)); - - // magnitude & mag squared of crossproduct - __m256 r1_x_r2_mag_p2 = _mm256_fmadd_ps(r1_x_r2_x, r1_x_r2_x, _mm256_fmadd_ps(r1_x_r2_y, r1_x_r2_y, _mm256_mul_ps(r1_x_r2_z, r1_x_r2_z))); - __m256 r1_mag = _mm256_sqrt_ps(_mm256_fmadd_ps(r1x, r1x, _mm256_fmadd_ps(r1y, r1y, _mm256_mul_ps(r1z, r1z)))); - __m256 r2_mag = _mm256_sqrt_ps(_mm256_fmadd_ps(r2x, r2x, _mm256_fmadd_ps(r2y, r2y, _mm256_mul_ps(r2z, r2z)))); - - // vector from point 1 to point 2 of segment - __m256 r0x = _mm256_sub_ps(x2, x1); - __m256 r0y = _mm256_sub_ps(y2, y1); - __m256 r0z = _mm256_sub_ps(z2, z1); - - // dot product r0.r1 and r0.r2 - __m256 r0_dot_r1 = _mm256_fmadd_ps(r0x, r1x, _mm256_fmadd_ps(r0y, r1y, _mm256_mul_ps(r0z, r1z))); - __m256 r0_dot_r2 = _mm256_fmadd_ps(r0x, r2x, _mm256_fmadd_ps(r0y, r2y, _mm256_mul_ps(r0z, r2z))); - - __m256 numerator = _mm256_fmsub_ps(r0_dot_r1, r2_mag, _mm256_mul_ps(r0_dot_r2, r1_mag)); - - __m256 four_pi_r1_mag_r2_mag = _mm256_mul_ps(four_pi, _mm256_mul_ps(r1_mag, r2_mag)); // 4*pi*|r1|*|r2| - __m256 denominator; - - if (sigma_p4 == 0.0f) { - // Singular Bio-Savart - denominator = _mm256_mul_ps(four_pi_r1_mag_r2_mag, r1_x_r2_mag_p2); - } else { - // Vatistas smoothing kernel (n=2) (https://doi.org/10.3390/fluids7020081) - __m256 r1_x_r2_mag_p4 = _mm256_mul_ps(r1_x_r2_mag_p2, r1_x_r2_mag_p2); // ^2n - __m256 r0_mag_p2 = _mm256_fmadd_ps(r0x, r0x, _mm256_fmadd_ps(r0y, r0y, _mm256_mul_ps(r0z, r0z))); - __m256 r0_mag_p4 = _mm256_mul_ps(r0_mag_p2, r0_mag_p2); // ^2n - denominator = _mm256_mul_ps(four_pi_r1_mag_r2_mag, _mm256_sqrt_ps(_mm256_fmadd_ps(_mm256_set1_ps(sigma_p4), r0_mag_p4, r1_x_r2_mag_p4))); - } - - __m256 coeff = _mm256_div_ps(numerator, denominator); - - // add the influence and blend with mask - // the masks should be done independently for optimal ILP but if compiler smart he can do it - __m256 mask = _mm256_cmp_ps(r1_mag, threshold, _CMP_LT_OS); - mask = _mm256_or_ps(mask, _mm256_cmp_ps(r2_mag, threshold, _CMP_LT_OS)); - mask = _mm256_or_ps(mask, _mm256_cmp_ps(r1_x_r2_mag_p2, threshold, _CMP_LT_OS)); - - vx = _mm256_blendv_ps(_mm256_mul_ps(r1_x_r2_x, coeff), zero, mask); - vy = _mm256_blendv_ps(_mm256_mul_ps(r1_x_r2_y, coeff), zero, mask); - vz = _mm256_blendv_ps(_mm256_mul_ps(r1_x_r2_z, coeff), zero, mask); -} - -inline void kernel_influence_avx2(__m256& inf_x, __m256& inf_y, __m256& inf_z, __m256 x, __m256 y, __m256 z, __m256 x1, __m256 y1, __m256 z1, __m256 x2, __m256 y2, __m256 z2, f32 sigma_p4) { - __m256 vx, vy, vz; - micro_kernel_influence_avx2(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2, sigma_p4); - inf_x = _mm256_add_ps(inf_x, vx); - inf_y = _mm256_add_ps(inf_y, vy); - inf_z = _mm256_add_ps(inf_z, vz); - y = _mm256_xor_ps(y, _mm256_set1_ps(-0.0f)); // wing symmetry - micro_kernel_influence_avx2(vx, vy, vz, x, y, z, x1, y1, z1, x2, y2, z2, sigma_p4); - inf_x = _mm256_add_ps(inf_x, vx); - inf_y = _mm256_sub_ps(inf_y, vy); // HERE IS SUB INSTEAD OF ADD ! - inf_z = _mm256_add_ps(inf_z, vz); -} - -// Fill a column of the LHS matrix (influence of a single panel on all the others) -template -inline void macro_kernel_avx2(Mesh& m, std::vector& lhs, u32 ia, u32 lidx, f32 sigma_p4) { - const u32 v0 = lidx + lidx / m.ns; - const u32 v1 = v0 + 1; - const u32 v3 = v0 + m.ns+1; - const u32 v2 = v3 + 1; - // in reality only load v1 & v2 and reuse data from previous v1 & v2 to be new v0 & v3 respectively - // 12 regs (6 loads + 6 reuse) -> these will get spilled once we get in the influence function - __m256 v0x = _mm256_broadcast_ss(&m.v.x[v0]); - __m256 v0y = _mm256_broadcast_ss(&m.v.y[v0]); - __m256 v0z = _mm256_broadcast_ss(&m.v.z[v0]); - __m256 v1x = _mm256_broadcast_ss(&m.v.x[v1]); - __m256 v1y = _mm256_broadcast_ss(&m.v.y[v1]); - __m256 v1z = _mm256_broadcast_ss(&m.v.z[v1]); - __m256 v2x = _mm256_broadcast_ss(&m.v.x[v2]); - __m256 v2y = _mm256_broadcast_ss(&m.v.y[v2]); - __m256 v2z = _mm256_broadcast_ss(&m.v.z[v2]); - __m256 v3x = _mm256_broadcast_ss(&m.v.x[v3]); - __m256 v3y = _mm256_broadcast_ss(&m.v.y[v3]); - __m256 v3z = _mm256_broadcast_ss(&m.v.z[v3]); - - for (u32 ia2 = 0; ia2 <= m.nb_panels_wing()-8; ia2+=8) { - // loads (3 regs) - __m256 colloc_x = _mm256_loadu_ps(&m.colloc.x[ia2]); - __m256 colloc_y = _mm256_loadu_ps(&m.colloc.y[ia2]); - __m256 colloc_z = _mm256_loadu_ps(&m.colloc.z[ia2]); - // 3 regs to store induced velocity - __m256 inf_x = _mm256_setzero_ps(); - __m256 inf_y = _mm256_setzero_ps(); - __m256 inf_z = _mm256_setzero_ps(); - - kernel_influence_avx2(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v0x, v0y, v0z, v1x, v1y, v1z, sigma_p4); - kernel_influence_avx2(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v1x, v1y, v1z, v2x, v2y, v2z, sigma_p4); - kernel_influence_avx2(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v2x, v2y, v2z, v3x, v3y, v3z, sigma_p4); - kernel_influence_avx2(inf_x, inf_y, inf_z, colloc_x, colloc_y, colloc_z, v3x, v3y, v3z, v0x, v0y, v0z, sigma_p4); - - // dot product - __m256 nx = _mm256_loadu_ps(&m.normal.x[ia2]); - __m256 ny = _mm256_loadu_ps(&m.normal.y[ia2]); - __m256 nz = _mm256_loadu_ps(&m.normal.z[ia2]); - __m256 ring_inf = _mm256_fmadd_ps(inf_x, nx, _mm256_fmadd_ps(inf_y, ny, _mm256_mul_ps(inf_z, nz))); - - // store in col major order - if (Overwrite) { - _mm256_storeu_ps(&lhs[ia * m.nb_panels_wing() + ia2], ring_inf); - } else { - __m256 lhs_ia = _mm256_loadu_ps(&lhs[ia * m.nb_panels_wing() + ia2]); - lhs_ia = _mm256_add_ps(lhs_ia, ring_inf); - _mm256_storeu_ps(&lhs[ia * m.nb_panels_wing() + ia2], lhs_ia); - } - } -} - -void BackendAVX2::compute_lhs(const FlowData& flow) { - SimpleTimer timer("LHS"); - Mesh& m = mesh; - const f32 sigma_p4 = pow<4>(flow.sigma_vatistas); // Vatistas coeffcient (^2n with n=2) - - const u32 start_wing = 0; - const u32 end_wing = (m.nc - 1) * m.ns; - - tf::Taskflow taskflow; - - auto init = taskflow.placeholder(); - auto wing_pass = taskflow.for_each_index(start_wing, end_wing, [&] (u32 i) { - macro_kernel_avx2(m, lhs, i, i, sigma_p4); - macro_kernel_remainder_scalar(m, lhs, i, i); - }); - - u32 idx = m.nc - 1; - auto cond = taskflow.emplace([&]{ - return idx < m.nc + m.nw ? 0 : 1; // 0 means continue, 1 means break - }); - auto wake_pass = taskflow.for_each_index(0u, m.ns, [&] (u32 j) { - const u32 ia = (m.nc - 1) * m.ns + j; - const u32 lidx = idx * m.ns + j; - macro_kernel_avx2(m, lhs, ia, lidx, sigma_p4); - macro_kernel_remainder_scalar(m, lhs, idx, idx); - }); - auto back = taskflow.emplace([&]{ - idx++; - return 0; // 0 means continue - }); - auto sync = taskflow.placeholder(); - - init.precede(wing_pass, cond); - wing_pass.precede(sync); - cond.precede(wake_pass, sync); - wake_pass.precede(back); - back.precede(cond); - - Executor::get().run(taskflow).wait(); -} - -void kernel_generic_rhs(u32 n, const float normal_x[], const float normal_y[], const float normal_z[], float freestream_x, float freestream_y, float freestream_z, float rhs[]) { - for (u32 i = 0; i < n; i++) { - rhs[i] = - (freestream_x * normal_x[i] + freestream_y * normal_y[i] + freestream_z * normal_z[i]); - } -} - -void BackendAVX2::compute_rhs(const FlowData& flow) { - SimpleTimer timer("RHS"); - const Mesh& m = mesh; - - kernel_generic_rhs(m.nb_panels_wing(), m.normal.x.data(), m.normal.y.data(), m.normal.z.data(), flow.freestream.x, flow.freestream.y, flow.freestream.z, rhs.data()); -} - -void BackendAVX2::compute_rhs(const FlowData& flow, const std::vector& section_alphas) { - SimpleTimer timer("Rebuild RHS"); - assert(section_alphas.size() == mesh.ns); - const Mesh& m = mesh; - for (u32 i = 0; i < mesh.nc; i++) { - for (u32 j = 0; j < mesh.ns; j++) { - const u32 li = i * mesh.ns + j; // linear index - const linalg::alias::float3 freestream = compute_freestream(flow.u_inf, section_alphas[j], flow.beta); - rhs[li] = - (freestream.x * m.normal.x[li] + freestream.y * m.normal.y[li] + freestream.z * m.normal.z[li]); - } - } -} - -void BackendAVX2::lu_factor() { - SimpleTimer timer("Factor"); - const u32 n = mesh.nb_panels_wing(); - LAPACKE_sgetrf(LAPACK_COL_MAJOR, n, n, lhs.data(), n, ipiv.data()); -} - -void BackendAVX2::lu_solve() { - SimpleTimer timer("Solve"); - const u32 n = mesh.nb_panels_wing(); - std::copy(rhs.begin(), rhs.end(), gamma.begin()); - - LAPACKE_sgetrs(LAPACK_COL_MAJOR, 'N', n, 1, lhs.data(), n, ipiv.data(), gamma.data(), n); -} - -f32 BackendAVX2::compute_coefficient_cl(const FlowData& flow, const f32 area, - const u32 j, const u32 n) { - assert(n > 0); - assert(j >= 0 and j+n <= mesh.ns); - - f32 cl = 0.0f; - - for (u32 u = 0; u < mesh.nc; u++) { - for (u32 v = j; v < j + n; v++) { - const u32 li = u * mesh.ns + v; // linear index - const linalg::alias::float3 v0 = mesh.get_v0(li); - const linalg::alias::float3 v1 = mesh.get_v1(li); - // Leading edge vector pointing outward from wing root - const linalg::alias::float3 dl = v1 - v0; - // Distance from the center of leading edge to the reference point - const linalg::alias::float3 force = linalg::cross(flow.freestream, dl) * flow.rho * delta_gamma[li]; - cl += linalg::dot(force, flow.lift_axis); - } - } - cl /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area; - - return cl; -} - -linalg::alias::float3 BackendAVX2::compute_coefficient_cm( - const FlowData& flow, - const f32 area, - const f32 chord, - const u32 j, - const u32 n) -{ - assert(n > 0); - assert(j >= 0 and j+n <= mesh.ns); - linalg::alias::float3 cm(0.f, 0.f, 0.f); - - for (u32 u = 0; u < mesh.nc; u++) { - for (u32 v = j; v < j + n; v++) { - const u32 li = u * mesh.ns + v; // linear index - const linalg::alias::float3 v0 = mesh.get_v0(li); - const linalg::alias::float3 v1 = mesh.get_v1(li); - // Leading edge vector pointing outward from wing root - const linalg::alias::float3 dl = v1 - v0; - // Distance from the center of leading edge to the reference point - const linalg::alias::float3 dst_to_ref = mesh.ref_pt - 0.5f * (v0 + v1); - // Distance from the center of leading edge to the reference point - const linalg::alias::float3 force = linalg::cross(flow.freestream, dl) * flow.rho * delta_gamma[li]; - cm += linalg::cross(force, dst_to_ref); - } - } - cm /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area * chord; - return cm; -} - -f32 BackendAVX2::compute_coefficient_cd( - const FlowData& flow, - const f32 area, - const u32 j, - const u32 n) -{ - assert(n > 0); - assert(j >= 0 and j+n <= mesh.ns); - - f32 cd = 0.0f; - // Drag coefficent computed using Trefftz plane - const u32 begin = j + mesh.nb_panels_wing(); - const u32 end = begin + n; - // parallel for - for (u32 ia = begin; ia < end; ia++) { - const f32 colloc_x = mesh.colloc.x[ia]; - const f32 colloc_y = mesh.colloc.y[ia]; - const f32 colloc_z = mesh.colloc.z[ia]; - linalg::alias::float3 inf(0.f, 0.f, 0.f); - for (u32 ia2 = begin; ia2 < end; ia2++) { - linalg::alias::float3 inf2(0.f, 0.f, 0.f); - linalg::alias::float3 v0 = mesh.get_v0(ia2); - linalg::alias::float3 v1 = mesh.get_v1(ia2); - linalg::alias::float3 v2 = mesh.get_v2(ia2); - linalg::alias::float3 v3 = mesh.get_v3(ia2); - // Influence from the streamwise vortex lines - kernel_influence_scalar(inf2.x, inf2.y, inf2.z, colloc_x, colloc_y, colloc_z, v1.x, v1.y, v1.z, v2.x, v2.y, v2.z); - kernel_influence_scalar(inf2.x, inf2.y, inf2.z, colloc_x, colloc_y, colloc_z, v3.x, v3.y, v3.z, v0.x, v0.y, v0.z); - f32 gamma_w = gamma[(mesh.nc-1)*mesh.ns + ia2 % mesh.ns]; - // This is the induced velocity calculated with the vortex (gamma) calculated earlier (according to kutta condition) - inf += gamma_w * inf2; - } - const linalg::alias::float3 normal{mesh.normal.x[ia], mesh.normal.y[ia], mesh.normal.z[ia]}; - const f32 w_ind = linalg::dot(inf, normal); - const u32 col = ia % mesh.ns; - linalg::alias::float3 v0 = mesh.get_v0(ia); - linalg::alias::float3 v1 = mesh.get_v1(ia); - const f32 dl = linalg::length(v1 - v0); - cd -= 0.5f * flow.rho * gamma[(mesh.nc-1)*mesh.ns + col] * w_ind * dl; - } - cd /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area; - return cd; -} \ No newline at end of file diff --git a/vlm/backends/avx2/include/vlm_backend_avx2.hpp b/vlm/backends/cpu/include/vlm_backend_cpu.hpp similarity index 73% rename from vlm/backends/avx2/include/vlm_backend_avx2.hpp rename to vlm/backends/cpu/include/vlm_backend_cpu.hpp index 732c598..595bf46 100644 --- a/vlm/backends/avx2/include/vlm_backend_avx2.hpp +++ b/vlm/backends/cpu/include/vlm_backend_cpu.hpp @@ -5,25 +5,26 @@ namespace vlm { -class BackendAVX2 : public Backend { +class BackendCPU : public Backend { public: std::vector lhs; std::vector rhs; std::vector ipiv; std::vector gamma; std::vector delta_gamma; + std::vector trefftz_buffer; - BackendAVX2(Mesh& mesh); - ~BackendAVX2(); + BackendCPU(Mesh& mesh); + ~BackendCPU(); void reset() override; void compute_lhs(const FlowData& flow) override; void compute_rhs(const FlowData& flow) override; void compute_rhs(const FlowData& flow, const std::vector& section_alphas) override; void lu_factor() override; void lu_solve() override; - f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u32 j, const u32 n) override; - linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u32 j, const u32 n) override; - f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u32 j, const u32 n) override; + f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u64 j, const u64 n) override; + linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u64 j, const u64 n) override; + f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u64 j, const u64 n) override; void compute_delta_gamma() override; }; diff --git a/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp b/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp new file mode 100644 index 0000000..e879f37 --- /dev/null +++ b/vlm/backends/cpu/include/vlm_backend_cpu_kernels.hpp @@ -0,0 +1,13 @@ +#include "vlm_types.hpp" + +namespace vlm { +void kernel_influence( + u64 m, u64 n, + f32 lhs[], + f32 vx[], f32 vy[], f32 vz[], + f32 collocx[], f32 collocy[], f32 collocz[], + f32 normalx[], f32 normaly[], f32 normalz[], + u64 ia, u64 lidx, f32 sigma_p4 + ); +} + diff --git a/vlm/backends/cpu/src/vlm_backend_cpu.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp new file mode 100644 index 0000000..986cec6 --- /dev/null +++ b/vlm/backends/cpu/src/vlm_backend_cpu.cpp @@ -0,0 +1,229 @@ +#include "vlm_backend_cpu.hpp" +#include "vlm_backend_cpu_kernels.hpp" +#include "vlm_backend_cpu_kernels_ispc.h" + +#include "linalg.h" +#include "tinytimer.hpp" +#include "vlm_mesh.hpp" +#include "vlm_data.hpp" +#include "vlm_utils.hpp" +#include "vlm_executor.hpp" // includes taskflow/taskflow.hpp + +#include // std::fill +#include // std::printf + +#include +#include + +#include +#include + +using namespace vlm; + +BackendCPU::~BackendCPU() = default; // Destructor definition + +BackendCPU::BackendCPU(Mesh& mesh) : Backend(mesh) { + lhs.resize(mesh.nb_panels_wing() * mesh.nb_panels_wing()); + rhs.resize(mesh.nb_panels_wing()); + ipiv.resize(mesh.nb_panels_wing()); + gamma.resize(mesh.nb_panels_wing()); + delta_gamma.resize(mesh.nb_panels_wing()); + trefftz_buffer.resize(mesh.ns); +} + +void BackendCPU::reset() { + std::fill(gamma.begin(), gamma.end(), 0.0f); + std::fill(lhs.begin(), lhs.end(), 0.0f); + std::fill(rhs.begin(), rhs.end(), 0.0f); +} + +void BackendCPU::compute_delta_gamma() { + std::copy(gamma.begin(), gamma.begin()+mesh.ns, delta_gamma.begin()); + + // note: this is efficient as the memory is contiguous + for (u64 i = 1; i < mesh.nc; i++) { + for (u64 j = 0; j < mesh.ns; j++) { + delta_gamma[i*mesh.ns + j] = gamma[i*mesh.ns + j] - gamma[(i-1)*mesh.ns + j]; + } + } +} + +void BackendCPU::compute_lhs(const FlowData& flow) { + tiny::ScopedTimer timer("LHS"); + Mesh& m = mesh; + + ispc::MeshProxy mesh_proxy = { + m.ns, m.nc, m.nb_panels_wing(), + {m.v.x.data(), m.v.y.data(), m.v.z.data()}, + {m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data()}, + {m.normal.x.data(), m.normal.y.data(), m.normal.z.data()} + }; + + const u64 zero = 0; + const u64 end_wing = (m.nc - 1) * m.ns; + + tf::Taskflow taskflow; + + auto init = taskflow.placeholder(); + auto wing_pass = taskflow.for_each_index(zero, end_wing, [&] (u64 i) { + ispc::kernel_influence(mesh_proxy, lhs.data(), i, i, flow.sigma_vatistas); + }); + + u64 idx = m.nc - 1; + auto cond = taskflow.emplace([&]{ + return idx < m.nc + m.nw ? 0 : 1; // 0 means continue, 1 means break + }); + auto wake_pass = taskflow.for_each_index(zero, m.ns, [&] (u64 j) { + const u64 ia = (m.nc - 1) * m.ns + j; + const u64 lidx = idx * m.ns + j; + ispc::kernel_influence(mesh_proxy, lhs.data(), ia, lidx, flow.sigma_vatistas); + }); + auto back = taskflow.emplace([&]{ + idx++; + return 0; // 0 means continue + }); + auto sync = taskflow.placeholder(); + + init.precede(wing_pass, cond); + wing_pass.precede(sync); + cond.precede(wake_pass, sync); + wake_pass.precede(back); + back.precede(cond); + + Executor::get().run(taskflow).wait(); +} + +void kernel_cpu_rhs(u64 n, const float normal_x[], const float normal_y[], const float normal_z[], float freestream_x, float freestream_y, float freestream_z, float rhs[]) { + for (u64 i = 0; i < n; i++) { + rhs[i] = - (freestream_x * normal_x[i] + freestream_y * normal_y[i] + freestream_z * normal_z[i]); + } +} + +void BackendCPU::compute_rhs(const FlowData& flow) { + tiny::ScopedTimer timer("RHS"); + const Mesh& m = mesh; + + kernel_cpu_rhs(m.nb_panels_wing(), m.normal.x.data(), m.normal.y.data(), m.normal.z.data(), flow.freestream.x, flow.freestream.y, flow.freestream.z, rhs.data()); +} + +void BackendCPU::compute_rhs(const FlowData& flow, const std::vector& section_alphas) { + tiny::ScopedTimer timer("Rebuild RHS"); + assert(section_alphas.size() == mesh.ns); + const Mesh& m = mesh; + for (u64 i = 0; i < mesh.nc; i++) { + for (u64 j = 0; j < mesh.ns; j++) { + const u64 li = i * mesh.ns + j; // linear index + const linalg::alias::float3 freestream = compute_freestream(flow.u_inf, section_alphas[j], flow.beta); + rhs[li] = - (freestream.x * m.normal.x[li] + freestream.y * m.normal.y[li] + freestream.z * m.normal.z[li]); + } + } +} + +void BackendCPU::lu_factor() { + tiny::ScopedTimer timer("Factor"); + const int32_t n = static_cast(mesh.nb_panels_wing()); + LAPACKE_sgetrf(LAPACK_COL_MAJOR, n, n, lhs.data(), n, ipiv.data()); +} + +void BackendCPU::lu_solve() { + tiny::ScopedTimer timer("Solve"); + const int32_t n = static_cast(mesh.nb_panels_wing()); + std::copy(rhs.begin(), rhs.end(), gamma.begin()); + + LAPACKE_sgetrs(LAPACK_COL_MAJOR, 'N', n, 1, lhs.data(), n, ipiv.data(), gamma.data(), n); +} + +// f32 BackendCPU::compute_coefficient_cl(const FlowData& flow, const f32 area, +// const u64 j, const u64 n) { +// assert(n > 0); +// assert(j >= 0 && j+n <= mesh.ns); + +// f32 cl = 0.0f; + +// for (u64 u = 0; u < mesh.nc; u++) { +// for (u64 v = j; v < j + n; v++) { +// const u64 li = u * mesh.ns + v; // linear index +// const linalg::alias::float3 v0 = mesh.get_v0(li); +// const linalg::alias::float3 v1 = mesh.get_v1(li); +// // Leading edge vector pointing outward from wing root +// const linalg::alias::float3 dl = v1 - v0; +// // Distance from the center of leading edge to the reference point +// const linalg::alias::float3 force = linalg::cross(flow.freestream, dl) * flow.rho * delta_gamma[li]; +// cl += linalg::dot(force, flow.lift_axis); +// } +// } +// cl /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area; + +// return cl; +// } + +linalg::alias::float3 BackendCPU::compute_coefficient_cm( + const FlowData& flow, + const f32 area, + const f32 chord, + const u64 j, + const u64 n) +{ + assert(n > 0); + assert(j >= 0 && j+n <= mesh.ns); + linalg::alias::float3 cm(0.f, 0.f, 0.f); + + for (u64 u = 0; u < mesh.nc; u++) { + for (u64 v = j; v < j + n; v++) { + const u64 li = u * mesh.ns + v; // linear index + const linalg::alias::float3 v0 = mesh.get_v0(li); + const linalg::alias::float3 v1 = mesh.get_v1(li); + // Leading edge vector pointing outward from wing root + const linalg::alias::float3 dl = v1 - v0; + // Distance from the center of leading edge to the reference point + const linalg::alias::float3 dst_to_ref = mesh.ref_pt - 0.5f * (v0 + v1); + // Distance from the center of leading edge to the reference point + const linalg::alias::float3 force = linalg::cross(flow.freestream, dl) * flow.rho * delta_gamma[li]; + cm += linalg::cross(force, dst_to_ref); + } + } + cm /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area * chord; + return cm; +} + +f32 BackendCPU::compute_coefficient_cd( + const FlowData& flow, + const f32 area, + const u64 j, + const u64 n) +{ + tiny::ScopedTimer timer("Compute CD"); + assert(n > 0); + assert(j >= 0 && j+n <= mesh.ns); + std::fill(trefftz_buffer.begin(), trefftz_buffer.end(), 0.0f); + + Mesh& m = mesh; + ispc::MeshProxy mesh_proxy = { + m.ns, m.nc, m.nb_panels_wing(), + {m.v.x.data(), m.v.y.data(), m.v.z.data()}, + {m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data()}, + {m.normal.x.data(), m.normal.y.data(), m.normal.z.data()} + }; + f32 cd = ispc::kernel_trefftz_cd(mesh_proxy, gamma.data(), trefftz_buffer.data(), j, n, flow.sigma_vatistas); + cd /= linalg::length2(flow.freestream) * area; + return cd; +} + +// Using Trefftz plane +f32 BackendCPU::compute_coefficient_cl( + const FlowData& flow, + const f32 area, + const u64 j, + const u64 n) +{ + Mesh& m = mesh; + ispc::MeshProxy mesh_proxy = { + m.ns, m.nc, m.nb_panels_wing(), + {m.v.x.data(), m.v.y.data(), m.v.z.data()}, + {m.colloc.x.data(), m.colloc.y.data(), m.colloc.z.data()}, + {m.normal.x.data(), m.normal.y.data(), m.normal.z.data()} + }; + f32 cl = ispc::kernel_trefftz_cl(mesh_proxy, gamma.data(), j, n); + cl /= 0.5f * flow.rho * linalg::length2(flow.freestream) * area; + return cl; +} diff --git a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp new file mode 100644 index 0000000..fa7268a --- /dev/null +++ b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.cpp @@ -0,0 +1,69 @@ +#include "vlm_backend_cpu_kernels.hpp" +#include "linalg.h" +#include "vlm_types.hpp" +#include + +using namespace vlm; +using namespace linalg::alias; + +inline float3 kernel_biosavart(const float3& colloc, const float3& vertex1, const float3& vertex2, const f32& sigma) { + const f32 rcut = 1e-10f; // highly tuned value + const float3 r0 = vertex2 - vertex1; + const float3 r1 = colloc - vertex1; + const float3 r2 = colloc - vertex2; + // Katz Plotkin, Low speed Aero | Eq 10.115 + const float3 r1r2cross = linalg::cross(r1, r2); + const f32 r1_norm = linalg::length(r1); + const f32 r2_norm = linalg::length(r2); + const f32 square = linalg::length2(r1r2cross); + if ((r1_norm float3; + +#define DOT(v0, v1) (v0.x*v1.x + v0.y*v1.y + v0.z*v1.z) + +#define CROSS(v0, v1, res) \ + res.x = v0.y*v1.z - v0.z*v1.y; \ + res.y = v0.z*v1.x - v0.x*v1.z; \ + res.z = v0.x*v1.y - v0.y*v1.x; + +#define LENGTH2(v) (v.x*v.x + v.y*v.y + v.z*v.z) + +#define LENGTH(v) (sqrt(LENGTH2(v))) + +inline float dot(const float3& v0, const float3& v1) {return DOT(v0, v1);} +inline float dot(const float3& v0, const uniform float3& v1) {return DOT(v0, v1);} +inline float dot(const uniform float3& v0, const float3& v1) {return DOT(v0, v1);} +inline uniform float dot(const uniform float3& v0, const uniform float3& v1) {return DOT(v0, v1);} + +inline float3 cross(const float3& v0, const float3& v1) {float3 res; CROSS(v0, v1, res); return res;} +inline float3 cross(const float3& v0, const uniform float3& v1) {float3 res; CROSS(v0, v1, res); return res;} +inline float3 cross(const uniform float3& v0, const float3& v1) {float3 res; CROSS(v0, v1, res); return res;} +inline uniform float3 cross(const uniform float3& v0, const uniform float3& v1) {uniform float3 res; CROSS(v0, v1, res); return res;} + +inline float length2(const float3& v) {return LENGTH2(v);} +inline uniform float length2(const uniform float3& v) {return LENGTH2(v);} + +inline float length(const float3& v) {return LENGTH(v);} +inline uniform float length(const uniform float3& v) {return LENGTH(v);} + +// Aggregate Structures +export struct SoA3D { + uniform float* uniform x; + uniform float* uniform y; + uniform float* uniform z; +}; + +export struct MeshProxy { + uniform unsigned int64 ns; + uniform unsigned int64 nc; + uniform unsigned int64 nb_panels; + uniform SoA3D v; // vertices + uniform SoA3D colloc; // collocation points + uniform SoA3D normal; // normals +}; + +// Bio-savart Kernel +#define RCUT 1e-10f +#define RCUT2 1e-5f + +#define PI_f 3.141593f + +inline float3 kernel_biosavart(float3& colloc, const uniform float3& vertex1, const uniform float3& vertex2, const uniform float& sigma) { + uniform float3 r0 = vertex2 - vertex1; + float3 r1 = colloc - vertex1; + float3 r2 = colloc - vertex2; + // Katz Plotkin, Low speed Aero | Eq 10.115 + float3 r1r2cross = cross(r1, r2); + float r1_norm = length(r1); + float r2_norm = length(r2); + float square = length2(r1r2cross); + + if ((square& section_alphas) override; void lu_factor() override; void lu_solve() override; - f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u32 j, const u32 n) override; - linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u32 j, const u32 n) override; - f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u32 j, const u32 n) override; + f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u64 j, const u64 n) override; + linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u64 j, const u64 n) override; + f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u64 j, const u64 n) override; void compute_delta_gamma() override; }; diff --git a/vlm/backends/cuda/src/vlm_backend_cuda.cu b/vlm/backends/cuda/src/vlm_backend_cuda.cu index 38b47f1..eb7e08f 100644 --- a/vlm/backends/cuda/src/vlm_backend_cuda.cu +++ b/vlm/backends/cuda/src/vlm_backend_cuda.cu @@ -1,4 +1,4 @@ -#include "vlm_backend_avx2.hpp" +#include "vlm_backend_cpu.hpp" #include "vlm_backend_cuda.hpp" #include "simpletimer.hpp" @@ -128,7 +128,7 @@ BackendCUDA::~BackendCUDA() { CHECK_CUDA(cudaFree(d_delta_gamma)); } -// For the moment, cuda backend just falls back to AVX2 +// For the moment, cuda backend just falls back to cpu backend void BackendCUDA::reset() { default_backend.reset(); @@ -213,16 +213,16 @@ void BackendCUDA::lu_solve() { f32 BackendCUDA::compute_coefficient_cl( const FlowData& flow, const f32 area, - const u32 j, - const u32 n) { + const u64 j, + const u64 n) { return default_backend.compute_coefficient_cl(flow, area, j, n); } f32 BackendCUDA::compute_coefficient_cd( const FlowData& flow, const f32 area, - const u32 j, - const u32 n) { + const u64 j, + const u64 n) { return default_backend.compute_coefficient_cd(flow, area, j, n); } @@ -230,8 +230,8 @@ linalg::alias::float3 BackendCUDA::compute_coefficient_cm( const FlowData& flow, const f32 area, const f32 chord, - const u32 j, - const u32 n) { + const u64 j, + const u64 n) { return default_backend.compute_coefficient_cm(flow, area, chord, j, n); } diff --git a/vlm/backends/cuda/xmake.lua b/vlm/backends/cuda/xmake.lua index b776287..52224e2 100644 --- a/vlm/backends/cuda/xmake.lua +++ b/vlm/backends/cuda/xmake.lua @@ -4,7 +4,7 @@ target("backend-cuda") set_kind("static") set_default(false) add_packages("cuda") - add_deps("backend-avx2") + add_deps("backend-cpu") add_includedirs("../../include") add_files("src/*.cu") diff --git a/vlm/dev/main.cpp b/vlm/dev/main.cpp index 1810361..e40b066 100644 --- a/vlm/dev/main.cpp +++ b/vlm/dev/main.cpp @@ -2,6 +2,8 @@ #include "parser.hpp" #include "vlm_executor.hpp" #include "vlm_types.hpp" +#include "vlm_utils.hpp" + #include #include #include @@ -54,7 +56,7 @@ int main(int argc, char **argv) { }); try { - //vlm::Executor::instance(1); // 1 thread + // vlm::Executor::instance(1); // 1 thread LinearVLM solver(cfg); std::vector alphas = cfg().section("solver").get_vector("alphas", {0.0f}); std::transform(alphas.begin(), alphas.end(), alphas.begin(), @@ -65,7 +67,7 @@ int main(int argc, char **argv) { for (auto alpha : alphas) { FlowData flow(alpha, 0.0f, 1.0f, 1.0f); auto coeffs = solver.solve(flow); - std::printf(">>> Alpha: %.1f | CL = %.6f CD = %.6f CMx = %.6f CMy = %.6f CMz = %.6f\n", flow.alpha, coeffs.cl, coeffs.cd, coeffs.cm.x, coeffs.cm.y, coeffs.cm.z); + std::printf(">>> Alpha: %.1f | CL = %.6f CD = %.6f CMx = %.6f CMy = %.6f CMz = %.6f\n", vlm::to_degrees(flow.alpha), coeffs.cl, coeffs.cd, coeffs.cm.x, coeffs.cm.y, coeffs.cm.z); } // Pause for memory reading diff --git a/vlm/include/vlm.hpp b/vlm/include/vlm.hpp index 942b582..768c5e7 100644 --- a/vlm/include/vlm.hpp +++ b/vlm/include/vlm.hpp @@ -19,19 +19,19 @@ class Solver { class NonLinearVLM final: public Solver { public: const f64 tol; - const u32 max_iter; + const u64 max_iter; std::vector strip_alphas; NonLinearVLM( const tiny::Config& cfg, const f64 tol_= 1e-4f, - const u32 max_iter_ = 100 + const u64 max_iter_ = 100 ): Solver(cfg), tol(tol_), max_iter(max_iter_) {} - NonLinearVLM(const f64 tol_= 1e-4f, const u32 max_iter_ = 100) : tol(tol_), max_iter(max_iter_) {} + NonLinearVLM(const f64 tol_= 1e-4f, const u64 max_iter_ = 100) : tol(tol_), max_iter(max_iter_) {} ~NonLinearVLM() = default; AeroCoefficients solve(const FlowData& flow, const Database& db); }; diff --git a/vlm/include/vlm_backend.hpp b/vlm/include/vlm_backend.hpp index aea8091..b74a382 100644 --- a/vlm/include/vlm_backend.hpp +++ b/vlm/include/vlm_backend.hpp @@ -16,11 +16,11 @@ class Backend { virtual void compute_rhs(const FlowData& flow, const std::vector& section_alphas) = 0; virtual void lu_factor() = 0; virtual void lu_solve() = 0; - virtual f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u32 j, const u32 n) = 0; + virtual f32 compute_coefficient_cl(const FlowData& flow, const f32 area, const u64 j, const u64 n) = 0; f32 compute_coefficient_cl(const FlowData& flow); - virtual linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u32 j, const u32 n) = 0; + virtual linalg::alias::float3 compute_coefficient_cm(const FlowData& flow, const f32 area, const f32 chord, const u64 j, const u64 n) = 0; linalg::alias::float3 compute_coefficient_cm(const FlowData& flow); - virtual f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u32 j, const u32 n) = 0; + virtual f32 compute_coefficient_cd(const FlowData& flow, const f32 area, const u64 j, const u64 n) = 0; f32 compute_coefficient_cd(const FlowData& flow); virtual void compute_delta_gamma() = 0; virtual ~Backend() = default; diff --git a/vlm/include/vlm_executor.hpp b/vlm/include/vlm_executor.hpp index c6e7e2d..64b51ca 100644 --- a/vlm/include/vlm_executor.hpp +++ b/vlm/include/vlm_executor.hpp @@ -1,7 +1,6 @@ // This header should not be included by another header file, only by source files. #pragma once -#include #include // includes , , // Taskflow executor singleton wrapper (not thread-safe by design) diff --git a/vlm/include/vlm_mesh.hpp b/vlm/include/vlm_mesh.hpp index 48ff59d..91888c6 100644 --- a/vlm/include/vlm_mesh.hpp +++ b/vlm/include/vlm_mesh.hpp @@ -17,10 +17,10 @@ class Mesh { // (stored in span major order) SoA_3D_t v; // size (ncw+1)*(ns+1) // Offsets for indexing in connectivity array for each panel - std::vector offsets = {}; // size nc*ns + 1 + std::vector offsets = {}; // size nc*ns + 1 // Panel-vertex connectivity // vertices indices of a panel: connectivity[offsets[i]] to connectivity[offsets[i+1]] - std::vector connectivity = {}; // size 4*(nc*ns) + std::vector connectivity = {}; // size 4*(nc*ns) // Panels metrics (stored span major order) // --------------------- @@ -33,9 +33,9 @@ class Mesh { // Structured dimensions // --------------------- - u32 nc = 1; // number of panels chordwise - u32 ns = 1; // number of panels spanwise - const u32 nw = 1; // number of panels in chordwise wake + u64 nc = 1; // number of panels chordwise + u64 ns = 1; // number of panels spanwise + const u64 nw = 1; // number of panels in chordwise wake // Analytical quanities when provided // --------------------- @@ -48,22 +48,23 @@ class Mesh { void compute_connectivity(); // fill offsets, connectivity void compute_metrics_wing(); // fill colloc, normal, area void compute_metrics_wake(); - void compute_metrics_i(u32 i); - u32 nb_panels_wing() const; - u32 nb_panels_total() const; - u32 nb_vertices_wing() const; - u32 nb_vertices_total() const; - f32 panels_area(const u32 i, const u32 j, const u32 m, const u32 n) const; - f32 panels_area_xy(const u32 i, const u32 j, const u32 m, const u32 n) const; - f32 panel_width_y(const u32 i, const u32 j) const; - f32 chord_length(const u32 j) const; - f32 chord_mean(const u32 j, const u32 n) const; + void compute_metrics_i(u64 i); + u64 nb_panels_wing() const; + u64 nb_panels_total() const; + u64 nb_vertices_wing() const; + u64 nb_vertices_total() const; + f32 panels_area(const u64 i, const u64 j, const u64 m, const u64 n) const; + f32 panels_area_xy(const u64 i, const u64 j, const u64 m, const u64 n) const; + f32 panel_width_y(const u64 i, const u64 j) const; + f32 strip_width(const u64 j) const; + f32 chord_length(const u64 j) const; + f32 chord_mean(const u64 j, const u64 n) const; // i panel vertices coordinates - linalg::alias::float3 get_v0(u32 i) const; // upper left - linalg::alias::float3 get_v1(u32 i) const; // upper right - linalg::alias::float3 get_v2(u32 i) const; // lower right - linalg::alias::float3 get_v3(u32 i) const; // lower left + linalg::alias::float3 get_v0(u64 i) const; // upper left + linalg::alias::float3 get_v1(u64 i) const; // upper right + linalg::alias::float3 get_v2(u64 i) const; // lower right + linalg::alias::float3 get_v3(u64 i) const; // lower left void io_read(const std::string& filename); Mesh(const tiny::Config& cfg); diff --git a/vlm/include/vlm_types.hpp b/vlm/include/vlm_types.hpp index 0098f50..2d0a931 100644 --- a/vlm/include/vlm_types.hpp +++ b/vlm/include/vlm_types.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "linalg.h" #include "tinyconfig.hpp" @@ -27,6 +28,8 @@ using f64 = double; constexpr f64 PI_d = 3.14159265358979; constexpr f32 PI_f = 3.141593f; +constexpr f64 EPS_d = std::numeric_limits::epsilon(); +constexpr f32 EPS_f = std::numeric_limits::epsilon(); // Optimized n power function that uses template folding optimisations // to generate a series of mul instructions @@ -44,16 +47,16 @@ struct SoA_3D_t { std::vector x = {}; std::vector y = {}; std::vector z = {}; - u32 size = 0; + u64 size = 0; - void resize(u32 size_) { + void resize(u64 size_) { size = size_; x.resize(size); y.resize(size); z.resize(size); } - void reserve(u32 size_) { + void reserve(u64 size_) { size = size_; x.reserve(size); y.reserve(size); diff --git a/vlm/include/vlm_utils.hpp b/vlm/include/vlm_utils.hpp index f3a1376..9604cc6 100644 --- a/vlm/include/vlm_utils.hpp +++ b/vlm/include/vlm_utils.hpp @@ -9,4 +9,6 @@ linalg::alias::float3 compute_freestream(const f32 u_inf, const f32 alpha, const linalg::alias::float3 compute_lift_axis(const linalg::alias::float3& freestream_); linalg::alias::float3 compute_stream_axis(const linalg::alias::float3& freestream_); +f32 to_degrees(f32 radians); +f32 to_radians(f32 degrees); } \ No newline at end of file diff --git a/vlm/src/vlm.cpp b/vlm/src/vlm.cpp index 7d22a5f..aec90c7 100644 --- a/vlm/src/vlm.cpp +++ b/vlm/src/vlm.cpp @@ -14,7 +14,7 @@ using namespace vlm; Solver::Solver(const tiny::Config& cfg) { - std::string backend_name = cfg().section("solver").get("backend", "avx2"); + std::string backend_name = cfg().section("solver").get("backend", "cpu"); mesh = std::make_unique(cfg); backend = create_backend(backend_name, *mesh); } @@ -46,7 +46,7 @@ AeroCoefficients NonLinearVLM::solve(const FlowData& flow, const Database& db) { backend->compute_lhs(flow); // Create influence matrix backend->lu_factor(); // Factorize the influence matrix into LU form - for (u32 iter = 0; iter < max_iter && err > tol; iter++) { + for (u64 iter = 0; iter < max_iter && err > tol; iter++) { err = 0.0; // reset l1 error backend->compute_rhs(flow, strip_alphas); // Compute RHS using strip alphas backend->lu_solve(); // Solve for the gammas @@ -55,8 +55,8 @@ AeroCoefficients NonLinearVLM::solve(const FlowData& flow, const Database& db) { // parallel reduce // loop over the chordwise strips and apply Van Dam algorithm { - const tiny::Timer timer("Strip correction"); - for (u32 j = 0; j < mesh->ns; j++) { + const tiny::ScopedTimer timer("Strip correction"); + for (u64 j = 0; j < mesh->ns; j++) { const f32 strip_area = mesh->panels_area(0, j, mesh->nc, 1); const FlowData strip_flow = {strip_alphas[j], flow.beta, flow.u_inf, flow.rho}; const f32 strip_cl = backend->compute_coefficient_cl(strip_flow, strip_area, j, 1); diff --git a/vlm/src/vlm_backend.cpp b/vlm/src/vlm_backend.cpp index af16f5f..a905638 100644 --- a/vlm/src/vlm_backend.cpp +++ b/vlm/src/vlm_backend.cpp @@ -4,8 +4,8 @@ #include -#ifdef VLM_AVX2 -#include "vlm_backend_avx2.hpp" +#ifdef VLM_CPU +#include "vlm_backend_cpu.hpp" #endif #ifdef VLM_CUDA #include "vlm_backend_cuda.hpp" @@ -27,11 +27,11 @@ f32 Backend::compute_coefficient_cd(const FlowData& flow) { std::unique_ptr vlm::create_backend(const std::string& backend_name, Mesh& mesh) { tiny::CPUID cpuid; - cpuid.print_info(); + //cpuid.print_info(); - #ifdef VLM_AVX2 - if (backend_name == "avx2" && cpuid.has("AVX2")) { - return std::make_unique(mesh); + #ifdef VLM_CPU + if (backend_name == "cpu") { + return std::make_unique(mesh); } #endif #ifdef VLM_CUDA diff --git a/vlm/src/vlm_mesh.cpp b/vlm/src/vlm_mesh.cpp index fbd234f..09b3f92 100644 --- a/vlm/src/vlm_mesh.cpp +++ b/vlm/src/vlm_mesh.cpp @@ -49,7 +49,7 @@ void Mesh::init() { } void Mesh::alloc() { - const u32 ncw = nc + nw; + const u64 ncw = nc + nw; v.resize((ncw + 1) * (ns + 1)); offsets.resize(nc * ns + 1); connectivity.resize(4 * nc * ns); @@ -59,10 +59,10 @@ void Mesh::alloc() { area.resize(ncw * ns); } -u32 Mesh::nb_panels_wing() const { return nc * ns; }; -u32 Mesh::nb_panels_total() const { return (nc+nw) * ns; }; -u32 Mesh::nb_vertices_wing() const { return (nc + 1) * (ns + 1); }; -u32 Mesh::nb_vertices_total() const { return (nc + nw + 1) * (ns + 1); }; +u64 Mesh::nb_panels_wing() const { return nc * ns; }; +u64 Mesh::nb_panels_total() const { return (nc+nw) * ns; }; +u64 Mesh::nb_vertices_wing() const { return (nc + 1) * (ns + 1); }; +u64 Mesh::nb_vertices_total() const { return (nc + nw + 1) * (ns + 1); }; /// @brief Computes the mean chord of a set of panels /// @details @@ -72,13 +72,13 @@ u32 Mesh::nb_vertices_total() const { return (nc + nw + 1) * (ns + 1); }; /// @param j first panel index chordwise /// @param n number of panels spanwise /// @return mean chord of the set of panels -f32 Mesh::chord_mean(const u32 j, const u32 n) const { +f32 Mesh::chord_mean(const u64 j, const u64 n) const { assert(j + n <= ns+1); // spanwise range assert(n > 1); // minimum 2 chords f32 mac = 0.0f; // loop over panel chordwise sections in spanwise direction // Note: can be done optimally with vertical fused simd - for (u32 v = 0; v < n - 1; v++) { + for (u64 v = 0; v < n - 1; v++) { const f32 c0 = chord_length(j + v); const f32 c1 = chord_length(j + v + 1); mac += 0.5f * (c0 * c0 + c1 * c1) * panel_width_y(0, j + v); @@ -93,16 +93,16 @@ f32 Mesh::chord_mean(const u32 j, const u32 n) const { /// @param m number of panels chordwise /// @param n number of panels spanwise /// @return sum of the areas of the panels -f32 Mesh::panels_area(const u32 i, const u32 j, const u32 m, const u32 n) const { +f32 Mesh::panels_area(const u64 i, const u64 j, const u64 m, const u64 n) const { assert(i + m <= nc); assert(j + n <= ns); - const u32 ld = ns; + const u64 ld = ns; const f32* areas = &area[j + i * ld]; f32 total_area = 0.0f; - for (u32 u = 0; u < m; u++) { - for (u32 v = 0; v < n; v++) { + for (u64 u = 0; u < m; u++) { + for (u64 v = 0; v < n; v++) { total_area += areas[v + u * ld]; } } @@ -115,7 +115,7 @@ f32 Mesh::panels_area(const u32 i, const u32 j, const u32 m, const u32 n) const /// @param m number of panels chordwise /// @param n number of panels spanwise /// @return sum of the areas of the panels projected on the xy plane -f32 Mesh::panels_area_xy(const u32 i, const u32 j, const u32 m, const u32 n) const { +f32 Mesh::panels_area_xy(const u64 i, const u64 j, const u64 m, const u64 n) const { assert(i + m <= nc); assert(j + n <= ns); @@ -123,9 +123,9 @@ f32 Mesh::panels_area_xy(const u32 i, const u32 j, const u32 m, const u32 n) con // 0.5 * || d1 x d2 || where d1 and d2 are the diagonals of the quad f32 total_area = 0.0f; // Note: this is highly inefficient as vertex coordinates should be loaded with simd - for (u32 u = 0; u < m; u++) { - for (u32 v = 0; v < n; v++) { - const u32 idx = j + v + (i+u) * ns; + for (u64 u = 0; u < m; u++) { + for (u64 v = 0; v < n; v++) { + const u64 idx = j + v + (i+u) * ns; const linalg::alias::float3 d1 = get_v0(idx) - get_v2(idx); const linalg::alias::float3 d2 = get_v1(idx) - get_v3(idx); const linalg::alias::float3 cross = linalg::cross(d1, d2); @@ -141,19 +141,26 @@ f32 Mesh::panels_area_xy(const u32 i, const u32 j, const u32 m, const u32 n) con /// @brief Computes the width of a single panel in pure y direction /// @param i panel index chordwise /// @param j panel index spanwise -f32 Mesh::panel_width_y(const u32 i, const u32 j) const { +f32 Mesh::panel_width_y(const u64 i, const u64 j) const { // Since chordwise segments are always parallel, we can simply measure the width of the first panel assert(i < nc); assert(j < ns); - const u32 ld = ns + 1; + const u64 ld = ns + 1; return v.y[j + 1 + i * ld] - v.y[j + i * ld]; } +f32 Mesh::strip_width(const u64 j) const { + assert(j < ns); + const linalg::alias::float3 v0 = get_v0(j); + const linalg::alias::float3 v1 = get_v1(j); + return linalg::length(v1 - v0); +} + /// @brief Computes the chord length of a chordwise segment /// @details Since the mesh follows the camber line, the chord length is computed /// as the distance between the first and last vertex of a chordwise segment /// @param j chordwise segment index -f32 Mesh::chord_length(const u32 j) const { +f32 Mesh::chord_length(const u64 j) const { assert(j <= ns); // spanwise vertex range const f32 dx = v.x[j + nc * (ns+1)] - v.x[j]; const f32 dy = 0.0f; // chordwise segments are parallel to the x axis @@ -162,23 +169,23 @@ f32 Mesh::chord_length(const u32 j) const { return std::sqrt(dx * dx + dy * dy + dz * dz); } -linalg::alias::float3 Mesh::get_v0(u32 i) const { - const u32 idx = i + i / ns; +linalg::alias::float3 Mesh::get_v0(u64 i) const { + const u64 idx = i + i / ns; return {v.x[idx], v.y[idx], v.z[idx]}; } -linalg::alias::float3 Mesh::get_v1(u32 i) const { - const u32 idx = i + i / ns + 1; +linalg::alias::float3 Mesh::get_v1(u64 i) const { + const u64 idx = i + i / ns + 1; return {v.x[idx], v.y[idx], v.z[idx]}; } -linalg::alias::float3 Mesh::get_v2(u32 i) const { - const u32 idx = i + i / ns + ns + 2; +linalg::alias::float3 Mesh::get_v2(u64 i) const { + const u64 idx = i + i / ns + ns + 2; return {v.x[idx], v.y[idx], v.z[idx]}; } -linalg::alias::float3 Mesh::get_v3(u32 i) const { - const u32 idx = i + i / ns + ns + 1; +linalg::alias::float3 Mesh::get_v3(u64 i) const { + const u64 idx = i + i / ns + ns + 1; return {v.x[idx], v.y[idx], v.z[idx]}; } @@ -188,12 +195,12 @@ void Mesh::update_wake(const linalg::alias::float3& freestream) { const f32 off_y = freestream.y * 100.0f * chord_root; const f32 off_z = freestream.z * 100.0f * chord_root; - const u32 v_ns = ns + 1; - const u32 begin_trailing_edge = nb_vertices_wing()-v_ns; - const u32 end_trailing_edge = nb_vertices_wing(); + const u64 v_ns = ns + 1; + const u64 begin_trailing_edge = nb_vertices_wing()-v_ns; + const u64 end_trailing_edge = nb_vertices_wing(); // Add one layer of wake vertices // this can be parallelized (careful to false sharing tho) - for (u32 i = begin_trailing_edge; i < end_trailing_edge; ++i) { + for (u64 i = begin_trailing_edge; i < end_trailing_edge; ++i) { v.x[i + v_ns] = v.x[i] + off_x; v.y[i + v_ns] = v.y[i] + off_y; v.z[i + v_ns] = v.z[i] + off_z; @@ -205,7 +212,7 @@ void Mesh::update_wake(const linalg::alias::float3& freestream) { void Mesh::correction_high_aoa(f32 alpha_rad) { const f32 factor = 0.5f * alpha_rad / (std::sin(alpha_rad) + EPS); // correction factor // Note: this can be vectorized and parallelized - for (u32 i = 0; i < nb_panels_total(); i++) { + for (u64 i = 0; i < nb_panels_total(); i++) { // "chord vector" from center of leading line (v0-v1) to trailing line (v3-v2) const linalg::alias::float3 chord_vec = 0.5f * (get_v2(i) + get_v3(i) - get_v0(i) - get_v1(i)); const linalg::alias::float3 colloc_pt = 0.5f * (get_v0(i) + get_v1(i)) + factor * chord_vec; @@ -222,13 +229,13 @@ void Mesh::compute_connectivity() { // chord(x) 0--1 // | | | // \/ 3--2 - std::array quad = {}; + std::array quad = {}; // Note: with some modifications this loop can be parallelized - for (u32 i = 0; i < nb_panels_wing(); i++) { + for (u64 i = 0; i < nb_panels_wing(); i++) { // extra offset occuring from the fact that there is 1 more vertex // per row than surfaces - u32 chord_idx = i / nc; + u64 chord_idx = i / nc; quad[0] = i + chord_idx; quad[1] = quad[0] + 1; quad[2] = quad[1] + ns; @@ -238,7 +245,7 @@ void Mesh::compute_connectivity() { } } -void Mesh::compute_metrics_i(u32 i ) { +void Mesh::compute_metrics_i(u64 i ) { const linalg::alias::float3 v0 = get_v0(i); const linalg::alias::float3 v1 = get_v1(i); const linalg::alias::float3 v2 = get_v2(i); @@ -274,13 +281,13 @@ void Mesh::compute_metrics_i(u32 i ) { } void Mesh::compute_metrics_wing() { - for (u32 i = 0; i < nb_panels_wing(); i++) { + for (u64 i = 0; i < nb_panels_wing(); i++) { compute_metrics_i(i); } } void Mesh::compute_metrics_wake() { - for (u32 i = nb_panels_wing(); i < nb_panels_total(); i++) { + for (u64 i = nb_panels_wing(); i < nb_panels_total(); i++) { compute_metrics_i(i); } } @@ -288,10 +295,10 @@ void Mesh::compute_metrics_wake() { // plot3d is chordwise major void Mesh::io_read_plot3d_structured(std::ifstream& f) { std::cout << "reading plot3d mesh" << std::endl; - u32 ni = 0; // number of vertices chordwise - u32 nj = 0; // number of vertices spanwise - u32 nk = 0; - u32 blocks = 0; + u64 ni = 0; // number of vertices chordwise + u64 nj = 0; // number of vertices spanwise + u64 nk = 0; + u64 blocks = 0; f32 x, y, z; f >> blocks; if (blocks != 1) { @@ -310,20 +317,20 @@ void Mesh::io_read_plot3d_structured(std::ifstream& f) { std::cout << "ns: " << ns << std::endl; std::cout << "nc: " << nc << std::endl; - for (u32 j = 0; j < nj; j++) { - for (u32 i = 0; i < ni; i++) { + for (u64 j = 0; j < nj; j++) { + for (u64 i = 0; i < ni; i++) { f >> x; v.x[nj*i + j] = x; } } - for (u32 j = 0; j < nj; j++) { - for (u32 i = 0; i < ni; i++) { + for (u64 j = 0; j < nj; j++) { + for (u64 i = 0; i < ni; i++) { f >> y; v.y[nj*i + j] = y; } } - for (u32 j = 0; j < nj; j++) { - for (u32 i = 0; i < ni; i++) { + for (u64 j = 0; j < nj; j++) { + for (u64 i = 0; i < ni; i++) { f >> z; v.z[nj*i + j] = z; } @@ -336,7 +343,7 @@ void Mesh::io_read_plot3d_structured(std::ifstream& f) { // throw std::runtime_error("First vertex of plot3d mesh must be at origin"); // } f32 first_y = v.y[0]; - for (u32 i = 1; i < ni; i++) { + for (u64 i = 1; i < ni; i++) { if (v.y[i * nj] != first_y) { throw std::runtime_error("Mesh vertices should be ordered in chordwise direction"); } diff --git a/vlm/src/vlm_utils.cpp b/vlm/src/vlm_utils.cpp index 55c327b..38393e9 100644 --- a/vlm/src/vlm_utils.cpp +++ b/vlm/src/vlm_utils.cpp @@ -1,5 +1,7 @@ #include "vlm_utils.hpp" +using namespace vlm; + linalg::alias::float3 vlm::compute_freestream(const f32 u_inf, const f32 alpha, const f32 beta) { return linalg::alias::float3{ u_inf * std::cos(alpha) * std::cos(beta), @@ -14,4 +16,12 @@ linalg::alias::float3 vlm::compute_lift_axis(const linalg::alias::float3& freest linalg::alias::float3 vlm::compute_stream_axis(const linalg::alias::float3& freestream_) { return linalg::normalize(freestream_); +} + +f32 vlm::to_degrees(f32 radians) { + return radians * 180.0f / PI_f; +} + +f32 vlm::to_radians(f32 degrees) { + return degrees * PI_f / 180.0f; } \ No newline at end of file diff --git a/xmake.lua b/xmake.lua index 6e05785..9b6f440 100644 --- a/xmake.lua +++ b/xmake.lua @@ -18,7 +18,7 @@ set_languages("c++17", "c99") set_runtimes("MD") -- msvc runtime library (MD/MT/MDd/MTd) -- Define backends and helper functions -backends = {"cuda", "avx2"} +backends = {"cuda", "cpu"} backend_includes = function(name) return "vlm/backends/" .. name .. "/xmake.lua" end backend_defines = function(name) return "VLM_" .. name:upper() end backend_deps = function(name) return "backend-" .. name end @@ -31,7 +31,7 @@ add_includedirs("headeronly", {public = true}) -- must be set before options for _,name in ipairs(backends) do -- Create option option(backend_option(name)) - set_default(name == "avx2") -- set default to true for avx2, false otherwise + set_default(name == "cpu") -- set default to true for cpu, false otherwise set_showmenu(true) option_end() -- Add option includes