From c3bad58f0121e06ba9772758efd16433bb9e6252 Mon Sep 17 00:00:00 2001 From: Ashley Brighthope Date: Sat, 16 Sep 2023 18:28:15 +1000 Subject: [PATCH 1/7] cpu_features: Update submodule pointer and new CMake target name - Update cpu_features to v0.9.0 - Added BUILD_EXECUTABLE for cpu_features to build list_cpu_features for github workflow tests - Added ENABLE_INSTALL to install cpu_feature for old behaviour - Renamed CpuFeature -> CpuFeatures to match new name Signed-off-by: Ashley Brighthope --- .github/workflows/run-tests.yml | 8 ++++---- CMakeLists.txt | 1 + cpu_features | 2 +- lib/CMakeLists.txt | 4 ++-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 1b7024d1f..ce318d2a4 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -44,7 +44,7 @@ jobs: env: CC: ${{ matrix.compiler.cc }} CXX: ${{ matrix.compiler.cxx }} - run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" .. + run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. - name: Build run: | echo "Build with $(nproc) thread(s)" @@ -150,7 +150,7 @@ jobs: run: | cd /volk cd build - cmake -DCMAKE_CXX_FLAGS="-Werror" .. + cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. echo "Build with $(nproc) thread(s)" make -j$(nproc) ./cpu_features/list_cpu_features @@ -173,7 +173,7 @@ jobs: - name: dependencies run: sudo apt install python3-mako liborc-dev - name: configure - run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True .. + run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True -DBUILD_EXECUTABLE=ON .. - name: build run: cmake --build build -j$(nproc) - name: Print info @@ -248,7 +248,7 @@ jobs: - name: dependencies run: pip3 install mako - name: configure - run: mkdir build && cd build && cmake .. + run: mkdir build && cd build && cmake -DBUILD_EXECUTABLE=ON .. - name: build run: cmake --build build --config Debug -j3 - name: Print info diff --git a/CMakeLists.txt b/CMakeLists.txt index 92d5097fa..bbf1e2f56 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,6 +144,7 @@ if (VOLK_CPU_FEATURES) FORCE) set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}") set(BUILD_SHARED_LIBS OFF) + set(ENABLE_INSTALL ON) add_subdirectory(cpu_features) set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}") endif() diff --git a/cpu_features b/cpu_features index 41e206e43..ba4bffa86 160000 --- a/cpu_features +++ b/cpu_features @@ -1 +1 @@ -Subproject commit 41e206e435b3c84a6fdd937dfe2a07e8ee73e611 +Subproject commit ba4bffa86cbb5456bdb34426ad22b9551278e2c0 diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index f2d6b7e7c..b5b165fe6 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -548,7 +548,7 @@ endif() add_library(volk SHARED $) target_link_libraries(volk PUBLIC ${volk_libraries}) if(VOLK_CPU_FEATURES) - target_link_libraries(volk PRIVATE CpuFeature::cpu_features) + target_link_libraries(volk PRIVATE CpuFeatures::cpu_features) endif() target_include_directories(volk PUBLIC $ @@ -591,7 +591,7 @@ if(ENABLE_STATIC_LIBS) add_library(volk_static STATIC $) target_link_libraries(volk_static PUBLIC ${volk_libraries}) if(VOLK_CPU_FEATURES) - target_link_libraries(volk_static PRIVATE CpuFeature::cpu_features) + target_link_libraries(volk_static PRIVATE CpuFeatures::cpu_features) endif() if(ORC_FOUND) target_link_libraries(volk_static PUBLIC ${ORC_LIBRARIES_STATIC}) From 6383ddb722da75d7e50516c6bc2562fefb35aff8 Mon Sep 17 00:00:00 2001 From: Ashley Brighthope Date: Sat, 16 Sep 2023 18:56:22 +1000 Subject: [PATCH 2/7] cmake: Removed duplicated logic The else path can not be reached since the root CMake file aborts if CpuFeatures_FOUND if false Signed-off-by: Ashley Brighthope --- lib/CMakeLists.txt | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index b5b165fe6..ca39f99e2 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -521,15 +521,9 @@ target_include_directories(volk_obj ) if(VOLK_CPU_FEATURES) set_source_files_properties(volk_cpu.c PROPERTIES COMPILE_DEFINITIONS "VOLK_CPU_FEATURES=1") - if(CpuFeatures_FOUND) - target_include_directories(volk_obj - PRIVATE $ - ) - else() - target_include_directories(volk_obj - PRIVATE $ - ) - endif() + target_include_directories(volk_obj + PRIVATE $ + ) endif() #Configure object target properties From 49c158144588a3b965558aff09926a4b5b5b3982 Mon Sep 17 00:00:00 2001 From: Ashley Brighthope Date: Wed, 20 Sep 2023 19:36:36 +1000 Subject: [PATCH 3/7] cmake: Do not install cpu_features with volk Signed-off-by: Ashley Brighthope --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bbf1e2f56..be1dd6ebc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,7 +144,7 @@ if (VOLK_CPU_FEATURES) FORCE) set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}") set(BUILD_SHARED_LIBS OFF) - set(ENABLE_INSTALL ON) + set(ENABLE_INSTALL OFF) add_subdirectory(cpu_features) set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}") endif() From 492568169a38b246576c0b9e71f00e5a46481eff Mon Sep 17 00:00:00 2001 From: Andrej Rode Date: Sat, 23 Sep 2023 14:15:10 +0200 Subject: [PATCH 4/7] cmake: Link to cpu_features only in BUILD_INTERFACE This avoids issues in the static build since otherwise cmake will add cpu_features to the export set. Signed-off-by: Andrej Rode --- lib/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index ca39f99e2..a27be22b4 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -542,7 +542,7 @@ endif() add_library(volk SHARED $) target_link_libraries(volk PUBLIC ${volk_libraries}) if(VOLK_CPU_FEATURES) - target_link_libraries(volk PRIVATE CpuFeatures::cpu_features) + target_link_libraries(volk PRIVATE $) endif() target_include_directories(volk PUBLIC $ @@ -585,7 +585,7 @@ if(ENABLE_STATIC_LIBS) add_library(volk_static STATIC $) target_link_libraries(volk_static PUBLIC ${volk_libraries}) if(VOLK_CPU_FEATURES) - target_link_libraries(volk_static PRIVATE CpuFeatures::cpu_features) + target_link_libraries(volk_static PRIVATE $) endif() if(ORC_FOUND) target_link_libraries(volk_static PUBLIC ${ORC_LIBRARIES_STATIC}) From ba9a6f22b217e61de35c6c586753dee70513fff2 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Thu, 21 Sep 2023 17:46:40 +0200 Subject: [PATCH 5/7] new kernels Signed-off-by: Magnus Lundmark --- CMakeLists.txt | 2 + include/volk/volk_avx2_fma_intrinsics.h | 50 ++ include/volk/volk_avx_intrinsics.h | 38 ++ include/volk/volk_sse_intrinsics.h | 38 ++ kernels/volk/volk_32f_atan_32f.h | 594 +++++++++++------------- lib/kernel_tests.h | 3 +- 6 files changed, 403 insertions(+), 322 deletions(-) create mode 100644 include/volk/volk_avx2_fma_intrinsics.h diff --git a/CMakeLists.txt b/CMakeLists.txt index be1dd6ebc..d9141b154 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,6 @@ # # Copyright 2011-2020 Free Software Foundation, Inc. +# Copyright 2023 Magnus Lundmark # # This file is part of VOLK # @@ -249,6 +250,7 @@ install(FILES ${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h + ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h new file mode 100644 index 000000000..1027200f6 --- /dev/null +++ b/include/volk/volk_avx2_fma_intrinsics.h @@ -0,0 +1,50 @@ +/* -*- c++ -*- */ +/* + * Copyright 2023 Magnus Lundmark + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/* + * This file is intended to hold AVX2 FMA intrinsics of intrinsics. + * They should be used in VOLK kernels to avoid copy-paste. + */ + +#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ +#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ +#include + +/* + * Approximate arctan(x) via polynomial expansion + * on the interval [-1, 1] + * + * Maximum relative error ~6.5e-7 + * Polynomial evaluated via Horner's method + */ +static inline __m256 _m256_arctan_approximation_avx2_fma(const __m256 x) +{ + const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); + const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); + const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f); + const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f); + const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f); + const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f); + const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f); + + const __m256 x_times_x = _mm256_mul_ps(x, x); + __m256 arctan; + arctan = a13; + arctan = _mm256_fmadd_ps(x_times_x, arctan, a11); + arctan = _mm256_fmadd_ps(x_times_x, arctan, a9); + arctan = _mm256_fmadd_ps(x_times_x, arctan, a7); + arctan = _mm256_fmadd_ps(x_times_x, arctan, a5); + arctan = _mm256_fmadd_ps(x_times_x, arctan, a3); + arctan = _mm256_fmadd_ps(x_times_x, arctan, a1); + arctan = _mm256_mul_ps(x, arctan); + + return arctan; +} + +#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */ diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h index 0de88d84d..1277a053c 100644 --- a/include/volk/volk_avx_intrinsics.h +++ b/include/volk/volk_avx_intrinsics.h @@ -1,6 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2015 Free Software Foundation, Inc. + * Copyright 2023 Magnus Lundmark * * This file is part of VOLK * @@ -16,6 +17,43 @@ #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ #include +/* + * Approximate arctan(x) via polynomial expansion + * on the interval [-1, 1] + * + * Maximum relative error ~6.5e-7 + * Polynomial evaluated via Horner's method + */ +static inline __m256 _m256_arctan_approximation_avx(const __m256 x) +{ + const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); + const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); + const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f); + const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f); + const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f); + const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f); + const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f); + + const __m256 x_times_x = _mm256_mul_ps(x, x); + __m256 arctan; + arctan = a13; + arctan = _mm256_mul_ps(x_times_x, arctan); + arctan = _mm256_add_ps(arctan, a11); + arctan = _mm256_mul_ps(x_times_x, arctan); + arctan = _mm256_add_ps(arctan, a9); + arctan = _mm256_mul_ps(x_times_x, arctan); + arctan = _mm256_add_ps(arctan, a7); + arctan = _mm256_mul_ps(x_times_x, arctan); + arctan = _mm256_add_ps(arctan, a5); + arctan = _mm256_mul_ps(x_times_x, arctan); + arctan = _mm256_add_ps(arctan, a3); + arctan = _mm256_mul_ps(x_times_x, arctan); + arctan = _mm256_add_ps(arctan, a1); + arctan = _mm256_mul_ps(x, arctan); + + return arctan; +} + static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y) { __m256 yl, yh, tmp1, tmp2; diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h index ce900651c..2cd0dd017 100644 --- a/include/volk/volk_sse_intrinsics.h +++ b/include/volk/volk_sse_intrinsics.h @@ -1,6 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2015 Free Software Foundation, Inc. + * Copyright 2023 Magnus Lundmark * * This file is part of VOLK * @@ -16,6 +17,43 @@ #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ #include +/* + * Approximate arctan(x) via polynomial expansion + * on the interval [-1, 1] + * + * Maximum relative error ~6.5e-7 + * Polynomial evaluated via Horner's method + */ +static inline __m128 _mm_arctan_approximation_sse(const __m128 x) +{ + const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f); + const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f); + const __m128 a5 = _mm_set1_ps(+0x1.972be6p-3f); + const __m128 a7 = _mm_set1_ps(-0x1.1436ap-3f); + const __m128 a9 = _mm_set1_ps(+0x1.5785aap-4f); + const __m128 a11 = _mm_set1_ps(-0x1.2f3004p-5f); + const __m128 a13 = _mm_set1_ps(+0x1.01a37cp-7f); + + const __m128 x_times_x = _mm_mul_ps(x, x); + __m128 arctan; + arctan = a13; + arctan = _mm_mul_ps(x_times_x, arctan); + arctan = _mm_add_ps(arctan, a11); + arctan = _mm_mul_ps(x_times_x, arctan); + arctan = _mm_add_ps(arctan, a9); + arctan = _mm_mul_ps(x_times_x, arctan); + arctan = _mm_add_ps(arctan, a7); + arctan = _mm_mul_ps(x_times_x, arctan); + arctan = _mm_add_ps(arctan, a5); + arctan = _mm_mul_ps(x_times_x, arctan); + arctan = _mm_add_ps(arctan, a3); + arctan = _mm_mul_ps(x_times_x, arctan); + arctan = _mm_add_ps(arctan, a1); + arctan = _mm_mul_ps(x, arctan); + + return arctan; +} + static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2) { __m128 iValue, qValue; diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index 96078f4a1..f20ff1a4f 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -1,6 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2014 Free Software Foundation, Inc. + * Copyright 2023 Magnus Lundmark * * This file is part of VOLK * @@ -53,210 +54,241 @@ * volk_free(out); * \endcode */ - -#include #include -#include -/* This is the number of terms of Taylor series to evaluate, increase this for more - * accuracy*/ -#define TERMS 2 +#define POLY_ORDER (13) // Use either 11, 12, 13 or 15 +/* + * arctan(x) polynomial expansion on the interval [-1, 1] + */ +#if (POLY_ORDER == 11) +static inline float arctan_approximation(const float x) +{ + /* + * Max relative error < 4.4e-6 + */ + const float a1 = +0x1.ffff6ep-1f; + const float a3 = -0x1.54fca2p-2f; + const float a5 = +0x1.90aaa2p-3f; + const float a7 = -0x1.f09d2ep-4f; + const float a9 = +0x1.d6e42cp-5f; + const float a11 = -0x1.b9c81ep-7f; + + const float x_times_x = x * x; + float arctan = a11; + arctan = fmaf(x_times_x, arctan, a9); + arctan = fmaf(x_times_x, arctan, a7); + arctan = fmaf(x_times_x, arctan, a5); + arctan = fmaf(x_times_x, arctan, a3); + arctan = fmaf(x_times_x, arctan, a1); + arctan *= x; + + return arctan; +} +#elif (POLY_ORDER == 12) // Order 13 with a1 set to 1 +static inline float arctan_approximation(const float x) +{ + /* + * Max relative error < 7.5e-7 + */ + // a1 == 1 implicitly + const float a3 = -0x1.5548a4p-2f; + const float a5 = +0x1.978224p-3f; + const float a7 = -0x1.156488p-3f; + const float a9 = +0x1.5b822cp-4f; + const float a11 = -0x1.35a172p-5f; + const float a13 = +0x1.09a14ep-7f; + + const float x_times_x = x * x; + float arctan = a13; + arctan = fmaf(x_times_x, arctan, a11); + arctan = fmaf(x_times_x, arctan, a9); + arctan = fmaf(x_times_x, arctan, a7); + arctan = fmaf(x_times_x, arctan, a5); + arctan = fmaf(x_times_x, arctan, a3); + arctan *= x_times_x; + arctan = fmaf(x, arctan, x); + + return arctan; +} +#elif (POLY_ORDER == 13) +static inline float arctan_approximation(const float x) +{ + /* + * Max relative error < 6.6e-7 + */ + const float a1 = +0x1.ffffeap-1f; + const float a3 = -0x1.55437p-2f; + const float a5 = +0x1.972be6p-3f; + const float a7 = -0x1.1436ap-3f; + const float a9 = +0x1.5785aap-4f; + const float a11 = -0x1.2f3004p-5f; + const float a13 = +0x1.01a37cp-7f; + + const float x_times_x = x * x; + float arctan = a13; + arctan = fmaf(x_times_x, arctan, a11); + arctan = fmaf(x_times_x, arctan, a9); + arctan = fmaf(x_times_x, arctan, a7); + arctan = fmaf(x_times_x, arctan, a5); + arctan = fmaf(x_times_x, arctan, a3); + arctan = fmaf(x_times_x, arctan, a1); + arctan *= x; + + return arctan; +} +#elif (POLY_ORDER == 15) +static inline float arctan_approximation(const float x) +{ + /* + * Max relative error < 1.0e-7 + */ + const float a1 = +0x1.fffffcp-1f; + const float a3 = -0x1.55519ep-2f; + const float a5 = +0x1.98f6a8p-3f; + const float a7 = -0x1.1f0a92p-3f; + const float a9 = +0x1.95b654p-4f; + const float a11 = -0x1.e65492p-5f; + const float a13 = +0x1.8c0c36p-6f; + const float a15 = -0x1.32316ep-8f; + + const float x_times_x = x * x; + float arctan = a15; + arctan = fmaf(x_times_x, arctan, a13); + arctan = fmaf(x_times_x, arctan, a11); + arctan = fmaf(x_times_x, arctan, a9); + arctan = fmaf(x_times_x, arctan, a7); + arctan = fmaf(x_times_x, arctan, a5); + arctan = fmaf(x_times_x, arctan, a3); + arctan = fmaf(x_times_x, arctan, a1); + arctan *= x; + + return arctan; +} +#else +#error Undefined polynomial order. +#endif #ifndef INCLUDED_volk_32f_atan_32f_a_H #define INCLUDED_volk_32f_atan_32f_a_H +static inline float arctan(const float x) +{ + /* + * arctan(x) + arctan(1 / x) == sign(x) * pi / 2 + */ + const float pi_over_2 = 0x1.921fb6p0f; + + if (fabs(x) < 1.f) { + return arctan_approximation(x); + } else { + return copysignf(pi_over_2, x) - arctan_approximation(1.f / x); + } +} + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include - -static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector, - const float* aVector, - unsigned int num_points) +#include +static inline void +volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; + const __m256 one = _mm256_set1_ps(1.f); + const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f); + const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); + const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, pio2, x, y, z, arctangent; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for (; number < eighthPoints; number++) { - aVal = _mm256_load_ps(aPtr); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps( - z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for (i = 0; i < 2; i++) { - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); - } - x = _mm256_div_ps(fones, x); - y = fzeroes; - for (j = TERMS - 1; j >= 0; j--) { - y = _mm256_fmadd_ps( - y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); - } - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); - arctangent = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arctangent = _mm256_sub_ps( - arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); - - _mm256_store_ps(bPtr, arctangent); - aPtr += 8; - bPtr += 8; + unsigned int eighth_points = num_points / 8; + for (; number < eighth_points; number++) { + __m256 x = _mm256_load_ps(in); + __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); + __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), + _mm256_blendv_ps(one, x, swap_mask)); + __m256 result = _m256_arctan_approximation_avx2_fma(x_star); + __m256 term = _mm256_and_ps(x_star, sign_mask); + term = _mm256_or_ps(pi_over_2, term); + term = _mm256_sub_ps(term, result); + result = _mm256_blendv_ps(result, term, swap_mask); + _mm256_store_ps(out, result); + in += 8; + out += 8; } - number = eighthPoints * 8; + number = eighth_points * 8; for (; number < num_points; number++) { - *bPtr++ = atan(*aPtr++); + *out++ = arctan(*in++); } } - #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ - -#ifdef LV_HAVE_AVX +#if LV_HAVE_AVX #include - +#include static inline void -volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) +volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; + const __m256 one = _mm256_set1_ps(1.f); + const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f); + const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); + const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, pio2, x, y, z, arctangent; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for (; number < eighthPoints; number++) { - aVal = _mm256_load_ps(aPtr); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps( - z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for (i = 0; i < 2; i++) { - x = _mm256_add_ps(x, - _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); - } - x = _mm256_div_ps(fones, x); - y = fzeroes; - for (j = TERMS - 1; j >= 0; j--) { - y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), - _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); - } - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps( - y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); - arctangent = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arctangent = _mm256_sub_ps( - arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); - - _mm256_store_ps(bPtr, arctangent); - aPtr += 8; - bPtr += 8; + unsigned int eighth_points = num_points / 8; + for (; number < eighth_points; number++) { + __m256 x = _mm256_load_ps(in); + __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); + __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), + _mm256_blendv_ps(one, x, swap_mask)); + __m256 result = _m256_arctan_approximation_avx(x_star); + __m256 term = _mm256_and_ps(x_star, sign_mask); + term = _mm256_or_ps(pi_over_2, term); + term = _mm256_sub_ps(term, result); + result = _mm256_blendv_ps(result, term, swap_mask); + _mm256_store_ps(out, result); + in += 8; + out += 8; } - number = eighthPoints * 8; + number = eighth_points * 8; for (; number < num_points; number++) { - *bPtr++ = atan(*aPtr++); + *out++ = arctan(*in++); } } - #endif /* LV_HAVE_AVX for aligned */ #ifdef LV_HAVE_SSE4_1 #include - +#include static inline void -volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; + const __m128 one = _mm_set1_ps(1.f); + const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f); + const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); + const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - int i, j; - - __m128 aVal, pio2, x, y, z, arctangent; - __m128 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm_set1_ps(3.14159265358979323846 / 2); - fzeroes = _mm_setzero_ps(); - fones = _mm_set1_ps(1.0); - ftwos = _mm_set1_ps(2.0); - ffours = _mm_set1_ps(4.0); - - for (; number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); - z = aVal; - condition = _mm_cmplt_ps(z, fzeroes); - z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); - condition = _mm_cmplt_ps(z, fones); - x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); - - for (i = 0; i < 2; i++) { - x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); - } - x = _mm_div_ps(fones, x); - y = fzeroes; - for (j = TERMS - 1; j >= 0; j--) { - y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), - _mm_set1_ps(pow(-1, j) / (2 * j + 1))); - } - - y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); - condition = _mm_cmpgt_ps(z, fones); - - y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); - arctangent = y; - condition = _mm_cmplt_ps(aVal, fzeroes); - arctangent = - _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); - - _mm_store_ps(bPtr, arctangent); - aPtr += 4; - bPtr += 4; + unsigned int quarter_points = num_points / 4; + for (; number < quarter_points; number++) { + __m128 x = _mm_load_ps(in); + __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one); + __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask), + _mm_blendv_ps(one, x, swap_mask)); + __m128 result = _mm_arctan_approximation_sse(x_star); + __m128 term = _mm_and_ps(x_star, sign_mask); + term = _mm_or_ps(pi_over_2, term); + term = _mm_sub_ps(term, result); + result = _mm_blendv_ps(result, term, swap_mask); + _mm_store_ps(out, result); + in += 4; + out += 4; } - number = quarterPoints * 4; + number = quarter_points * 4; for (; number < num_points; number++) { - *bPtr++ = atanf(*aPtr++); + *out++ = arctan(*in++); } } - #endif /* LV_HAVE_SSE4_1 for aligned */ - #endif /* INCLUDED_volk_32f_atan_32f_a_H */ #ifndef INCLUDED_volk_32f_atan_32f_u_H @@ -264,205 +296,125 @@ volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu #if LV_HAVE_AVX2 && LV_HAVE_FMA #include - -static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector, - const float* aVector, - unsigned int num_points) +static inline void +volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; + const __m256 one = _mm256_set1_ps(1.f); + const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f); + const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); + const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, pio2, x, y, z, arctangent; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for (; number < eighthPoints; number++) { - aVal = _mm256_loadu_ps(aPtr); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps( - z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for (i = 0; i < 2; i++) { - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); - } - x = _mm256_div_ps(fones, x); - y = fzeroes; - for (j = TERMS - 1; j >= 0; j--) { - y = _mm256_fmadd_ps( - y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); - } - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); - arctangent = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arctangent = _mm256_sub_ps( - arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); - - _mm256_storeu_ps(bPtr, arctangent); - aPtr += 8; - bPtr += 8; + unsigned int eighth_points = num_points / 8; + for (; number < eighth_points; number++) { + __m256 x = _mm256_loadu_ps(in); + __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); + __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), + _mm256_blendv_ps(one, x, swap_mask)); + __m256 result = _m256_arctan_approximation_avx2_fma(x_star); + __m256 term = _mm256_and_ps(x_star, sign_mask); + term = _mm256_or_ps(pi_over_2, term); + term = _mm256_sub_ps(term, result); + result = _mm256_blendv_ps(result, term, swap_mask); + _mm256_storeu_ps(out, result); + in += 8; + out += 8; } - number = eighthPoints * 8; + number = eighth_points * 8; for (; number < num_points; number++) { - *bPtr++ = atan(*aPtr++); + *out++ = arctan(*in++); } } - #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ - -#ifdef LV_HAVE_AVX +#if LV_HAVE_AVX #include - static inline void -volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) +volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; + const __m256 one = _mm256_set1_ps(1.f); + const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f); + const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); + const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, pio2, x, y, z, arctangent; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for (; number < eighthPoints; number++) { - aVal = _mm256_loadu_ps(aPtr); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps( - z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for (i = 0; i < 2; i++) { - x = _mm256_add_ps(x, - _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); - } - x = _mm256_div_ps(fones, x); - y = fzeroes; - for (j = TERMS - 1; j >= 0; j--) { - y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), - _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); - } - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps( - y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); - arctangent = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arctangent = _mm256_sub_ps( - arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); - - _mm256_storeu_ps(bPtr, arctangent); - aPtr += 8; - bPtr += 8; + unsigned int eighth_points = num_points / 8; + for (; number < eighth_points; number++) { + __m256 x = _mm256_loadu_ps(in); + __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); + __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), + _mm256_blendv_ps(one, x, swap_mask)); + __m256 result = _m256_arctan_approximation_avx(x_star); + __m256 term = _mm256_and_ps(x_star, sign_mask); + term = _mm256_or_ps(pi_over_2, term); + term = _mm256_sub_ps(term, result); + result = _mm256_blendv_ps(result, term, swap_mask); + _mm256_storeu_ps(out, result); + in += 8; + out += 8; } - number = eighthPoints * 8; + number = eighth_points * 8; for (; number < num_points; number++) { - *bPtr++ = atan(*aPtr++); + *out++ = arctan(*in++); } } - #endif /* LV_HAVE_AVX for unaligned */ #ifdef LV_HAVE_SSE4_1 #include - +#include static inline void -volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; + const __m128 one = _mm_set1_ps(1.f); + const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f); + const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); + const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - int i, j; - - __m128 aVal, pio2, x, y, z, arctangent; - __m128 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm_set1_ps(3.14159265358979323846 / 2); - fzeroes = _mm_setzero_ps(); - fones = _mm_set1_ps(1.0); - ftwos = _mm_set1_ps(2.0); - ffours = _mm_set1_ps(4.0); - - for (; number < quarterPoints; number++) { - aVal = _mm_loadu_ps(aPtr); - z = aVal; - condition = _mm_cmplt_ps(z, fzeroes); - z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); - condition = _mm_cmplt_ps(z, fones); - x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); - - for (i = 0; i < 2; i++) - x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); - x = _mm_div_ps(fones, x); - y = fzeroes; - for (j = TERMS - 1; j >= 0; j--) - y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), - _mm_set1_ps(pow(-1, j) / (2 * j + 1))); - - y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); - condition = _mm_cmpgt_ps(z, fones); - - y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); - arctangent = y; - condition = _mm_cmplt_ps(aVal, fzeroes); - arctangent = - _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); - - _mm_storeu_ps(bPtr, arctangent); - aPtr += 4; - bPtr += 4; + unsigned int quarter_points = num_points / 4; + for (; number < quarter_points; number++) { + __m128 x = _mm_loadu_ps(in); + __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one); + __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask), + _mm_blendv_ps(one, x, swap_mask)); + __m128 result = _mm_arctan_approximation_sse(x_star); + __m128 term = _mm_and_ps(x_star, sign_mask); + term = _mm_or_ps(pi_over_2, term); + term = _mm_sub_ps(term, result); + result = _mm_blendv_ps(result, term, swap_mask); + _mm_storeu_ps(out, result); + in += 4; + out += 4; } - number = quarterPoints * 4; + number = quarter_points * 4; for (; number < num_points; number++) { - *bPtr++ = atanf(*aPtr++); + *out++ = arctan(*in++); } } - #endif /* LV_HAVE_SSE4_1 for unaligned */ #ifdef LV_HAVE_GENERIC - static inline void -volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points) +volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; unsigned int number = 0; + for (; number < num_points; number++) { + *out++ = arctan(*in++); + } +} +#endif /* LV_HAVE_GENERIC */ - for (number = 0; number < num_points; number++) { - *bPtr++ = atanf(*aPtr++); +#ifdef LV_HAVE_GENERIC +static inline void +volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) +{ + unsigned int number = 0; + for (; number < num_points; number++) { + *out++ = atanf(*in++); } } #endif /* LV_HAVE_GENERIC */ diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h index d7e4ad179..530beb1ee 100644 --- a/lib/kernel_tests.h +++ b/lib/kernel_tests.h @@ -1,6 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2014 - 2021 Free Software Foundation, Inc. + * Copyright 2023 Magnus Lundmark * * This file is part of VOLK * @@ -85,7 +86,7 @@ std::vector init_test_list(volk_test_params_t test_params) QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params)) QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power)) From d82647387b9f0525207037cfe7ffd7e55e606aa6 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Wed, 27 Sep 2023 15:38:59 +0200 Subject: [PATCH 6/7] moved arctan poly to volk_common Signed-off-by: Magnus Lundmark --- include/volk/volk_common.h | 47 ++++++++++- kernels/volk/volk_32f_atan_32f.h | 140 ++----------------------------- 2 files changed, 53 insertions(+), 134 deletions(-) diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h index 55a814532..dc217cc97 100644 --- a/include/volk/volk_common.h +++ b/include/volk/volk_common.h @@ -1,6 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc. + * Copyright 2023 Magnus Lundmark * * This file is part of VOLK * @@ -166,6 +167,50 @@ static inline float log2f_non_ieee(float f) // Constant used to do log10 calculations as faster log2 //////////////////////////////////////////////////////////////////////// // precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr -#define volk_log2to10factor 3.01029995663981209120 +#define volk_log2to10factor (0x1.815182p1) // 3.01029995663981209120 + +//////////////////////////////////////////////////////////////////////// +// arctan(x) +//////////////////////////////////////////////////////////////////////// +static inline float volk_arctan_poly(const float x) +{ + /* + * arctan(x) polynomial expansion on the interval [-1, 1] + * Maximum relative error < 6.6e-7 + */ + const float a1 = +0x1.ffffeap-1f; + const float a3 = -0x1.55437p-2f; + const float a5 = +0x1.972be6p-3f; + const float a7 = -0x1.1436ap-3f; + const float a9 = +0x1.5785aap-4f; + const float a11 = -0x1.2f3004p-5f; + const float a13 = +0x1.01a37cp-7f; + + const float x_times_x = x * x; + float arctan = a13; + arctan = fmaf(x_times_x, arctan, a11); + arctan = fmaf(x_times_x, arctan, a9); + arctan = fmaf(x_times_x, arctan, a7); + arctan = fmaf(x_times_x, arctan, a5); + arctan = fmaf(x_times_x, arctan, a3); + arctan = fmaf(x_times_x, arctan, a1); + arctan *= x; + + return arctan; +} + +static inline float volk_arctan(const float x) +{ + /* + * arctan(x) + arctan(1 / x) == sign(x) * pi / 2 + */ + const float pi_over_2 = 0x1.921fb6p0f; + + if (fabs(x) < 1.f) { + return volk_arctan_poly(x); + } else { + return copysignf(pi_over_2, x) - volk_arctan_poly(1.f / x); + } +} #endif /*INCLUDED_LIBVOLK_COMMON_H*/ diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index f20ff1a4f..b1d68ed97 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -56,135 +56,9 @@ */ #include -#define POLY_ORDER (13) // Use either 11, 12, 13 or 15 -/* - * arctan(x) polynomial expansion on the interval [-1, 1] - */ -#if (POLY_ORDER == 11) -static inline float arctan_approximation(const float x) -{ - /* - * Max relative error < 4.4e-6 - */ - const float a1 = +0x1.ffff6ep-1f; - const float a3 = -0x1.54fca2p-2f; - const float a5 = +0x1.90aaa2p-3f; - const float a7 = -0x1.f09d2ep-4f; - const float a9 = +0x1.d6e42cp-5f; - const float a11 = -0x1.b9c81ep-7f; - - const float x_times_x = x * x; - float arctan = a11; - arctan = fmaf(x_times_x, arctan, a9); - arctan = fmaf(x_times_x, arctan, a7); - arctan = fmaf(x_times_x, arctan, a5); - arctan = fmaf(x_times_x, arctan, a3); - arctan = fmaf(x_times_x, arctan, a1); - arctan *= x; - - return arctan; -} -#elif (POLY_ORDER == 12) // Order 13 with a1 set to 1 -static inline float arctan_approximation(const float x) -{ - /* - * Max relative error < 7.5e-7 - */ - // a1 == 1 implicitly - const float a3 = -0x1.5548a4p-2f; - const float a5 = +0x1.978224p-3f; - const float a7 = -0x1.156488p-3f; - const float a9 = +0x1.5b822cp-4f; - const float a11 = -0x1.35a172p-5f; - const float a13 = +0x1.09a14ep-7f; - - const float x_times_x = x * x; - float arctan = a13; - arctan = fmaf(x_times_x, arctan, a11); - arctan = fmaf(x_times_x, arctan, a9); - arctan = fmaf(x_times_x, arctan, a7); - arctan = fmaf(x_times_x, arctan, a5); - arctan = fmaf(x_times_x, arctan, a3); - arctan *= x_times_x; - arctan = fmaf(x, arctan, x); - - return arctan; -} -#elif (POLY_ORDER == 13) -static inline float arctan_approximation(const float x) -{ - /* - * Max relative error < 6.6e-7 - */ - const float a1 = +0x1.ffffeap-1f; - const float a3 = -0x1.55437p-2f; - const float a5 = +0x1.972be6p-3f; - const float a7 = -0x1.1436ap-3f; - const float a9 = +0x1.5785aap-4f; - const float a11 = -0x1.2f3004p-5f; - const float a13 = +0x1.01a37cp-7f; - - const float x_times_x = x * x; - float arctan = a13; - arctan = fmaf(x_times_x, arctan, a11); - arctan = fmaf(x_times_x, arctan, a9); - arctan = fmaf(x_times_x, arctan, a7); - arctan = fmaf(x_times_x, arctan, a5); - arctan = fmaf(x_times_x, arctan, a3); - arctan = fmaf(x_times_x, arctan, a1); - arctan *= x; - - return arctan; -} -#elif (POLY_ORDER == 15) -static inline float arctan_approximation(const float x) -{ - /* - * Max relative error < 1.0e-7 - */ - const float a1 = +0x1.fffffcp-1f; - const float a3 = -0x1.55519ep-2f; - const float a5 = +0x1.98f6a8p-3f; - const float a7 = -0x1.1f0a92p-3f; - const float a9 = +0x1.95b654p-4f; - const float a11 = -0x1.e65492p-5f; - const float a13 = +0x1.8c0c36p-6f; - const float a15 = -0x1.32316ep-8f; - - const float x_times_x = x * x; - float arctan = a15; - arctan = fmaf(x_times_x, arctan, a13); - arctan = fmaf(x_times_x, arctan, a11); - arctan = fmaf(x_times_x, arctan, a9); - arctan = fmaf(x_times_x, arctan, a7); - arctan = fmaf(x_times_x, arctan, a5); - arctan = fmaf(x_times_x, arctan, a3); - arctan = fmaf(x_times_x, arctan, a1); - arctan *= x; - - return arctan; -} -#else -#error Undefined polynomial order. -#endif - #ifndef INCLUDED_volk_32f_atan_32f_a_H #define INCLUDED_volk_32f_atan_32f_a_H -static inline float arctan(const float x) -{ - /* - * arctan(x) + arctan(1 / x) == sign(x) * pi / 2 - */ - const float pi_over_2 = 0x1.921fb6p0f; - - if (fabs(x) < 1.f) { - return arctan_approximation(x); - } else { - return copysignf(pi_over_2, x) - arctan_approximation(1.f / x); - } -} - #if LV_HAVE_AVX2 && LV_HAVE_FMA #include #include @@ -215,7 +89,7 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point number = eighth_points * 8; for (; number < num_points; number++) { - *out++ = arctan(*in++); + *out++ = volk_arctan(*in++); } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ @@ -250,7 +124,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) number = eighth_points * 8; for (; number < num_points; number++) { - *out++ = arctan(*in++); + *out++ = volk_arctan(*in++); } } #endif /* LV_HAVE_AVX for aligned */ @@ -285,7 +159,7 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) number = quarter_points * 4; for (; number < num_points; number++) { - *out++ = arctan(*in++); + *out++ = volk_arctan(*in++); } } #endif /* LV_HAVE_SSE4_1 for aligned */ @@ -323,7 +197,7 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point number = eighth_points * 8; for (; number < num_points; number++) { - *out++ = arctan(*in++); + *out++ = volk_arctan(*in++); } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ @@ -357,7 +231,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points) number = eighth_points * 8; for (; number < num_points; number++) { - *out++ = arctan(*in++); + *out++ = volk_arctan(*in++); } } #endif /* LV_HAVE_AVX for unaligned */ @@ -392,7 +266,7 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) number = quarter_points * 4; for (; number < num_points; number++) { - *out++ = arctan(*in++); + *out++ = volk_arctan(*in++); } } #endif /* LV_HAVE_SSE4_1 for unaligned */ @@ -403,7 +277,7 @@ volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_point { unsigned int number = 0; for (; number < num_points; number++) { - *out++ = arctan(*in++); + *out++ = volk_arctan(*in++); } } #endif /* LV_HAVE_GENERIC */ From 0343e3c7b5dbc3aeef1633948db602b07fbde2cc Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Thu, 28 Sep 2023 08:37:58 +0200 Subject: [PATCH 7/7] renamed for consistency Signed-off-by: Magnus Lundmark --- include/volk/volk_avx2_fma_intrinsics.h | 2 +- include/volk/volk_avx_intrinsics.h | 2 +- include/volk/volk_sse_intrinsics.h | 2 +- kernels/volk/volk_32f_atan_32f.h | 12 ++++++------ 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h index 1027200f6..03b24e6c0 100644 --- a/include/volk/volk_avx2_fma_intrinsics.h +++ b/include/volk/volk_avx2_fma_intrinsics.h @@ -23,7 +23,7 @@ * Maximum relative error ~6.5e-7 * Polynomial evaluated via Horner's method */ -static inline __m256 _m256_arctan_approximation_avx2_fma(const __m256 x) +static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x) { const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h index 1277a053c..2fc0f064e 100644 --- a/include/volk/volk_avx_intrinsics.h +++ b/include/volk/volk_avx_intrinsics.h @@ -24,7 +24,7 @@ * Maximum relative error ~6.5e-7 * Polynomial evaluated via Horner's method */ -static inline __m256 _m256_arctan_approximation_avx(const __m256 x) +static inline __m256 _m256_arctan_poly_avx(const __m256 x) { const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h index 2cd0dd017..0ede17845 100644 --- a/include/volk/volk_sse_intrinsics.h +++ b/include/volk/volk_sse_intrinsics.h @@ -24,7 +24,7 @@ * Maximum relative error ~6.5e-7 * Polynomial evaluated via Horner's method */ -static inline __m128 _mm_arctan_approximation_sse(const __m128 x) +static inline __m128 _mm_arctan_poly_sse(const __m128 x) { const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f); const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f); diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index b1d68ed97..dc5987cb8 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -77,7 +77,7 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_approximation_avx2_fma(x_star); + __m256 result = _m256_arctan_poly_avx2_fma(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -112,7 +112,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_approximation_avx(x_star); + __m256 result = _m256_arctan_poly_avx(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -147,7 +147,7 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one); __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask), _mm_blendv_ps(one, x, swap_mask)); - __m128 result = _mm_arctan_approximation_sse(x_star); + __m128 result = _mm_arctan_poly_sse(x_star); __m128 term = _mm_and_ps(x_star, sign_mask); term = _mm_or_ps(pi_over_2, term); term = _mm_sub_ps(term, result); @@ -185,7 +185,7 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_approximation_avx2_fma(x_star); + __m256 result = _m256_arctan_poly_avx2_fma(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -219,7 +219,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points) __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_approximation_avx(x_star); + __m256 result = _m256_arctan_poly_avx(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -254,7 +254,7 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one); __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask), _mm_blendv_ps(one, x, swap_mask)); - __m128 result = _mm_arctan_approximation_sse(x_star); + __m128 result = _mm_arctan_poly_sse(x_star); __m128 term = _mm_and_ps(x_star, sign_mask); term = _mm_or_ps(pi_over_2, term); term = _mm_sub_ps(term, result);