From c3bad58f0121e06ba9772758efd16433bb9e6252 Mon Sep 17 00:00:00 2001
From: Ashley Brighthope <ashley.b@reddegrees.com>
Date: Sat, 16 Sep 2023 18:28:15 +1000
Subject: [PATCH 1/7] cpu_features: Update submodule pointer and new CMake
 target name

- Update cpu_features to v0.9.0
- Added BUILD_EXECUTABLE for cpu_features to build list_cpu_features for
  github workflow tests
- Added ENABLE_INSTALL to install cpu_feature for old behaviour
- Renamed CpuFeature -> CpuFeatures to match new name

Signed-off-by: Ashley Brighthope <ashley.b@reddegrees.com>
---
 .github/workflows/run-tests.yml | 8 ++++----
 CMakeLists.txt                  | 1 +
 cpu_features                    | 2 +-
 lib/CMakeLists.txt              | 4 ++--
 4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index 1b7024d1f..ce318d2a4 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -44,7 +44,7 @@ jobs:
       env:
         CC: ${{ matrix.compiler.cc }}
         CXX: ${{ matrix.compiler.cxx }}
-      run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" ..
+      run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON ..
     - name: Build
       run: |
         echo "Build with $(nproc) thread(s)"
@@ -150,7 +150,7 @@ jobs:
           run: |
             cd /volk
             cd build
-            cmake -DCMAKE_CXX_FLAGS="-Werror" ..
+            cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON ..
             echo "Build with $(nproc) thread(s)"
             make -j$(nproc)
             ./cpu_features/list_cpu_features
@@ -173,7 +173,7 @@ jobs:
     - name: dependencies
       run: sudo apt install python3-mako liborc-dev
     - name: configure
-      run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True ..
+      run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True -DBUILD_EXECUTABLE=ON ..
     - name: build
       run: cmake --build build -j$(nproc)
     - name: Print info
@@ -248,7 +248,7 @@ jobs:
     - name: dependencies
       run: pip3 install mako
     - name: configure
-      run: mkdir build && cd build && cmake ..
+      run: mkdir build && cd build && cmake -DBUILD_EXECUTABLE=ON ..
     - name: build
       run: cmake --build build --config Debug -j3
     - name: Print info
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 92d5097fa..bbf1e2f56 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -144,6 +144,7 @@ if (VOLK_CPU_FEATURES)
       FORCE)
     set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
     set(BUILD_SHARED_LIBS OFF)
+    set(ENABLE_INSTALL ON)
     add_subdirectory(cpu_features)
     set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
   endif()
diff --git a/cpu_features b/cpu_features
index 41e206e43..ba4bffa86 160000
--- a/cpu_features
+++ b/cpu_features
@@ -1 +1 @@
-Subproject commit 41e206e435b3c84a6fdd937dfe2a07e8ee73e611
+Subproject commit ba4bffa86cbb5456bdb34426ad22b9551278e2c0
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index f2d6b7e7c..b5b165fe6 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -548,7 +548,7 @@ endif()
 add_library(volk SHARED $<TARGET_OBJECTS:volk_obj>)
 target_link_libraries(volk PUBLIC ${volk_libraries})
 if(VOLK_CPU_FEATURES)
-  target_link_libraries(volk PRIVATE CpuFeature::cpu_features)
+  target_link_libraries(volk PRIVATE CpuFeatures::cpu_features)
 endif()
 target_include_directories(volk
     PUBLIC $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
@@ -591,7 +591,7 @@ if(ENABLE_STATIC_LIBS)
   add_library(volk_static STATIC $<TARGET_OBJECTS:volk_obj>)
   target_link_libraries(volk_static PUBLIC ${volk_libraries})
   if(VOLK_CPU_FEATURES)
-    target_link_libraries(volk_static PRIVATE CpuFeature::cpu_features)
+    target_link_libraries(volk_static PRIVATE CpuFeatures::cpu_features)
   endif()
   if(ORC_FOUND)
     target_link_libraries(volk_static PUBLIC ${ORC_LIBRARIES_STATIC})

From 6383ddb722da75d7e50516c6bc2562fefb35aff8 Mon Sep 17 00:00:00 2001
From: Ashley Brighthope <ashley.b@reddegrees.com>
Date: Sat, 16 Sep 2023 18:56:22 +1000
Subject: [PATCH 2/7] cmake: Removed duplicated logic

The else path can not be reached since the root CMake file aborts
if CpuFeatures_FOUND if false

Signed-off-by: Ashley Brighthope <ashley.b@reddegrees.com>
---
 lib/CMakeLists.txt | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index b5b165fe6..ca39f99e2 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -521,15 +521,9 @@ target_include_directories(volk_obj
 )
 if(VOLK_CPU_FEATURES)
   set_source_files_properties(volk_cpu.c PROPERTIES COMPILE_DEFINITIONS "VOLK_CPU_FEATURES=1")
-  if(CpuFeatures_FOUND)
-    target_include_directories(volk_obj
-        PRIVATE $<TARGET_PROPERTY:CpuFeatures::cpu_features,INTERFACE_INCLUDE_DIRECTORIES>
-    )
-  else()
-    target_include_directories(volk_obj
-        PRIVATE $<TARGET_PROPERTY:cpu_features,INTERFACE_INCLUDE_DIRECTORIES>
-    )
-  endif()
+  target_include_directories(volk_obj
+      PRIVATE $<TARGET_PROPERTY:CpuFeatures::cpu_features,INTERFACE_INCLUDE_DIRECTORIES>
+  )
 endif()
 
 #Configure object target properties

From 49c158144588a3b965558aff09926a4b5b5b3982 Mon Sep 17 00:00:00 2001
From: Ashley Brighthope <ashley.b@reddegrees.com>
Date: Wed, 20 Sep 2023 19:36:36 +1000
Subject: [PATCH 3/7] cmake: Do not install cpu_features with volk

Signed-off-by: Ashley Brighthope <ashley.b@reddegrees.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bbf1e2f56..be1dd6ebc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -144,7 +144,7 @@ if (VOLK_CPU_FEATURES)
       FORCE)
     set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
     set(BUILD_SHARED_LIBS OFF)
-    set(ENABLE_INSTALL ON)
+    set(ENABLE_INSTALL OFF)
     add_subdirectory(cpu_features)
     set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
   endif()

From 492568169a38b246576c0b9e71f00e5a46481eff Mon Sep 17 00:00:00 2001
From: Andrej Rode <mail@andrejro.de>
Date: Sat, 23 Sep 2023 14:15:10 +0200
Subject: [PATCH 4/7] cmake: Link to cpu_features only in BUILD_INTERFACE

This avoids issues in the static build since otherwise cmake will add cpu_features to the export set.

Signed-off-by: Andrej Rode <mail@andrejro.de>
---
 lib/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index ca39f99e2..a27be22b4 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -542,7 +542,7 @@ endif()
 add_library(volk SHARED $<TARGET_OBJECTS:volk_obj>)
 target_link_libraries(volk PUBLIC ${volk_libraries})
 if(VOLK_CPU_FEATURES)
-  target_link_libraries(volk PRIVATE CpuFeatures::cpu_features)
+  target_link_libraries(volk PRIVATE $<BUILD_INTERFACE:CpuFeatures::cpu_features>)
 endif()
 target_include_directories(volk
     PUBLIC $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
@@ -585,7 +585,7 @@ if(ENABLE_STATIC_LIBS)
   add_library(volk_static STATIC $<TARGET_OBJECTS:volk_obj>)
   target_link_libraries(volk_static PUBLIC ${volk_libraries})
   if(VOLK_CPU_FEATURES)
-    target_link_libraries(volk_static PRIVATE CpuFeatures::cpu_features)
+    target_link_libraries(volk_static PRIVATE $<BUILD_INTERFACE:CpuFeatures::cpu_features>)
   endif()
   if(ORC_FOUND)
     target_link_libraries(volk_static PUBLIC ${ORC_LIBRARIES_STATIC})

From ba9a6f22b217e61de35c6c586753dee70513fff2 Mon Sep 17 00:00:00 2001
From: Magnus Lundmark <magnus@skysense.io>
Date: Thu, 21 Sep 2023 17:46:40 +0200
Subject: [PATCH 5/7] new kernels

Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
---
 CMakeLists.txt                          |   2 +
 include/volk/volk_avx2_fma_intrinsics.h |  50 ++
 include/volk/volk_avx_intrinsics.h      |  38 ++
 include/volk/volk_sse_intrinsics.h      |  38 ++
 kernels/volk/volk_32f_atan_32f.h        | 594 +++++++++++-------------
 lib/kernel_tests.h                      |   3 +-
 6 files changed, 403 insertions(+), 322 deletions(-)
 create mode 100644 include/volk/volk_avx2_fma_intrinsics.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index be1dd6ebc..d9141b154 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,6 @@
 #
 # Copyright 2011-2020 Free Software Foundation, Inc.
+# Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 #
 # This file is part of VOLK
 #
@@ -249,6 +250,7 @@ install(FILES
     ${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
+    ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
new file mode 100644
index 000000000..1027200f6
--- /dev/null
+++ b/include/volk/volk_avx2_fma_intrinsics.h
@@ -0,0 +1,50 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold AVX2 FMA intrinsics of intrinsics.
+ * They should be used in VOLK kernels to avoid copy-paste.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
+#include <immintrin.h>
+
+/*
+ * Approximate arctan(x) via polynomial expansion
+ * on the interval [-1, 1]
+ *
+ * Maximum relative error ~6.5e-7
+ * Polynomial evaluated via Horner's method
+ */
+static inline __m256 _m256_arctan_approximation_avx2_fma(const __m256 x)
+{
+    const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
+    const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
+    const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
+    const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
+    const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
+    const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
+    const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);
+
+    const __m256 x_times_x = _mm256_mul_ps(x, x);
+    __m256 arctan;
+    arctan = a13;
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a11);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
+    arctan = _mm256_mul_ps(x, arctan);
+
+    return arctan;
+}
+
+#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
index 0de88d84d..1277a053c 100644
--- a/include/volk/volk_avx_intrinsics.h
+++ b/include/volk/volk_avx_intrinsics.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2015 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *
@@ -16,6 +17,43 @@
 #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
 #include <immintrin.h>
 
+/*
+ * Approximate arctan(x) via polynomial expansion
+ * on the interval [-1, 1]
+ *
+ * Maximum relative error ~6.5e-7
+ * Polynomial evaluated via Horner's method
+ */
+static inline __m256 _m256_arctan_approximation_avx(const __m256 x)
+{
+    const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
+    const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
+    const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
+    const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
+    const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
+    const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
+    const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);
+
+    const __m256 x_times_x = _mm256_mul_ps(x, x);
+    __m256 arctan;
+    arctan = a13;
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a11);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a9);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a7);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a5);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a3);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a1);
+    arctan = _mm256_mul_ps(x, arctan);
+
+    return arctan;
+}
+
 static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
 {
     __m256 yl, yh, tmp1, tmp2;
diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h
index ce900651c..2cd0dd017 100644
--- a/include/volk/volk_sse_intrinsics.h
+++ b/include/volk/volk_sse_intrinsics.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2015 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *
@@ -16,6 +17,43 @@
 #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
 #include <xmmintrin.h>
 
+/*
+ * Approximate arctan(x) via polynomial expansion
+ * on the interval [-1, 1]
+ *
+ * Maximum relative error ~6.5e-7
+ * Polynomial evaluated via Horner's method
+ */
+static inline __m128 _mm_arctan_approximation_sse(const __m128 x)
+{
+    const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f);
+    const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f);
+    const __m128 a5 = _mm_set1_ps(+0x1.972be6p-3f);
+    const __m128 a7 = _mm_set1_ps(-0x1.1436ap-3f);
+    const __m128 a9 = _mm_set1_ps(+0x1.5785aap-4f);
+    const __m128 a11 = _mm_set1_ps(-0x1.2f3004p-5f);
+    const __m128 a13 = _mm_set1_ps(+0x1.01a37cp-7f);
+
+    const __m128 x_times_x = _mm_mul_ps(x, x);
+    __m128 arctan;
+    arctan = a13;
+    arctan = _mm_mul_ps(x_times_x, arctan);
+    arctan = _mm_add_ps(arctan, a11);
+    arctan = _mm_mul_ps(x_times_x, arctan);
+    arctan = _mm_add_ps(arctan, a9);
+    arctan = _mm_mul_ps(x_times_x, arctan);
+    arctan = _mm_add_ps(arctan, a7);
+    arctan = _mm_mul_ps(x_times_x, arctan);
+    arctan = _mm_add_ps(arctan, a5);
+    arctan = _mm_mul_ps(x_times_x, arctan);
+    arctan = _mm_add_ps(arctan, a3);
+    arctan = _mm_mul_ps(x_times_x, arctan);
+    arctan = _mm_add_ps(arctan, a1);
+    arctan = _mm_mul_ps(x, arctan);
+
+    return arctan;
+}
+
 static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
 {
     __m128 iValue, qValue;
diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
index 96078f4a1..f20ff1a4f 100644
--- a/kernels/volk/volk_32f_atan_32f.h
+++ b/kernels/volk/volk_32f_atan_32f.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2014 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *
@@ -53,210 +54,241 @@
  *   volk_free(out);
  * \endcode
  */
-
-#include <inttypes.h>
 #include <math.h>
-#include <stdio.h>
 
-/* This is the number of terms of Taylor series to evaluate, increase this for more
- * accuracy*/
-#define TERMS 2
+#define POLY_ORDER (13) // Use either 11, 12, 13 or 15
+/*
+ * arctan(x) polynomial expansion on the interval [-1, 1]
+ */
+#if (POLY_ORDER == 11)
+static inline float arctan_approximation(const float x)
+{
+    /*
+     * Max relative error < 4.4e-6
+     */
+    const float a1 = +0x1.ffff6ep-1f;
+    const float a3 = -0x1.54fca2p-2f;
+    const float a5 = +0x1.90aaa2p-3f;
+    const float a7 = -0x1.f09d2ep-4f;
+    const float a9 = +0x1.d6e42cp-5f;
+    const float a11 = -0x1.b9c81ep-7f;
+
+    const float x_times_x = x * x;
+    float arctan = a11;
+    arctan = fmaf(x_times_x, arctan, a9);
+    arctan = fmaf(x_times_x, arctan, a7);
+    arctan = fmaf(x_times_x, arctan, a5);
+    arctan = fmaf(x_times_x, arctan, a3);
+    arctan = fmaf(x_times_x, arctan, a1);
+    arctan *= x;
+
+    return arctan;
+}
+#elif (POLY_ORDER == 12) // Order 13 with a1 set to 1
+static inline float arctan_approximation(const float x)
+{
+    /*
+     * Max relative error < 7.5e-7
+     */
+    //          a1 == 1 implicitly
+    const float a3 = -0x1.5548a4p-2f;
+    const float a5 = +0x1.978224p-3f;
+    const float a7 = -0x1.156488p-3f;
+    const float a9 = +0x1.5b822cp-4f;
+    const float a11 = -0x1.35a172p-5f;
+    const float a13 = +0x1.09a14ep-7f;
+
+    const float x_times_x = x * x;
+    float arctan = a13;
+    arctan = fmaf(x_times_x, arctan, a11);
+    arctan = fmaf(x_times_x, arctan, a9);
+    arctan = fmaf(x_times_x, arctan, a7);
+    arctan = fmaf(x_times_x, arctan, a5);
+    arctan = fmaf(x_times_x, arctan, a3);
+    arctan *= x_times_x;
+    arctan = fmaf(x, arctan, x);
+
+    return arctan;
+}
+#elif (POLY_ORDER == 13)
+static inline float arctan_approximation(const float x)
+{
+    /*
+     * Max relative error < 6.6e-7
+     */
+    const float a1 = +0x1.ffffeap-1f;
+    const float a3 = -0x1.55437p-2f;
+    const float a5 = +0x1.972be6p-3f;
+    const float a7 = -0x1.1436ap-3f;
+    const float a9 = +0x1.5785aap-4f;
+    const float a11 = -0x1.2f3004p-5f;
+    const float a13 = +0x1.01a37cp-7f;
+
+    const float x_times_x = x * x;
+    float arctan = a13;
+    arctan = fmaf(x_times_x, arctan, a11);
+    arctan = fmaf(x_times_x, arctan, a9);
+    arctan = fmaf(x_times_x, arctan, a7);
+    arctan = fmaf(x_times_x, arctan, a5);
+    arctan = fmaf(x_times_x, arctan, a3);
+    arctan = fmaf(x_times_x, arctan, a1);
+    arctan *= x;
+
+    return arctan;
+}
+#elif (POLY_ORDER == 15)
+static inline float arctan_approximation(const float x)
+{
+    /*
+     * Max relative error < 1.0e-7
+     */
+    const float a1 = +0x1.fffffcp-1f;
+    const float a3 = -0x1.55519ep-2f;
+    const float a5 = +0x1.98f6a8p-3f;
+    const float a7 = -0x1.1f0a92p-3f;
+    const float a9 = +0x1.95b654p-4f;
+    const float a11 = -0x1.e65492p-5f;
+    const float a13 = +0x1.8c0c36p-6f;
+    const float a15 = -0x1.32316ep-8f;
+
+    const float x_times_x = x * x;
+    float arctan = a15;
+    arctan = fmaf(x_times_x, arctan, a13);
+    arctan = fmaf(x_times_x, arctan, a11);
+    arctan = fmaf(x_times_x, arctan, a9);
+    arctan = fmaf(x_times_x, arctan, a7);
+    arctan = fmaf(x_times_x, arctan, a5);
+    arctan = fmaf(x_times_x, arctan, a3);
+    arctan = fmaf(x_times_x, arctan, a1);
+    arctan *= x;
+
+    return arctan;
+}
+#else
+#error Undefined polynomial order.
+#endif
 
 #ifndef INCLUDED_volk_32f_atan_32f_a_H
 #define INCLUDED_volk_32f_atan_32f_a_H
 
+static inline float arctan(const float x)
+{
+    /*
+     *  arctan(x) + arctan(1 / x) == sign(x) * pi / 2
+     */
+    const float pi_over_2 = 0x1.921fb6p0f;
+
+    if (fabs(x) < 1.f) {
+        return arctan_approximation(x);
+    } else {
+        return copysignf(pi_over_2, x) - arctan_approximation(1.f / x);
+    }
+}
+
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
-
-static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector,
-                                                const float* aVector,
-                                                unsigned int num_points)
+#include <volk/volk_avx2_fma_intrinsics.h>
+static inline void
+volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
+    const __m256 one = _mm256_set1_ps(1.f);
+    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
+    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
     unsigned int number = 0;
-    unsigned int eighthPoints = num_points / 8;
-    int i, j;
-
-    __m256 aVal, pio2, x, y, z, arctangent;
-    __m256 fzeroes, fones, ftwos, ffours, condition;
-
-    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
-    fzeroes = _mm256_setzero_ps();
-    fones = _mm256_set1_ps(1.0);
-    ftwos = _mm256_set1_ps(2.0);
-    ffours = _mm256_set1_ps(4.0);
-
-    for (; number < eighthPoints; number++) {
-        aVal = _mm256_load_ps(aPtr);
-        z = aVal;
-        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-        x = _mm256_add_ps(
-            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-        for (i = 0; i < 2; i++) {
-            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
-        }
-        x = _mm256_div_ps(fones, x);
-        y = fzeroes;
-        for (j = TERMS - 1; j >= 0; j--) {
-            y = _mm256_fmadd_ps(
-                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
-        }
-
-        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
-        arctangent = y;
-        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-        arctangent = _mm256_sub_ps(
-            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
-        _mm256_store_ps(bPtr, arctangent);
-        aPtr += 8;
-        bPtr += 8;
+    unsigned int eighth_points = num_points / 8;
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_load_ps(in);
+        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
+                                      _mm256_blendv_ps(one, x, swap_mask));
+        __m256 result = _m256_arctan_approximation_avx2_fma(x_star);
+        __m256 term = _mm256_and_ps(x_star, sign_mask);
+        term = _mm256_or_ps(pi_over_2, term);
+        term = _mm256_sub_ps(term, result);
+        result = _mm256_blendv_ps(result, term, swap_mask);
+        _mm256_store_ps(out, result);
+        in += 8;
+        out += 8;
     }
 
-    number = eighthPoints * 8;
+    number = eighth_points * 8;
     for (; number < num_points; number++) {
-        *bPtr++ = atan(*aPtr++);
+        *out++ = arctan(*in++);
     }
 }
-
 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
 
-
-#ifdef LV_HAVE_AVX
+#if LV_HAVE_AVX
 #include <immintrin.h>
-
+#include <volk/volk_avx_intrinsics.h>
 static inline void
-volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
+    const __m256 one = _mm256_set1_ps(1.f);
+    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
+    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
     unsigned int number = 0;
-    unsigned int eighthPoints = num_points / 8;
-    int i, j;
-
-    __m256 aVal, pio2, x, y, z, arctangent;
-    __m256 fzeroes, fones, ftwos, ffours, condition;
-
-    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
-    fzeroes = _mm256_setzero_ps();
-    fones = _mm256_set1_ps(1.0);
-    ftwos = _mm256_set1_ps(2.0);
-    ffours = _mm256_set1_ps(4.0);
-
-    for (; number < eighthPoints; number++) {
-        aVal = _mm256_load_ps(aPtr);
-        z = aVal;
-        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-        x = _mm256_add_ps(
-            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-        for (i = 0; i < 2; i++) {
-            x = _mm256_add_ps(x,
-                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
-        }
-        x = _mm256_div_ps(fones, x);
-        y = fzeroes;
-        for (j = TERMS - 1; j >= 0; j--) {
-            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
-                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
-        }
-
-        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-        y = _mm256_add_ps(
-            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
-        arctangent = y;
-        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-        arctangent = _mm256_sub_ps(
-            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
-        _mm256_store_ps(bPtr, arctangent);
-        aPtr += 8;
-        bPtr += 8;
+    unsigned int eighth_points = num_points / 8;
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_load_ps(in);
+        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
+                                      _mm256_blendv_ps(one, x, swap_mask));
+        __m256 result = _m256_arctan_approximation_avx(x_star);
+        __m256 term = _mm256_and_ps(x_star, sign_mask);
+        term = _mm256_or_ps(pi_over_2, term);
+        term = _mm256_sub_ps(term, result);
+        result = _mm256_blendv_ps(result, term, swap_mask);
+        _mm256_store_ps(out, result);
+        in += 8;
+        out += 8;
     }
 
-    number = eighthPoints * 8;
+    number = eighth_points * 8;
     for (; number < num_points; number++) {
-        *bPtr++ = atan(*aPtr++);
+        *out++ = arctan(*in++);
     }
 }
-
 #endif /* LV_HAVE_AVX for aligned */
 
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
-
+#include <volk/volk_sse_intrinsics.h>
 static inline void
-volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
+    const __m128 one = _mm_set1_ps(1.f);
+    const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
+    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
+    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
     unsigned int number = 0;
-    unsigned int quarterPoints = num_points / 4;
-    int i, j;
-
-    __m128 aVal, pio2, x, y, z, arctangent;
-    __m128 fzeroes, fones, ftwos, ffours, condition;
-
-    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
-    fzeroes = _mm_setzero_ps();
-    fones = _mm_set1_ps(1.0);
-    ftwos = _mm_set1_ps(2.0);
-    ffours = _mm_set1_ps(4.0);
-
-    for (; number < quarterPoints; number++) {
-        aVal = _mm_load_ps(aPtr);
-        z = aVal;
-        condition = _mm_cmplt_ps(z, fzeroes);
-        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-        condition = _mm_cmplt_ps(z, fones);
-        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
-        for (i = 0; i < 2; i++) {
-            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-        }
-        x = _mm_div_ps(fones, x);
-        y = fzeroes;
-        for (j = TERMS - 1; j >= 0; j--) {
-            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
-                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
-        }
-
-        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-        condition = _mm_cmpgt_ps(z, fones);
-
-        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
-        arctangent = y;
-        condition = _mm_cmplt_ps(aVal, fzeroes);
-        arctangent =
-            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
-
-        _mm_store_ps(bPtr, arctangent);
-        aPtr += 4;
-        bPtr += 4;
+    unsigned int quarter_points = num_points / 4;
+    for (; number < quarter_points; number++) {
+        __m128 x = _mm_load_ps(in);
+        __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
+        __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
+                                   _mm_blendv_ps(one, x, swap_mask));
+        __m128 result = _mm_arctan_approximation_sse(x_star);
+        __m128 term = _mm_and_ps(x_star, sign_mask);
+        term = _mm_or_ps(pi_over_2, term);
+        term = _mm_sub_ps(term, result);
+        result = _mm_blendv_ps(result, term, swap_mask);
+        _mm_store_ps(out, result);
+        in += 4;
+        out += 4;
     }
 
-    number = quarterPoints * 4;
+    number = quarter_points * 4;
     for (; number < num_points; number++) {
-        *bPtr++ = atanf(*aPtr++);
+        *out++ = arctan(*in++);
     }
 }
-
 #endif /* LV_HAVE_SSE4_1 for aligned */
-
 #endif /* INCLUDED_volk_32f_atan_32f_a_H */
 
 #ifndef INCLUDED_volk_32f_atan_32f_u_H
@@ -264,205 +296,125 @@ volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu
 
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
-
-static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector,
-                                                const float* aVector,
-                                                unsigned int num_points)
+static inline void
+volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
+    const __m256 one = _mm256_set1_ps(1.f);
+    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
+    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
     unsigned int number = 0;
-    unsigned int eighthPoints = num_points / 8;
-    int i, j;
-
-    __m256 aVal, pio2, x, y, z, arctangent;
-    __m256 fzeroes, fones, ftwos, ffours, condition;
-
-    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
-    fzeroes = _mm256_setzero_ps();
-    fones = _mm256_set1_ps(1.0);
-    ftwos = _mm256_set1_ps(2.0);
-    ffours = _mm256_set1_ps(4.0);
-
-    for (; number < eighthPoints; number++) {
-        aVal = _mm256_loadu_ps(aPtr);
-        z = aVal;
-        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-        x = _mm256_add_ps(
-            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-        for (i = 0; i < 2; i++) {
-            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
-        }
-        x = _mm256_div_ps(fones, x);
-        y = fzeroes;
-        for (j = TERMS - 1; j >= 0; j--) {
-            y = _mm256_fmadd_ps(
-                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
-        }
-
-        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
-        arctangent = y;
-        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-        arctangent = _mm256_sub_ps(
-            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
-        _mm256_storeu_ps(bPtr, arctangent);
-        aPtr += 8;
-        bPtr += 8;
+    unsigned int eighth_points = num_points / 8;
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_loadu_ps(in);
+        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
+                                      _mm256_blendv_ps(one, x, swap_mask));
+        __m256 result = _m256_arctan_approximation_avx2_fma(x_star);
+        __m256 term = _mm256_and_ps(x_star, sign_mask);
+        term = _mm256_or_ps(pi_over_2, term);
+        term = _mm256_sub_ps(term, result);
+        result = _mm256_blendv_ps(result, term, swap_mask);
+        _mm256_storeu_ps(out, result);
+        in += 8;
+        out += 8;
     }
 
-    number = eighthPoints * 8;
+    number = eighth_points * 8;
     for (; number < num_points; number++) {
-        *bPtr++ = atan(*aPtr++);
+        *out++ = arctan(*in++);
     }
 }
-
 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
 
-
-#ifdef LV_HAVE_AVX
+#if LV_HAVE_AVX
 #include <immintrin.h>
-
 static inline void
-volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
+    const __m256 one = _mm256_set1_ps(1.f);
+    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
+    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
     unsigned int number = 0;
-    unsigned int eighthPoints = num_points / 8;
-    int i, j;
-
-    __m256 aVal, pio2, x, y, z, arctangent;
-    __m256 fzeroes, fones, ftwos, ffours, condition;
-
-    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
-    fzeroes = _mm256_setzero_ps();
-    fones = _mm256_set1_ps(1.0);
-    ftwos = _mm256_set1_ps(2.0);
-    ffours = _mm256_set1_ps(4.0);
-
-    for (; number < eighthPoints; number++) {
-        aVal = _mm256_loadu_ps(aPtr);
-        z = aVal;
-        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-        x = _mm256_add_ps(
-            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-        for (i = 0; i < 2; i++) {
-            x = _mm256_add_ps(x,
-                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
-        }
-        x = _mm256_div_ps(fones, x);
-        y = fzeroes;
-        for (j = TERMS - 1; j >= 0; j--) {
-            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
-                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
-        }
-
-        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-        y = _mm256_add_ps(
-            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
-        arctangent = y;
-        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-        arctangent = _mm256_sub_ps(
-            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
-        _mm256_storeu_ps(bPtr, arctangent);
-        aPtr += 8;
-        bPtr += 8;
+    unsigned int eighth_points = num_points / 8;
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_loadu_ps(in);
+        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
+                                      _mm256_blendv_ps(one, x, swap_mask));
+        __m256 result = _m256_arctan_approximation_avx(x_star);
+        __m256 term = _mm256_and_ps(x_star, sign_mask);
+        term = _mm256_or_ps(pi_over_2, term);
+        term = _mm256_sub_ps(term, result);
+        result = _mm256_blendv_ps(result, term, swap_mask);
+        _mm256_storeu_ps(out, result);
+        in += 8;
+        out += 8;
     }
 
-    number = eighthPoints * 8;
+    number = eighth_points * 8;
     for (; number < num_points; number++) {
-        *bPtr++ = atan(*aPtr++);
+        *out++ = arctan(*in++);
     }
 }
-
 #endif /* LV_HAVE_AVX for unaligned */
 
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
-
+#include <volk/volk_sse_intrinsics.h>
 static inline void
-volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
+    const __m128 one = _mm_set1_ps(1.f);
+    const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
+    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
+    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
     unsigned int number = 0;
-    unsigned int quarterPoints = num_points / 4;
-    int i, j;
-
-    __m128 aVal, pio2, x, y, z, arctangent;
-    __m128 fzeroes, fones, ftwos, ffours, condition;
-
-    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
-    fzeroes = _mm_setzero_ps();
-    fones = _mm_set1_ps(1.0);
-    ftwos = _mm_set1_ps(2.0);
-    ffours = _mm_set1_ps(4.0);
-
-    for (; number < quarterPoints; number++) {
-        aVal = _mm_loadu_ps(aPtr);
-        z = aVal;
-        condition = _mm_cmplt_ps(z, fzeroes);
-        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-        condition = _mm_cmplt_ps(z, fones);
-        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
-        for (i = 0; i < 2; i++)
-            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-        x = _mm_div_ps(fones, x);
-        y = fzeroes;
-        for (j = TERMS - 1; j >= 0; j--)
-            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
-                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
-
-        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-        condition = _mm_cmpgt_ps(z, fones);
-
-        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
-        arctangent = y;
-        condition = _mm_cmplt_ps(aVal, fzeroes);
-        arctangent =
-            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
-
-        _mm_storeu_ps(bPtr, arctangent);
-        aPtr += 4;
-        bPtr += 4;
+    unsigned int quarter_points = num_points / 4;
+    for (; number < quarter_points; number++) {
+        __m128 x = _mm_loadu_ps(in);
+        __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
+        __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
+                                   _mm_blendv_ps(one, x, swap_mask));
+        __m128 result = _mm_arctan_approximation_sse(x_star);
+        __m128 term = _mm_and_ps(x_star, sign_mask);
+        term = _mm_or_ps(pi_over_2, term);
+        term = _mm_sub_ps(term, result);
+        result = _mm_blendv_ps(result, term, swap_mask);
+        _mm_storeu_ps(out, result);
+        in += 4;
+        out += 4;
     }
 
-    number = quarterPoints * 4;
+    number = quarter_points * 4;
     for (; number < num_points; number++) {
-        *bPtr++ = atanf(*aPtr++);
+        *out++ = arctan(*in++);
     }
 }
-
 #endif /* LV_HAVE_SSE4_1 for unaligned */
 
 #ifdef LV_HAVE_GENERIC
-
 static inline void
-volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
     unsigned int number = 0;
+    for (; number < num_points; number++) {
+        *out++ = arctan(*in++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
 
-    for (number = 0; number < num_points; number++) {
-        *bPtr++ = atanf(*aPtr++);
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int number = 0;
+    for (; number < num_points; number++) {
+        *out++ = atanf(*in++);
     }
 }
 #endif /* LV_HAVE_GENERIC */
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
index d7e4ad179..530beb1ee 100644
--- a/lib/kernel_tests.h
+++ b/lib/kernel_tests.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2014 - 2021 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *
@@ -85,7 +86,7 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
     QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc))
     QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc))
     QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc))
-    QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params))
     QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc))
     QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc))
     QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power))

From d82647387b9f0525207037cfe7ffd7e55e606aa6 Mon Sep 17 00:00:00 2001
From: Magnus Lundmark <magnuslundmark@gmail.com>
Date: Wed, 27 Sep 2023 15:38:59 +0200
Subject: [PATCH 6/7] moved arctan poly to volk_common

Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
---
 include/volk/volk_common.h       |  47 ++++++++++-
 kernels/volk/volk_32f_atan_32f.h | 140 ++-----------------------------
 2 files changed, 53 insertions(+), 134 deletions(-)

diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h
index 55a814532..dc217cc97 100644
--- a/include/volk/volk_common.h
+++ b/include/volk/volk_common.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *
@@ -166,6 +167,50 @@ static inline float log2f_non_ieee(float f)
 // Constant used to do log10 calculations as faster log2
 ////////////////////////////////////////////////////////////////////////
 // precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr
-#define volk_log2to10factor 3.01029995663981209120
+#define volk_log2to10factor (0x1.815182p1) // 3.01029995663981209120
+
+////////////////////////////////////////////////////////////////////////
+// arctan(x)
+////////////////////////////////////////////////////////////////////////
+static inline float volk_arctan_poly(const float x)
+{
+    /*
+     * arctan(x) polynomial expansion on the interval [-1, 1]
+     * Maximum relative error < 6.6e-7
+     */
+    const float a1 = +0x1.ffffeap-1f;
+    const float a3 = -0x1.55437p-2f;
+    const float a5 = +0x1.972be6p-3f;
+    const float a7 = -0x1.1436ap-3f;
+    const float a9 = +0x1.5785aap-4f;
+    const float a11 = -0x1.2f3004p-5f;
+    const float a13 = +0x1.01a37cp-7f;
+
+    const float x_times_x = x * x;
+    float arctan = a13;
+    arctan = fmaf(x_times_x, arctan, a11);
+    arctan = fmaf(x_times_x, arctan, a9);
+    arctan = fmaf(x_times_x, arctan, a7);
+    arctan = fmaf(x_times_x, arctan, a5);
+    arctan = fmaf(x_times_x, arctan, a3);
+    arctan = fmaf(x_times_x, arctan, a1);
+    arctan *= x;
+
+    return arctan;
+}
+
+static inline float volk_arctan(const float x)
+{
+    /*
+     *  arctan(x) + arctan(1 / x) == sign(x) * pi / 2
+     */
+    const float pi_over_2 = 0x1.921fb6p0f;
+
+    if (fabs(x) < 1.f) {
+        return volk_arctan_poly(x);
+    } else {
+        return copysignf(pi_over_2, x) - volk_arctan_poly(1.f / x);
+    }
+}
 
 #endif /*INCLUDED_LIBVOLK_COMMON_H*/
diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
index f20ff1a4f..b1d68ed97 100644
--- a/kernels/volk/volk_32f_atan_32f.h
+++ b/kernels/volk/volk_32f_atan_32f.h
@@ -56,135 +56,9 @@
  */
 #include <math.h>
 
-#define POLY_ORDER (13) // Use either 11, 12, 13 or 15
-/*
- * arctan(x) polynomial expansion on the interval [-1, 1]
- */
-#if (POLY_ORDER == 11)
-static inline float arctan_approximation(const float x)
-{
-    /*
-     * Max relative error < 4.4e-6
-     */
-    const float a1 = +0x1.ffff6ep-1f;
-    const float a3 = -0x1.54fca2p-2f;
-    const float a5 = +0x1.90aaa2p-3f;
-    const float a7 = -0x1.f09d2ep-4f;
-    const float a9 = +0x1.d6e42cp-5f;
-    const float a11 = -0x1.b9c81ep-7f;
-
-    const float x_times_x = x * x;
-    float arctan = a11;
-    arctan = fmaf(x_times_x, arctan, a9);
-    arctan = fmaf(x_times_x, arctan, a7);
-    arctan = fmaf(x_times_x, arctan, a5);
-    arctan = fmaf(x_times_x, arctan, a3);
-    arctan = fmaf(x_times_x, arctan, a1);
-    arctan *= x;
-
-    return arctan;
-}
-#elif (POLY_ORDER == 12) // Order 13 with a1 set to 1
-static inline float arctan_approximation(const float x)
-{
-    /*
-     * Max relative error < 7.5e-7
-     */
-    //          a1 == 1 implicitly
-    const float a3 = -0x1.5548a4p-2f;
-    const float a5 = +0x1.978224p-3f;
-    const float a7 = -0x1.156488p-3f;
-    const float a9 = +0x1.5b822cp-4f;
-    const float a11 = -0x1.35a172p-5f;
-    const float a13 = +0x1.09a14ep-7f;
-
-    const float x_times_x = x * x;
-    float arctan = a13;
-    arctan = fmaf(x_times_x, arctan, a11);
-    arctan = fmaf(x_times_x, arctan, a9);
-    arctan = fmaf(x_times_x, arctan, a7);
-    arctan = fmaf(x_times_x, arctan, a5);
-    arctan = fmaf(x_times_x, arctan, a3);
-    arctan *= x_times_x;
-    arctan = fmaf(x, arctan, x);
-
-    return arctan;
-}
-#elif (POLY_ORDER == 13)
-static inline float arctan_approximation(const float x)
-{
-    /*
-     * Max relative error < 6.6e-7
-     */
-    const float a1 = +0x1.ffffeap-1f;
-    const float a3 = -0x1.55437p-2f;
-    const float a5 = +0x1.972be6p-3f;
-    const float a7 = -0x1.1436ap-3f;
-    const float a9 = +0x1.5785aap-4f;
-    const float a11 = -0x1.2f3004p-5f;
-    const float a13 = +0x1.01a37cp-7f;
-
-    const float x_times_x = x * x;
-    float arctan = a13;
-    arctan = fmaf(x_times_x, arctan, a11);
-    arctan = fmaf(x_times_x, arctan, a9);
-    arctan = fmaf(x_times_x, arctan, a7);
-    arctan = fmaf(x_times_x, arctan, a5);
-    arctan = fmaf(x_times_x, arctan, a3);
-    arctan = fmaf(x_times_x, arctan, a1);
-    arctan *= x;
-
-    return arctan;
-}
-#elif (POLY_ORDER == 15)
-static inline float arctan_approximation(const float x)
-{
-    /*
-     * Max relative error < 1.0e-7
-     */
-    const float a1 = +0x1.fffffcp-1f;
-    const float a3 = -0x1.55519ep-2f;
-    const float a5 = +0x1.98f6a8p-3f;
-    const float a7 = -0x1.1f0a92p-3f;
-    const float a9 = +0x1.95b654p-4f;
-    const float a11 = -0x1.e65492p-5f;
-    const float a13 = +0x1.8c0c36p-6f;
-    const float a15 = -0x1.32316ep-8f;
-
-    const float x_times_x = x * x;
-    float arctan = a15;
-    arctan = fmaf(x_times_x, arctan, a13);
-    arctan = fmaf(x_times_x, arctan, a11);
-    arctan = fmaf(x_times_x, arctan, a9);
-    arctan = fmaf(x_times_x, arctan, a7);
-    arctan = fmaf(x_times_x, arctan, a5);
-    arctan = fmaf(x_times_x, arctan, a3);
-    arctan = fmaf(x_times_x, arctan, a1);
-    arctan *= x;
-
-    return arctan;
-}
-#else
-#error Undefined polynomial order.
-#endif
-
 #ifndef INCLUDED_volk_32f_atan_32f_a_H
 #define INCLUDED_volk_32f_atan_32f_a_H
 
-static inline float arctan(const float x)
-{
-    /*
-     *  arctan(x) + arctan(1 / x) == sign(x) * pi / 2
-     */
-    const float pi_over_2 = 0x1.921fb6p0f;
-
-    if (fabs(x) < 1.f) {
-        return arctan_approximation(x);
-    } else {
-        return copysignf(pi_over_2, x) - arctan_approximation(1.f / x);
-    }
-}
-
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 #include <volk/volk_avx2_fma_intrinsics.h>
@@ -215,7 +89,7 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point
 
     number = eighth_points * 8;
     for (; number < num_points; number++) {
-        *out++ = arctan(*in++);
+        *out++ = volk_arctan(*in++);
     }
 }
 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
@@ -250,7 +124,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
 
     number = eighth_points * 8;
     for (; number < num_points; number++) {
-        *out++ = arctan(*in++);
+        *out++ = volk_arctan(*in++);
     }
 }
 #endif /* LV_HAVE_AVX for aligned */
@@ -285,7 +159,7 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
 
     number = quarter_points * 4;
     for (; number < num_points; number++) {
-        *out++ = arctan(*in++);
+        *out++ = volk_arctan(*in++);
     }
 }
 #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -323,7 +197,7 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point
 
     number = eighth_points * 8;
     for (; number < num_points; number++) {
-        *out++ = arctan(*in++);
+        *out++ = volk_arctan(*in++);
     }
 }
 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
@@ -357,7 +231,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
 
     number = eighth_points * 8;
     for (; number < num_points; number++) {
-        *out++ = arctan(*in++);
+        *out++ = volk_arctan(*in++);
     }
 }
 #endif /* LV_HAVE_AVX for unaligned */
@@ -392,7 +266,7 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
 
     number = quarter_points * 4;
     for (; number < num_points; number++) {
-        *out++ = arctan(*in++);
+        *out++ = volk_arctan(*in++);
     }
 }
 #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -403,7 +277,7 @@ volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_point
 {
     unsigned int number = 0;
     for (; number < num_points; number++) {
-        *out++ = arctan(*in++);
+        *out++ = volk_arctan(*in++);
     }
 }
 #endif /* LV_HAVE_GENERIC */

From 0343e3c7b5dbc3aeef1633948db602b07fbde2cc Mon Sep 17 00:00:00 2001
From: Magnus Lundmark <magnuslundmark@gmail.com>
Date: Thu, 28 Sep 2023 08:37:58 +0200
Subject: [PATCH 7/7] renamed for consistency

Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
---
 include/volk/volk_avx2_fma_intrinsics.h |  2 +-
 include/volk/volk_avx_intrinsics.h      |  2 +-
 include/volk/volk_sse_intrinsics.h      |  2 +-
 kernels/volk/volk_32f_atan_32f.h        | 12 ++++++------
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
index 1027200f6..03b24e6c0 100644
--- a/include/volk/volk_avx2_fma_intrinsics.h
+++ b/include/volk/volk_avx2_fma_intrinsics.h
@@ -23,7 +23,7 @@
  * Maximum relative error ~6.5e-7
  * Polynomial evaluated via Horner's method
  */
-static inline __m256 _m256_arctan_approximation_avx2_fma(const __m256 x)
+static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
 {
     const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
     const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
index 1277a053c..2fc0f064e 100644
--- a/include/volk/volk_avx_intrinsics.h
+++ b/include/volk/volk_avx_intrinsics.h
@@ -24,7 +24,7 @@
  * Maximum relative error ~6.5e-7
  * Polynomial evaluated via Horner's method
  */
-static inline __m256 _m256_arctan_approximation_avx(const __m256 x)
+static inline __m256 _m256_arctan_poly_avx(const __m256 x)
 {
     const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
     const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h
index 2cd0dd017..0ede17845 100644
--- a/include/volk/volk_sse_intrinsics.h
+++ b/include/volk/volk_sse_intrinsics.h
@@ -24,7 +24,7 @@
  * Maximum relative error ~6.5e-7
  * Polynomial evaluated via Horner's method
  */
-static inline __m128 _mm_arctan_approximation_sse(const __m128 x)
+static inline __m128 _mm_arctan_poly_sse(const __m128 x)
 {
     const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f);
     const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f);
diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
index b1d68ed97..dc5987cb8 100644
--- a/kernels/volk/volk_32f_atan_32f.h
+++ b/kernels/volk/volk_32f_atan_32f.h
@@ -77,7 +77,7 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_approximation_avx2_fma(x_star);
+        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
@@ -112,7 +112,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_approximation_avx(x_star);
+        __m256 result = _m256_arctan_poly_avx(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
@@ -147,7 +147,7 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
         __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
         __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
                                    _mm_blendv_ps(one, x, swap_mask));
-        __m128 result = _mm_arctan_approximation_sse(x_star);
+        __m128 result = _mm_arctan_poly_sse(x_star);
         __m128 term = _mm_and_ps(x_star, sign_mask);
         term = _mm_or_ps(pi_over_2, term);
         term = _mm_sub_ps(term, result);
@@ -185,7 +185,7 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_approximation_avx2_fma(x_star);
+        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
@@ -219,7 +219,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_approximation_avx(x_star);
+        __m256 result = _m256_arctan_poly_avx(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
@@ -254,7 +254,7 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
         __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
         __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
                                    _mm_blendv_ps(one, x, swap_mask));
-        __m128 result = _mm_arctan_approximation_sse(x_star);
+        __m128 result = _mm_arctan_poly_sse(x_star);
         __m128 term = _mm_and_ps(x_star, sign_mask);
         term = _mm_or_ps(pi_over_2, term);
         term = _mm_sub_ps(term, result);