Move non-FMA arctan intrinsic from the AVX2 header to the AVX header (renamed to _m256_arctan_approximation_avx)

Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
gnuradio · Sep 22, 2023 · b9f8c87 · b9f8c87
1 parent fa90d4c
commit b9f8c87
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 64 deletions.
diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h
@@ -26,61 +26,25 @@
  */
 static inline __m256 _m256_arctan_approximation_avx2_fma(const __m256 x)
 {
-    const __m256 a1 = _mm256_set1_ps(0.9999993329f);
-    const __m256 a3 = _mm256_set1_ps(-0.3332985605f);
-    const __m256 a5 = _mm256_set1_ps(0.1994653599f);
-    const __m256 a7 = _mm256_set1_ps(-0.1390853351f);
-    const __m256 a9 = _mm256_set1_ps(0.0964200441f);
+    const __m256  a1 = _mm256_set1_ps( 0.9999993329f);
+    const __m256  a3 = _mm256_set1_ps(-0.3332985605f);
+    const __m256  a5 = _mm256_set1_ps( 0.1994653599f);
+    const __m256  a7 = _mm256_set1_ps(-0.1390853351f);
+    const __m256  a9 = _mm256_set1_ps( 0.0964200441f);
     const __m256 a11 = _mm256_set1_ps(-0.0559098861f);
-    const __m256 a13 = _mm256_set1_ps(0.0218612288f);
+    const __m256 a13 = _mm256_set1_ps( 0.0218612288f);
     const __m256 a15 = _mm256_set1_ps(-0.0040540580f);
 
     const __m256 x_times_x = _mm256_mul_ps(x, x);
     __m256 arctan;
     arctan = a15;
     arctan = _mm256_fmadd_ps(x_times_x, arctan, a13);
     arctan = _mm256_fmadd_ps(x_times_x, arctan, a11);
-    arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
-    arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
-    arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
-    arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
-    arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
-    arctan = _mm256_mul_ps(x, arctan);
-
-    return arctan;
-}
-
-/*
- * As above but without FMA
- */
-static inline __m256 _m256_arctan_approximation_avx2(__m256 x)
-{
-    const __m256 a1 = _mm256_set1_ps(0.9999993329f);
-    const __m256 a3 = _mm256_set1_ps(-0.3332985605f);
-    const __m256 a5 = _mm256_set1_ps(0.1994653599f);
-    const __m256 a7 = _mm256_set1_ps(-0.1390853351f);
-    const __m256 a9 = _mm256_set1_ps(0.0964200441f);
-    const __m256 a11 = _mm256_set1_ps(-0.0559098861f);
-    const __m256 a13 = _mm256_set1_ps(0.0218612288f);
-    const __m256 a15 = _mm256_set1_ps(-0.0040540580f);
-
-    const __m256 x_times_x = _mm256_mul_ps(x, x);
-    __m256 arctan;
-    arctan = a15;
-    arctan = _mm256_mul_ps(x_times_x, arctan);
-    arctan = _mm256_add_ps(arctan, a13);
-    arctan = _mm256_mul_ps(x_times_x, arctan);
-    arctan = _mm256_add_ps(arctan, a11);
-    arctan = _mm256_mul_ps(x_times_x, arctan);
-    arctan = _mm256_add_ps(arctan, a9);
-    arctan = _mm256_mul_ps(x_times_x, arctan);
-    arctan = _mm256_add_ps(arctan, a7);
-    arctan = _mm256_mul_ps(x_times_x, arctan);
-    arctan = _mm256_add_ps(arctan, a5);
-    arctan = _mm256_mul_ps(x_times_x, arctan);
-    arctan = _mm256_add_ps(arctan, a3);
-    arctan = _mm256_mul_ps(x_times_x, arctan);
-    arctan = _mm256_add_ps(arctan, a1);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan,  a9);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan,  a7);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan,  a5);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan,  a3);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan,  a1);
     arctan = _mm256_mul_ps(x, arctan);
 
     return arctan;

diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
@@ -16,6 +16,46 @@
 #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
 #include <immintrin.h>
 
+/*
+ * Approximate arctan(x) for each packed float in x via the odd polynomial
+ * a1*x + a3*x^3 + ... + a15*x^15, intended for x on the interval [-1, 1]
+ * Plain AVX variant: uses separate multiply and add steps instead of FMA
+ * Maximum absolute error ~5e-8
+ * Polynomial evaluated via Horner's method in x*x, then scaled by x
+ */
+static inline __m256 _m256_arctan_approximation_avx(__m256 x)
+{
+    const __m256  a1 = _mm256_set1_ps( 0.9999993329f);
+    const __m256  a3 = _mm256_set1_ps(-0.3332985605f);
+    const __m256  a5 = _mm256_set1_ps( 0.1994653599f);
+    const __m256  a7 = _mm256_set1_ps(-0.1390853351f);
+    const __m256  a9 = _mm256_set1_ps( 0.0964200441f);
+    const __m256 a11 = _mm256_set1_ps(-0.0559098861f);
+    const __m256 a13 = _mm256_set1_ps( 0.0218612288f);
+    const __m256 a15 = _mm256_set1_ps(-0.0040540580f);
+
+    const __m256 x_times_x = _mm256_mul_ps(x, x); /* Horner variable: x^2 */
+    __m256 arctan;
+    arctan = a15; /* start from the highest-order coefficient */
+    arctan = _mm256_mul_ps(x_times_x, arctan); /* each mul/add pair folds in the next lower coefficient */
+    arctan = _mm256_add_ps(arctan, a13);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a11);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a9);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a7);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a5);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a3);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a1);
+    arctan = _mm256_mul_ps(x, arctan); /* final factor of x turns the even polynomial in x^2 into the odd one */
+
+    return arctan;
+}
+
 static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
 {
     __m256 yl, yh, tmp1, tmp2;

diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
@@ -61,25 +61,25 @@
 
 static inline float arctan_approximation(const float x)
 {
-    const float a1 = 0.9999993329f;
-    const float a3 = -0.3332985605f;
-    const float a5 = 0.1994653599f;
-    const float a7 = -0.1390853351f;
-    const float a9 = 0.0964200441f;
+    const float  a1 =  0.9999993329f;
+    const float  a3 = -0.3332985605f;
+    const float  a5 =  0.1994653599f;
+    const float  a7 = -0.1390853351f;
+    const float  a9 =  0.0964200441f;
     const float a11 = -0.0559098861f;
-    const float a13 = 0.0218612288f;
+    const float a13 =  0.0218612288f;
     const float a15 = -0.0040540580f;
 
     const float x_times_x = x * x;
     float arctan;
     arctan = a15;
     arctan = fmaf(x_times_x, arctan, a13);
     arctan = fmaf(x_times_x, arctan, a11);
-    arctan = fmaf(x_times_x, arctan, a9);
-    arctan = fmaf(x_times_x, arctan, a7);
-    arctan = fmaf(x_times_x, arctan, a5);
-    arctan = fmaf(x_times_x, arctan, a3);
-    arctan = fmaf(x_times_x, arctan, a1);
+    arctan = fmaf(x_times_x, arctan,  a9);
+    arctan = fmaf(x_times_x, arctan,  a7);
+    arctan = fmaf(x_times_x, arctan,  a5);
+    arctan = fmaf(x_times_x, arctan,  a3);
+    arctan = fmaf(x_times_x, arctan,  a1);
     arctan *= x;
 
     return arctan;
@@ -131,7 +131,7 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, uint32_t num_points)
 }
 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
 
-#if LV_HAVE_AVX2
+#if LV_HAVE_AVX
 #include <immintrin.h>
 static inline void
 volk_32f_atan_32f_a_avx2(float* out, const float* in, uint32_t num_points)
@@ -148,7 +148,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, uint32_t num_points)
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_approximation_avx2(x_star);
+        __m256 result = _m256_arctan_approximation_avx(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_2, term);
         term = _mm256_sub_ps(term, result);
@@ -163,7 +163,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, uint32_t num_points)
         *out++ = arctan(*in++);
     }
 }
-#endif /* LV_HAVE_AVX2 for aligned */
+#endif /* LV_HAVE_AVX for aligned */
 
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
@@ -238,7 +238,7 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, uint32_t num_points)
 }
 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
 
-#if LV_HAVE_AVX2
+#if LV_HAVE_AVX
 #include <immintrin.h>
 static inline void
 volk_32f_atan_32f_u_avx2(float* out, const float* in, uint32_t num_points)
@@ -255,7 +255,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, uint32_t num_points)
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_approximation_avx2(x_star);
+        __m256 result = _m256_arctan_approximation_avx(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_2, term);
         term = _mm256_sub_ps(term, result);
@@ -270,7 +270,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, uint32_t num_points)
         *out++ = arctan(*in++);
     }
 }
-#endif /* LV_HAVE_AVX2 for unaligned */
+#endif /* LV_HAVE_AVX for unaligned */
 
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>