Skip to content

Commit

Permalink
moved intrinsic
Browse files Browse the repository at this point in the history
Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
  • Loading branch information
Ka-zam committed Sep 22, 2023
1 parent fa90d4c commit b9f8c87
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 64 deletions.
58 changes: 11 additions & 47 deletions include/volk/volk_avx2_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,61 +26,25 @@
*/
static inline __m256 _m256_arctan_approximation_avx2_fma(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(0.9999993329f);
const __m256 a3 = _mm256_set1_ps(-0.3332985605f);
const __m256 a5 = _mm256_set1_ps(0.1994653599f);
const __m256 a7 = _mm256_set1_ps(-0.1390853351f);
const __m256 a9 = _mm256_set1_ps(0.0964200441f);
const __m256 a1 = _mm256_set1_ps( 0.9999993329f);
const __m256 a3 = _mm256_set1_ps(-0.3332985605f);
const __m256 a5 = _mm256_set1_ps( 0.1994653599f);
const __m256 a7 = _mm256_set1_ps(-0.1390853351f);
const __m256 a9 = _mm256_set1_ps( 0.0964200441f);
const __m256 a11 = _mm256_set1_ps(-0.0559098861f);
const __m256 a13 = _mm256_set1_ps(0.0218612288f);
const __m256 a13 = _mm256_set1_ps( 0.0218612288f);
const __m256 a15 = _mm256_set1_ps(-0.0040540580f);

const __m256 x_times_x = _mm256_mul_ps(x, x);
__m256 arctan;
arctan = a15;
arctan = _mm256_fmadd_ps(x_times_x, arctan, a13);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a11);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
arctan = _mm256_mul_ps(x, arctan);

return arctan;
}

/*
* As above but without FMA
*/
static inline __m256 _m256_arctan_approximation_avx2(__m256 x)
{
const __m256 a1 = _mm256_set1_ps(0.9999993329f);
const __m256 a3 = _mm256_set1_ps(-0.3332985605f);
const __m256 a5 = _mm256_set1_ps(0.1994653599f);
const __m256 a7 = _mm256_set1_ps(-0.1390853351f);
const __m256 a9 = _mm256_set1_ps(0.0964200441f);
const __m256 a11 = _mm256_set1_ps(-0.0559098861f);
const __m256 a13 = _mm256_set1_ps(0.0218612288f);
const __m256 a15 = _mm256_set1_ps(-0.0040540580f);

const __m256 x_times_x = _mm256_mul_ps(x, x);
__m256 arctan;
arctan = a15;
arctan = _mm256_mul_ps(x_times_x, arctan);
arctan = _mm256_add_ps(arctan, a13);
arctan = _mm256_mul_ps(x_times_x, arctan);
arctan = _mm256_add_ps(arctan, a11);
arctan = _mm256_mul_ps(x_times_x, arctan);
arctan = _mm256_add_ps(arctan, a9);
arctan = _mm256_mul_ps(x_times_x, arctan);
arctan = _mm256_add_ps(arctan, a7);
arctan = _mm256_mul_ps(x_times_x, arctan);
arctan = _mm256_add_ps(arctan, a5);
arctan = _mm256_mul_ps(x_times_x, arctan);
arctan = _mm256_add_ps(arctan, a3);
arctan = _mm256_mul_ps(x_times_x, arctan);
arctan = _mm256_add_ps(arctan, a1);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
arctan = _mm256_mul_ps(x, arctan);

return arctan;
Expand Down
40 changes: 40 additions & 0 deletions include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,46 @@
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>

/*
 * Polynomial approximation of arctan(x), intended for x in [-1, 1].
 *
 * The approximation is an odd polynomial
 *   x * (a1 + a3*x^2 + a5*x^4 + ... + a15*x^14)
 * whose inner even polynomial in x^2 is evaluated with Horner's method
 * using separate multiply and add instructions (no FMA).
 *
 * Maximum absolute error ~5e-8
 */
static inline __m256 _m256_arctan_approximation_avx(__m256 x)
{
    /* Coefficients a15 down to a1, i.e. in the order Horner's scheme consumes them. */
    static const float horner_coeffs[8] = {
        -0.0040540580f, /* a15 */
        0.0218612288f,  /* a13 */
        -0.0559098861f, /* a11 */
        0.0964200441f,  /* a9  */
        -0.1390853351f, /* a7  */
        0.1994653599f,  /* a5  */
        -0.3332985605f, /* a3  */
        0.9999993329f   /* a1  */
    };

    const __m256 x_squared = _mm256_mul_ps(x, x);

    /* Seed the accumulator with the highest-order coefficient ... */
    __m256 poly = _mm256_set1_ps(horner_coeffs[0]);

    /* ... then fold in the remaining ones: poly = x^2 * poly + a_k. */
    for (int k = 1; k < 8; k++) {
        const __m256 scaled = _mm256_mul_ps(x_squared, poly);
        poly = _mm256_add_ps(scaled, _mm256_set1_ps(horner_coeffs[k]));
    }

    /* Final multiplication by x restores the odd powers. */
    return _mm256_mul_ps(x, poly);
}

static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
__m256 yl, yh, tmp1, tmp2;
Expand Down
34 changes: 17 additions & 17 deletions kernels/volk/volk_32f_atan_32f.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,25 +61,25 @@

static inline float arctan_approximation(const float x)
{
const float a1 = 0.9999993329f;
const float a3 = -0.3332985605f;
const float a5 = 0.1994653599f;
const float a7 = -0.1390853351f;
const float a9 = 0.0964200441f;
const float a1 = 0.9999993329f;
const float a3 = -0.3332985605f;
const float a5 = 0.1994653599f;
const float a7 = -0.1390853351f;
const float a9 = 0.0964200441f;
const float a11 = -0.0559098861f;
const float a13 = 0.0218612288f;
const float a13 = 0.0218612288f;
const float a15 = -0.0040540580f;

const float x_times_x = x * x;
float arctan;
arctan = a15;
arctan = fmaf(x_times_x, arctan, a13);
arctan = fmaf(x_times_x, arctan, a11);
arctan = fmaf(x_times_x, arctan, a9);
arctan = fmaf(x_times_x, arctan, a7);
arctan = fmaf(x_times_x, arctan, a5);
arctan = fmaf(x_times_x, arctan, a3);
arctan = fmaf(x_times_x, arctan, a1);
arctan = fmaf(x_times_x, arctan, a9);
arctan = fmaf(x_times_x, arctan, a7);
arctan = fmaf(x_times_x, arctan, a5);
arctan = fmaf(x_times_x, arctan, a3);
arctan = fmaf(x_times_x, arctan, a1);
arctan *= x;

return arctan;
Expand Down Expand Up @@ -131,7 +131,7 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, uint32_t num_points)
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#if LV_HAVE_AVX2
#if LV_HAVE_AVX
#include <immintrin.h>
static inline void
volk_32f_atan_32f_a_avx2(float* out, const float* in, uint32_t num_points)
Expand All @@ -148,7 +148,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, uint32_t num_points)
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
__m256 result = _m256_arctan_approximation_avx2(x_star);
__m256 result = _m256_arctan_approximation_avx(x_star);
__m256 term = _mm256_and_ps(x_star, sign_mask);
term = _mm256_or_ps(pi_2, term);
term = _mm256_sub_ps(term, result);
Expand All @@ -163,7 +163,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, uint32_t num_points)
*out++ = arctan(*in++);
}
}
#endif /* LV_HAVE_AVX2 for aligned */
#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
Expand Down Expand Up @@ -238,7 +238,7 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, uint32_t num_points)
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#if LV_HAVE_AVX2
#if LV_HAVE_AVX
#include <immintrin.h>
static inline void
volk_32f_atan_32f_u_avx2(float* out, const float* in, uint32_t num_points)
Expand All @@ -255,7 +255,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, uint32_t num_points)
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
__m256 result = _m256_arctan_approximation_avx2(x_star);
__m256 result = _m256_arctan_approximation_avx(x_star);
__m256 term = _mm256_and_ps(x_star, sign_mask);
term = _mm256_or_ps(pi_2, term);
term = _mm256_sub_ps(term, result);
Expand All @@ -270,7 +270,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, uint32_t num_points)
*out++ = arctan(*in++);
}
}
#endif /* LV_HAVE_AVX2 for unaligned */
#endif /* LV_HAVE_AVX for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
Expand Down

0 comments on commit b9f8c87

Please sign in to comment.