Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New kernels for arctan #633

Merged
merged 7 commits into from
Oct 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
env:
CC: ${{ matrix.compiler.cc }}
CXX: ${{ matrix.compiler.cxx }}
run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" ..
run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON ..
- name: Build
run: |
echo "Build with $(nproc) thread(s)"
Expand Down Expand Up @@ -150,7 +150,7 @@ jobs:
run: |
cd /volk
cd build
cmake -DCMAKE_CXX_FLAGS="-Werror" ..
cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON ..
echo "Build with $(nproc) thread(s)"
make -j$(nproc)
./cpu_features/list_cpu_features
Expand All @@ -173,7 +173,7 @@ jobs:
- name: dependencies
run: sudo apt install python3-mako liborc-dev
- name: configure
run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True ..
run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True -DBUILD_EXECUTABLE=ON ..
- name: build
run: cmake --build build -j$(nproc)
- name: Print info
Expand Down Expand Up @@ -248,7 +248,7 @@ jobs:
- name: dependencies
run: pip3 install mako
- name: configure
run: mkdir build && cd build && cmake ..
run: mkdir build && cd build && cmake -DBUILD_EXECUTABLE=ON ..
- name: build
run: cmake --build build --config Debug -j3
- name: Print info
Expand Down
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#
# Copyright 2011-2020 Free Software Foundation, Inc.
# Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
#
# This file is part of VOLK
#
Expand Down Expand Up @@ -144,6 +145,7 @@ if (VOLK_CPU_FEATURES)
FORCE)
set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
set(BUILD_SHARED_LIBS OFF)
set(ENABLE_INSTALL OFF)
add_subdirectory(cpu_features)
set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
endif()
Expand Down Expand Up @@ -248,6 +250,7 @@ install(FILES
${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
Expand Down
50 changes: 50 additions & 0 deletions include/volk/volk_avx2_fma_intrinsics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/* -*- c++ -*- */
/*
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
*
* This file is part of VOLK
*
* SPDX-License-Identifier: LGPL-3.0-or-later
*/

/*
* This file is intended to hold helper functions composed of AVX2 FMA intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/

#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
#include <immintrin.h>

/*
 * Polynomial approximation of arctan(x), intended for x in [-1, 1].
 *
 * Evaluates the odd polynomial
 *   x * (c1 + c3*x^2 + c5*x^4 + ... + c13*x^12)
 * with Horner's scheme in x^2, one fused multiply-add per step.
 *
 * Maximum relative error ~6.5e-7
 */
static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
{
    /* Coefficients of the odd powers x^1 .. x^13, broadcast to all lanes */
    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);
    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);

    const __m256 x2 = _mm256_mul_ps(x, x);

    /* Horner recurrence: p = x2 * p + c_k, starting from the highest order */
    __m256 p = _mm256_fmadd_ps(x2, c13, c11);
    p = _mm256_fmadd_ps(x2, p, c9);
    p = _mm256_fmadd_ps(x2, p, c7);
    p = _mm256_fmadd_ps(x2, p, c5);
    p = _mm256_fmadd_ps(x2, p, c3);
    p = _mm256_fmadd_ps(x2, p, c1);

    /* The final multiplication by x restores the odd powers */
    return _mm256_mul_ps(x, p);
}

#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
38 changes: 38 additions & 0 deletions include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
*
* This file is part of VOLK
*
Expand All @@ -16,6 +17,43 @@
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>

/*
 * Polynomial approximation of arctan(x), intended for x in [-1, 1].
 *
 * Evaluates the odd polynomial
 *   x * (c1 + c3*x^2 + c5*x^4 + ... + c13*x^12)
 * with Horner's scheme in x^2, using a separate multiply and add per step
 * (no FMA instructions).
 *
 * Maximum relative error ~6.5e-7
 */
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
{
    /* Coefficients of the odd powers x^1 .. x^13, broadcast to all lanes */
    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);
    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);

    const __m256 x2 = _mm256_mul_ps(x, x);

    /* Horner recurrence: p = (x2 * p) + c_k, starting from the highest order */
    __m256 p = _mm256_add_ps(_mm256_mul_ps(x2, c13), c11);
    p = _mm256_add_ps(_mm256_mul_ps(x2, p), c9);
    p = _mm256_add_ps(_mm256_mul_ps(x2, p), c7);
    p = _mm256_add_ps(_mm256_mul_ps(x2, p), c5);
    p = _mm256_add_ps(_mm256_mul_ps(x2, p), c3);
    p = _mm256_add_ps(_mm256_mul_ps(x2, p), c1);

    /* The final multiplication by x restores the odd powers */
    return _mm256_mul_ps(x, p);
}

static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
__m256 yl, yh, tmp1, tmp2;
Expand Down
47 changes: 46 additions & 1 deletion include/volk/volk_common.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
*
* This file is part of VOLK
*
Expand Down Expand Up @@ -166,6 +167,50 @@ static inline float log2f_non_ieee(float f)
// Constant used to do log10 calculations as faster log2
////////////////////////////////////////////////////////////////////////
// precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr
#define volk_log2to10factor 3.01029995663981209120
#define volk_log2to10factor (0x1.815182p1) // 3.01029995663981209120

////////////////////////////////////////////////////////////////////////
// arctan(x)
////////////////////////////////////////////////////////////////////////
static inline float volk_arctan_poly(const float x)
{
/*
* arctan(x) polynomial expansion on the interval [-1, 1]
* Maximum relative error < 6.6e-7
*/
const float a1 = +0x1.ffffeap-1f;
const float a3 = -0x1.55437p-2f;
const float a5 = +0x1.972be6p-3f;
const float a7 = -0x1.1436ap-3f;
const float a9 = +0x1.5785aap-4f;
const float a11 = -0x1.2f3004p-5f;
const float a13 = +0x1.01a37cp-7f;

const float x_times_x = x * x;
float arctan = a13;
arctan = fmaf(x_times_x, arctan, a11);
arctan = fmaf(x_times_x, arctan, a9);
arctan = fmaf(x_times_x, arctan, a7);
arctan = fmaf(x_times_x, arctan, a5);
arctan = fmaf(x_times_x, arctan, a3);
arctan = fmaf(x_times_x, arctan, a1);
arctan *= x;

return arctan;
}

/*
 * Approximate arctan(x) for a float argument of any magnitude.
 *
 * For |x| < 1 the polynomial approximation volk_arctan_poly(x) is used
 * directly. Otherwise the argument is folded back into the polynomial's
 * interval through the identity
 *   arctan(x) + arctan(1 / x) == sign(x) * pi / 2
 * so that the polynomial is evaluated at 1 / x instead.
 *
 * Returns the approximated angle in radians.
 */
static inline float volk_arctan(const float x)
{
    const float pi_over_2 = 0x1.921fb6p0f;

    /* fabsf keeps the comparison in single precision (no promotion to double) */
    if (fabsf(x) < 1.f) {
        return volk_arctan_poly(x);
    } else {
        /* copysignf gives +pi/2 or -pi/2 according to the sign of x */
        return copysignf(pi_over_2, x) - volk_arctan_poly(1.f / x);
    }
}

#endif /*INCLUDED_LIBVOLK_COMMON_H*/
38 changes: 38 additions & 0 deletions include/volk/volk_sse_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
*
* This file is part of VOLK
*
Expand All @@ -16,6 +17,43 @@
#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
#include <xmmintrin.h>

/*
 * Polynomial approximation of arctan(x), intended for x in [-1, 1].
 *
 * Evaluates the odd polynomial
 *   x * (c1 + c3*x^2 + c5*x^4 + ... + c13*x^12)
 * with Horner's scheme in x^2, using a separate multiply and add per step.
 *
 * Maximum relative error ~6.5e-7
 */
static inline __m128 _mm_arctan_poly_sse(const __m128 x)
{
    /* Coefficients of the odd powers x^1 .. x^13, broadcast to all lanes */
    const __m128 c1 = _mm_set1_ps(+0x1.ffffeap-1f);
    const __m128 c3 = _mm_set1_ps(-0x1.55437p-2f);
    const __m128 c5 = _mm_set1_ps(+0x1.972be6p-3f);
    const __m128 c7 = _mm_set1_ps(-0x1.1436ap-3f);
    const __m128 c9 = _mm_set1_ps(+0x1.5785aap-4f);
    const __m128 c11 = _mm_set1_ps(-0x1.2f3004p-5f);
    const __m128 c13 = _mm_set1_ps(+0x1.01a37cp-7f);

    const __m128 x2 = _mm_mul_ps(x, x);

    /* Horner recurrence: p = (x2 * p) + c_k, starting from the highest order */
    __m128 p = _mm_add_ps(_mm_mul_ps(x2, c13), c11);
    p = _mm_add_ps(_mm_mul_ps(x2, p), c9);
    p = _mm_add_ps(_mm_mul_ps(x2, p), c7);
    p = _mm_add_ps(_mm_mul_ps(x2, p), c5);
    p = _mm_add_ps(_mm_mul_ps(x2, p), c3);
    p = _mm_add_ps(_mm_mul_ps(x2, p), c1);

    /* The final multiplication by x restores the odd powers */
    return _mm_mul_ps(x, p);
}

static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
{
__m128 iValue, qValue;
Expand Down
Loading
Loading